From ba55f96356f16a27733aca5096169cd2fcf5b348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sun, 24 Nov 2019 11:22:40 +0100 Subject: [PATCH] Remove GEMM benchmarks for the moment --- .gitmodules | 3 - benchmarks/cpuinfo | 1 - benchmarks/cpuinfo.h | 1693 ----------------- benchmarks/cpuinfo.nim | 371 ---- benchmarks/matmul/README.md | 38 - benchmarks/matmul/laser_gemm/gemm.nim | 511 ----- benchmarks/matmul/laser_gemm/gemm_packing.nim | 94 - .../matmul/laser_gemm/gemm_prepacked.nim | 525 ----- .../matmul/laser_gemm_backend/gemm_tiling.nim | 344 ---- .../laser_gemm_backend/gemm_ukernel_avx.nim | 44 - .../laser_gemm_backend/gemm_ukernel_avx2.nim | 35 - .../gemm_ukernel_avx512.nim | 74 - .../gemm_ukernel_avx_fma.nim | 38 - .../gemm_ukernel_dispatch.nim | 125 -- .../gemm_ukernel_generator.nim | 250 --- .../gemm_ukernel_generic.nim | 138 -- .../laser_gemm_backend/gemm_ukernel_sse.nim | 26 - .../laser_gemm_backend/gemm_ukernel_sse2.nim | 129 -- .../gemm_ukernel_sse4_1.nim | 35 - .../matmul/laser_gemm_backend/gemm_utils.nim | 60 - .../matmul/laser_utils/align_unroller.nim | 41 - .../laser_utils/compiler_optim_hints.nim | 149 -- benchmarks/matmul/laser_utils/memory.nim | 20 - benchmarks/matmul/laser_utils/openmp.nim | 386 ---- benchmarks/matmul/laser_utils/simd.nim | 441 ----- benchmarks/matmul/nim.cfg | 13 - benchmarks/matmul/weave_gemm/README.md | 1 - 27 files changed, 5585 deletions(-) delete mode 160000 benchmarks/cpuinfo delete mode 100644 benchmarks/cpuinfo.h delete mode 100644 benchmarks/cpuinfo.nim delete mode 100644 benchmarks/matmul/README.md delete mode 100644 benchmarks/matmul/laser_gemm/gemm.nim delete mode 100644 benchmarks/matmul/laser_gemm/gemm_packing.nim delete mode 100644 benchmarks/matmul/laser_gemm/gemm_prepacked.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_tiling.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx2.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx512.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx_fma.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_dispatch.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generator.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generic.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse2.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse4_1.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_utils.nim delete mode 100644 benchmarks/matmul/laser_utils/align_unroller.nim delete mode 100644 benchmarks/matmul/laser_utils/compiler_optim_hints.nim delete mode 100644 benchmarks/matmul/laser_utils/memory.nim delete mode 100644 benchmarks/matmul/laser_utils/openmp.nim delete mode 100644 benchmarks/matmul/laser_utils/simd.nim delete mode 100644 benchmarks/matmul/nim.cfg delete mode 100644 benchmarks/matmul/weave_gemm/README.md diff --git a/.gitmodules b/.gitmodules index 471fcd1..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "benchmarks/cpuinfo"] - path = benchmarks/cpuinfo - url = https://github.com/pytorch/cpuinfo diff --git a/benchmarks/cpuinfo b/benchmarks/cpuinfo deleted file mode 160000 index d5e37ad..0000000 --- a/benchmarks/cpuinfo +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 
d5e37adf1406cf899d7d9ec1d317c47506ccb970 diff --git a/benchmarks/cpuinfo.h b/benchmarks/cpuinfo.h deleted file mode 100644 index 1abb806..0000000 --- a/benchmarks/cpuinfo.h +++ /dev/null @@ -1,1693 +0,0 @@ -#pragma once -#ifndef CPUINFO_H -#define CPUINFO_H - -#ifndef __cplusplus - #include <stdbool.h> -#endif - -#ifdef __APPLE__ - #include <TargetConditionals.h> -#endif - -#include <stdint.h> - -/* Identify architecture and define corresponding macro */ - -#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) - #define CPUINFO_ARCH_X86 1 -#endif - -#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) - #define CPUINFO_ARCH_X86_64 1 -#endif - -#if defined(__arm__) || defined(_M_ARM) - #define CPUINFO_ARCH_ARM 1 -#endif - -#if defined(__aarch64__) || defined(_M_ARM64) - #define CPUINFO_ARCH_ARM64 1 -#endif - -#if defined(__PPC64__) || defined(__powerpc64__) || defined(_ARCH_PPC64) - #define CPUINFO_ARCH_PPC64 1 -#endif - -#if defined(__pnacl__) - #define CPUINFO_ARCH_PNACL 1 -#endif - -#if defined(EMSCRIPTEN) - #define CPUINFO_ARCH_ASMJS 1 -#endif - -#if CPUINFO_ARCH_X86 && defined(_MSC_VER) - #define CPUINFO_ABI __cdecl -#elif CPUINFO_ARCH_X86 && defined(__GNUC__) - #define CPUINFO_ABI __attribute__((__cdecl__)) -#else - #define CPUINFO_ABI -#endif - -/* Define other architecture-specific macros as 0 */ - -#ifndef CPUINFO_ARCH_X86 - #define CPUINFO_ARCH_X86 0 -#endif - -#ifndef CPUINFO_ARCH_X86_64 - #define CPUINFO_ARCH_X86_64 0 -#endif - -#ifndef CPUINFO_ARCH_ARM - #define CPUINFO_ARCH_ARM 0 -#endif - -#ifndef CPUINFO_ARCH_ARM64 - #define CPUINFO_ARCH_ARM64 0 -#endif - -#ifndef CPUINFO_ARCH_PPC64 - #define CPUINFO_ARCH_PPC64 0 -#endif - -#ifndef CPUINFO_ARCH_PNACL - #define CPUINFO_ARCH_PNACL 0 -#endif - -#ifndef CPUINFO_ARCH_ASMJS - #define CPUINFO_ARCH_ASMJS 0 -#endif - -#define CPUINFO_CACHE_UNIFIED 0x00000001 -#define CPUINFO_CACHE_INCLUSIVE 0x00000002 -#define CPUINFO_CACHE_COMPLEX_INDEXING 0x00000004 - -struct cpuinfo_cache { - /** Cache size in bytes */ - uint32_t size; - /** Number of ways of associativity */ - uint32_t associativity; - /** Number of sets */ - uint32_t sets; - /** Number of partitions */ - uint32_t partitions; - /** Line size in bytes */ - uint32_t line_size; - /** - * Binary characteristics of the cache (unified cache, inclusive cache, cache with complex indexing). - * - * @see CPUINFO_CACHE_UNIFIED, CPUINFO_CACHE_INCLUSIVE, CPUINFO_CACHE_COMPLEX_INDEXING - */ - uint32_t flags; - /** Index of the first logical processor that shares this cache */ - uint32_t processor_start; - /** Number of logical processors that share this cache */ - uint32_t processor_count; -}; - -struct cpuinfo_trace_cache { - uint32_t uops; - uint32_t associativity; -}; - -#define CPUINFO_PAGE_SIZE_4KB 0x1000 -#define CPUINFO_PAGE_SIZE_1MB 0x100000 -#define CPUINFO_PAGE_SIZE_2MB 0x200000 -#define CPUINFO_PAGE_SIZE_4MB 0x400000 -#define CPUINFO_PAGE_SIZE_16MB 0x1000000 -#define CPUINFO_PAGE_SIZE_1GB 0x40000000 - -struct cpuinfo_tlb { - uint32_t entries; - uint32_t associativity; - uint64_t pages; -}; - -/** Vendor of processor core design */ -enum cpuinfo_vendor { - /** Processor vendor is not known to the library, or the library failed to get vendor information from the OS. */ - cpuinfo_vendor_unknown = 0, - - /* Active vendors of modern CPUs */ - - /** - * Intel Corporation. Vendor of x86, x86-64, IA64, and ARM processor microarchitectures. - * - * Sold its ARM design subsidiary in 2006. The last ARM processor design was released in 2004.
- */ - cpuinfo_vendor_intel = 1, - /** Advanced Micro Devices, Inc. Vendor of x86 and x86-64 processor microarchitectures. */ - cpuinfo_vendor_amd = 2, - /** ARM Holdings plc. Vendor of ARM and ARM64 processor microarchitectures. */ - cpuinfo_vendor_arm = 3, - /** Qualcomm Incorporated. Vendor of ARM and ARM64 processor microarchitectures. */ - cpuinfo_vendor_qualcomm = 4, - /** Apple Inc. Vendor of ARM and ARM64 processor microarchitectures. */ - cpuinfo_vendor_apple = 5, - /** Samsung Electronics Co., Ltd. Vendor of ARM64 processor microarchitectures. */ - cpuinfo_vendor_samsung = 6, - /** Nvidia Corporation. Vendor of ARM64-compatible processor microarchitectures. */ - cpuinfo_vendor_nvidia = 7, - /** MIPS Technologies, Inc. Vendor of MIPS processor microarchitectures. */ - cpuinfo_vendor_mips = 8, - /** International Business Machines Corporation. Vendor of PowerPC processor microarchitectures. */ - cpuinfo_vendor_ibm = 9, - /** Ingenic Semiconductor. Vendor of MIPS processor microarchitectures. */ - cpuinfo_vendor_ingenic = 10, - /** - * VIA Technologies, Inc. Vendor of x86 and x86-64 processor microarchitectures. - * - * Processors are designed by Centaur Technology, a subsidiary of VIA Technologies. - */ - cpuinfo_vendor_via = 11, - /** Cavium, Inc. Vendor of ARM64 processor microarchitectures. */ - cpuinfo_vendor_cavium = 12, - /** Broadcom, Inc. Vendor of ARM processor microarchitectures. */ - cpuinfo_vendor_broadcom = 13, - /** Applied Micro Circuits Corporation (APM). Vendor of ARM64 processor microarchitectures. */ - cpuinfo_vendor_apm = 14, - /** - * Huawei Technologies Co., Ltd. Vendor of ARM64 processor microarchitectures. - * - * Processors are designed by HiSilicon, a subsidiary of Huawei. - */ - cpuinfo_vendor_huawei = 15, - - /* Active vendors of embedded CPUs */ - - /** Texas Instruments Inc. Vendor of ARM processor microarchitectures. */ - cpuinfo_vendor_texas_instruments = 30, - /** Marvell Technology Group Ltd. Vendor of ARM processor microarchitectures. */ - cpuinfo_vendor_marvell = 31, - /** RDC Semiconductor Co., Ltd. Vendor of x86 processor microarchitectures. */ - cpuinfo_vendor_rdc = 32, - /** DM&P Electronics Inc. Vendor of x86 processor microarchitectures. */ - cpuinfo_vendor_dmp = 33, - /** Motorola, Inc. Vendor of PowerPC and ARM processor microarchitectures. */ - cpuinfo_vendor_motorola = 34, - - /* Defunct CPU vendors */ - - /** - * Transmeta Corporation. Vendor of x86 processor microarchitectures. - * - * Now defunct. The last processor design was released in 2004. - * Transmeta processors implemented VLIW ISA and used binary translation to execute x86 code. - */ - cpuinfo_vendor_transmeta = 50, - /** - * Cyrix Corporation. Vendor of x86 processor microarchitectures. - * - * Now defunct. The last processor design was released in 1996. - */ - cpuinfo_vendor_cyrix = 51, - /** - * Rise Technology. Vendor of x86 processor microarchitectures. - * - * Now defunct. The last processor design was released in 1999. - */ - cpuinfo_vendor_rise = 52, - /** - * National Semiconductor. Vendor of x86 processor microarchitectures. - * - * Sold its x86 design subsidiary in 1999. The last processor design was released in 1998. - */ - cpuinfo_vendor_nsc = 53, - /** - * Silicon Integrated Systems. Vendor of x86 processor microarchitectures. - * - * Sold its x86 design subsidiary in 2001. The last processor design was released in 2001. - */ - cpuinfo_vendor_sis = 54, - /** - * NexGen. Vendor of x86 processor microarchitectures. - * - * Now defunct.
The last processor design was released in 1994. - * NexGen designed the first x86 microarchitecture which decomposed x86 instructions into simple microoperations. - */ - cpuinfo_vendor_nexgen = 55, - /** - * United Microelectronics Corporation. Vendor of x86 processor microarchitectures. - * - * Ceased x86 in the early 1990s. The last processor design was released in 1991. - * Designed U5C and U5D processors. Both are 486 level. - */ - cpuinfo_vendor_umc = 56, - /** - * Digital Equipment Corporation. Vendor of ARM processor microarchitecture. - * - * Sold its ARM designs in 1997. The last processor design was released in 1997. - */ - cpuinfo_vendor_dec = 57, -}; - -/** - * Processor microarchitecture - * - * Processors with different microarchitectures often have different instruction performance characteristics, - * and may have dramatically different pipeline organization. - */ -enum cpuinfo_uarch { - /** Microarchitecture is unknown, or the library failed to get information about the microarchitecture from OS */ - cpuinfo_uarch_unknown = 0, - - /** Pentium and Pentium MMX microarchitecture. */ - cpuinfo_uarch_p5 = 0x00100100, - /** Intel Quark microarchitecture. */ - cpuinfo_uarch_quark = 0x00100101, - - /** Pentium Pro, Pentium II, and Pentium III. */ - cpuinfo_uarch_p6 = 0x00100200, - /** Pentium M. */ - cpuinfo_uarch_dothan = 0x00100201, - /** Intel Core microarchitecture. */ - cpuinfo_uarch_yonah = 0x00100202, - /** Intel Core 2 microarchitecture on 65 nm process. */ - cpuinfo_uarch_conroe = 0x00100203, - /** Intel Core 2 microarchitecture on 45 nm process. */ - cpuinfo_uarch_penryn = 0x00100204, - /** Intel Nehalem and Westmere microarchitectures (Core i3/i5/i7 1st gen). */ - cpuinfo_uarch_nehalem = 0x00100205, - /** Intel Sandy Bridge microarchitecture (Core i3/i5/i7 2nd gen). */ - cpuinfo_uarch_sandy_bridge = 0x00100206, - /** Intel Ivy Bridge microarchitecture (Core i3/i5/i7 3rd gen). */ - cpuinfo_uarch_ivy_bridge = 0x00100207, - /** Intel Haswell microarchitecture (Core i3/i5/i7 4th gen). */ - cpuinfo_uarch_haswell = 0x00100208, - /** Intel Broadwell microarchitecture. */ - cpuinfo_uarch_broadwell = 0x00100209, - /** Intel Sky Lake microarchitecture. */ - cpuinfo_uarch_sky_lake = 0x0010020A, - /** Intel Kaby Lake microarchitecture. */ - cpuinfo_uarch_kaby_lake = 0x0010020B, - - /** Pentium 4 with Willamette, Northwood, or Foster cores. */ - cpuinfo_uarch_willamette = 0x00100300, - /** Pentium 4 with Prescott and later cores. */ - cpuinfo_uarch_prescott = 0x00100301, - - /** Intel Atom on 45 nm process. */ - cpuinfo_uarch_bonnell = 0x00100400, - /** Intel Atom on 32 nm process. */ - cpuinfo_uarch_saltwell = 0x00100401, - /** Intel Silvermont microarchitecture (22 nm out-of-order Atom). */ - cpuinfo_uarch_silvermont = 0x00100402, - /** Intel Airmont microarchitecture (14 nm out-of-order Atom). */ - cpuinfo_uarch_airmont = 0x00100403, - - /** Intel Knights Ferry HPC boards. */ - cpuinfo_uarch_knights_ferry = 0x00100500, - /** Intel Knights Corner HPC boards (aka Xeon Phi). */ - cpuinfo_uarch_knights_corner = 0x00100501, - /** Intel Knights Landing microarchitecture (second-gen MIC). */ - cpuinfo_uarch_knights_landing = 0x00100502, - /** Intel Knights Hill microarchitecture (third-gen MIC). */ - cpuinfo_uarch_knights_hill = 0x00100503, - /** Intel Knights Mill Xeon Phi. */ - cpuinfo_uarch_knights_mill = 0x00100504, - - /** Intel/Marvell XScale series. */ - cpuinfo_uarch_xscale = 0x00100600, - - /** AMD K5. */ - cpuinfo_uarch_k5 = 0x00200100, - /** AMD K6 and alike. 
*/ - cpuinfo_uarch_k6 = 0x00200101, - /** AMD Athlon and Duron. */ - cpuinfo_uarch_k7 = 0x00200102, - /** AMD Athlon 64, Opteron 64. */ - cpuinfo_uarch_k8 = 0x00200103, - /** AMD Family 10h (Barcelona, Istanbul, Magny-Cours). */ - cpuinfo_uarch_k10 = 0x00200104, - /** - * AMD Bulldozer microarchitecture - * Zambezi FX-series CPUs, Zurich, Valencia and Interlagos Opteron CPUs. - */ - cpuinfo_uarch_bulldozer = 0x00200105, - /** - * AMD Piledriver microarchitecture - * Vishera FX-series CPUs, Trinity and Richland APUs, Delhi, Seoul, Abu Dhabi Opteron CPUs. - */ - cpuinfo_uarch_piledriver = 0x00200106, - /** AMD Steamroller microarchitecture (Kaveri APUs). */ - cpuinfo_uarch_steamroller = 0x00200107, - /** AMD Excavator microarchitecture (Carrizo APUs). */ - cpuinfo_uarch_excavator = 0x00200108, - /** AMD Zen microarchitecture (Ryzen CPUs). */ - cpuinfo_uarch_zen = 0x00200109, - - /** NSC Geode and AMD Geode GX and LX. */ - cpuinfo_uarch_geode = 0x00200200, - /** AMD Bobcat mobile microarchitecture. */ - cpuinfo_uarch_bobcat = 0x00200201, - /** AMD Jaguar mobile microarchitecture. */ - cpuinfo_uarch_jaguar = 0x00200202, - /** AMD Puma mobile microarchitecture. */ - cpuinfo_uarch_puma = 0x00200203, - - /** ARM7 series. */ - cpuinfo_uarch_arm7 = 0x00300100, - /** ARM9 series. */ - cpuinfo_uarch_arm9 = 0x00300101, - /** ARM 1136, ARM 1156, ARM 1176, or ARM 11MPCore. */ - cpuinfo_uarch_arm11 = 0x00300102, - - /** ARM Cortex-A5. */ - cpuinfo_uarch_cortex_a5 = 0x00300205, - /** ARM Cortex-A7. */ - cpuinfo_uarch_cortex_a7 = 0x00300207, - /** ARM Cortex-A8. */ - cpuinfo_uarch_cortex_a8 = 0x00300208, - /** ARM Cortex-A9. */ - cpuinfo_uarch_cortex_a9 = 0x00300209, - /** ARM Cortex-A12. */ - cpuinfo_uarch_cortex_a12 = 0x00300212, - /** ARM Cortex-A15. */ - cpuinfo_uarch_cortex_a15 = 0x00300215, - /** ARM Cortex-A17. */ - cpuinfo_uarch_cortex_a17 = 0x00300217, - - /** ARM Cortex-A32. */ - cpuinfo_uarch_cortex_a32 = 0x00300332, - /** ARM Cortex-A35. */ - cpuinfo_uarch_cortex_a35 = 0x00300335, - /** ARM Cortex-A53. */ - cpuinfo_uarch_cortex_a53 = 0x00300353, - /** ARM Cortex-A55. */ - cpuinfo_uarch_cortex_a55 = 0x00300355, - /** ARM Cortex-A57. */ - cpuinfo_uarch_cortex_a57 = 0x00300357, - /** ARM Cortex-A72. */ - cpuinfo_uarch_cortex_a72 = 0x00300372, - /** ARM Cortex-A73. */ - cpuinfo_uarch_cortex_a73 = 0x00300373, - /** ARM Cortex-A75. */ - cpuinfo_uarch_cortex_a75 = 0x00300375, - /** ARM Cortex-A76. */ - cpuinfo_uarch_cortex_a76 = 0x00300376, - - /** Qualcomm Scorpion. */ - cpuinfo_uarch_scorpion = 0x00400100, - /** Qualcomm Krait. */ - cpuinfo_uarch_krait = 0x00400101, - /** Qualcomm Kryo. */ - cpuinfo_uarch_kryo = 0x00400102, - /** Qualcomm Falkor. */ - cpuinfo_uarch_falkor = 0x00400103, - /** Qualcomm Saphira. */ - cpuinfo_uarch_saphira = 0x00400104, - - /** Nvidia Denver. */ - cpuinfo_uarch_denver = 0x00500100, - /** Nvidia Denver 2. */ - cpuinfo_uarch_denver2 = 0x00500101, - /** Nvidia Carmel. */ - cpuinfo_uarch_carmel = 0x00500102, - - /** Samsung Mongoose M1 (Exynos 8890 big cores). */ - cpuinfo_uarch_mongoose_m1 = 0x00600100, - /** Samsung Mongoose M2 (Exynos 8895 big cores). */ - cpuinfo_uarch_mongoose_m2 = 0x00600101, - /** Samsung Meerkat M3 (Exynos 9810 big cores). */ - cpuinfo_uarch_meerkat_m3 = 0x00600102, - - /** Apple A6 and A6X processors. */ - cpuinfo_uarch_swift = 0x00700100, - /** Apple A7 processor. */ - cpuinfo_uarch_cyclone = 0x00700101, - /** Apple A8 and A8X processor. */ - cpuinfo_uarch_typhoon = 0x00700102, - /** Apple A9 and A9X processor.
*/ - cpuinfo_uarch_twister = 0x00700103, - /** Apple A10 and A10X processor. */ - cpuinfo_uarch_hurricane = 0x00700104, - /** Apple A11 processor (big cores). */ - cpuinfo_uarch_monsoon = 0x00700105, - /** Apple A11 processor (little cores). */ - cpuinfo_uarch_mistral = 0x00700106, - - /** Cavium ThunderX. */ - cpuinfo_uarch_thunderx = 0x00800100, - /** Cavium ThunderX2 (originally Broadcom Vulkan). */ - cpuinfo_uarch_thunderx2 = 0x00800200, - - /** Marvell PJ4. */ - cpuinfo_uarch_pj4 = 0x00900100, - - /** Broadcom Brahma B15. */ - cpuinfo_uarch_brahma_b15 = 0x00A00100, - /** Broadcom Brahma B53. */ - cpuinfo_uarch_brahma_b53 = 0x00A00101, - - /** Applied Micro X-Gene. */ - cpuinfo_uarch_xgene = 0x00B00100, -}; - -struct cpuinfo_processor { - /** SMT (hyperthread) ID within a core */ - uint32_t smt_id; - /** Core containing this logical processor */ - const struct cpuinfo_core* core; - /** Cluster of cores containing this logical processor */ - const struct cpuinfo_cluster* cluster; - /** Physical package containing this logical processor */ - const struct cpuinfo_package* package; -#if defined(__linux__) - /** - * Linux-specific ID for the logical processor: - * - Linux kernel exposes information about this logical processor in /sys/devices/system/cpu/cpu<linux_id>/ - * - Bit <linux_id> in the cpu_set_t identifies this logical processor - */ - int linux_id; -#endif -#if defined(_WIN32) - /** Windows-specific ID for the group containing the logical processor. */ - uint16_t windows_group_id; - /** - * Windows-specific ID of the logical processor within its group: - * - Bit <windows_processor_id> in the KAFFINITY mask identifies this logical processor within its group. - */ - uint16_t windows_processor_id; -#endif -#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - /** APIC ID (unique x86-specific ID of the logical processor) */ - uint32_t apic_id; -#endif - struct { - /** Level 1 instruction cache */ - const struct cpuinfo_cache* l1i; - /** Level 1 data cache */ - const struct cpuinfo_cache* l1d; - /** Level 2 unified or data cache */ - const struct cpuinfo_cache* l2; - /** Level 3 unified or data cache */ - const struct cpuinfo_cache* l3; - /** Level 4 unified or data cache */ - const struct cpuinfo_cache* l4; - } cache; -}; - -struct cpuinfo_core { - /** Index of the first logical processor on this core. */ - uint32_t processor_start; - /** Number of logical processors on this core */ - uint32_t processor_count; - /** Core ID within a package */ - uint32_t core_id; - /** Cluster containing this core */ - const struct cpuinfo_cluster* cluster; - /** Physical package containing this core.
*/ - const struct cpuinfo_package* package; - /** Vendor of the CPU microarchitecture for this core */ - enum cpuinfo_vendor vendor; - /** CPU microarchitecture for this core */ - enum cpuinfo_uarch uarch; -#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - /** Value of CPUID leaf 1 EAX register for this core */ - uint32_t cpuid; -#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - /** Value of Main ID Register (MIDR) for this core */ - uint32_t midr; -#endif - /** Clock rate (non-Turbo) of the core, in Hz */ - uint64_t frequency; -}; - -struct cpuinfo_cluster { - /** Index of the first logical processor in the cluster */ - uint32_t processor_start; - /** Number of logical processors in the cluster */ - uint32_t processor_count; - /** Index of the first core in the cluster */ - uint32_t core_start; - /** Number of cores on the cluster */ - uint32_t core_count; - /** Cluster ID within a package */ - uint32_t cluster_id; - /** Physical package containing the cluster */ - const struct cpuinfo_package* package; - /** CPU microarchitecture vendor of the cores in the cluster */ - enum cpuinfo_vendor vendor; - /** CPU microarchitecture of the cores in the cluster */ - enum cpuinfo_uarch uarch; -#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - /** Value of CPUID leaf 1 EAX register of the cores in the cluster */ - uint32_t cpuid; -#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - /** Value of Main ID Register (MIDR) of the cores in the cluster */ - uint32_t midr; -#endif - /** Clock rate (non-Turbo) of the cores in the cluster, in Hz */ - uint64_t frequency; -}; - -#define CPUINFO_PACKAGE_NAME_MAX 48 -#define CPUINFO_GPU_NAME_MAX 64 - -struct cpuinfo_package { - /** SoC or processor chip model name */ - char name[CPUINFO_PACKAGE_NAME_MAX]; -#if defined(__ANDROID__) || (defined(__APPLE__) && TARGET_OS_IPHONE) - /** Integrated GPU model name */ - char gpu_name[CPUINFO_GPU_NAME_MAX]; -#endif - /** Index of the first logical processor on this physical package */ - uint32_t processor_start; - /** Number of logical processors on this physical package */ - uint32_t processor_count; - /** Index of the first core on this physical package */ - uint32_t core_start; - /** Number of cores on this physical package */ - uint32_t core_count; - /** Index of the first cluster of cores on this physical package */ - uint32_t cluster_start; - /** Number of clusters of cores on this physical package */ - uint32_t cluster_count; -}; - -#ifdef __cplusplus -extern "C" { -#endif - -bool CPUINFO_ABI cpuinfo_initialize(void); - -void CPUINFO_ABI cpuinfo_deinitialize(void); - -#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - /* This structure is not a part of stable API. Use cpuinfo_has_x86_* functions instead. 
*/ - struct cpuinfo_x86_isa { - #if CPUINFO_ARCH_X86 - bool rdtsc; - #endif - bool rdtscp; - bool rdpid; - bool sysenter; - #if CPUINFO_ARCH_X86 - bool syscall; - #endif - bool msr; - bool clzero; - bool clflush; - bool clflushopt; - bool mwait; - bool mwaitx; - #if CPUINFO_ARCH_X86 - bool emmx; - #endif - bool fxsave; - bool xsave; - #if CPUINFO_ARCH_X86 - bool fpu; - bool mmx; - bool mmx_plus; - #endif - bool three_d_now; - bool three_d_now_plus; - #if CPUINFO_ARCH_X86 - bool three_d_now_geode; - #endif - bool prefetch; - bool prefetchw; - bool prefetchwt1; - #if CPUINFO_ARCH_X86 - bool daz; - bool sse; - bool sse2; - #endif - bool sse3; - bool ssse3; - bool sse4_1; - bool sse4_2; - bool sse4a; - bool misaligned_sse; - bool avx; - bool fma3; - bool fma4; - bool xop; - bool f16c; - bool avx2; - bool avx512f; - bool avx512pf; - bool avx512er; - bool avx512cd; - bool avx512dq; - bool avx512bw; - bool avx512vl; - bool avx512ifma; - bool avx512vbmi; - bool avx512vbmi2; - bool avx512bitalg; - bool avx512vpopcntdq; - bool avx512vnni; - bool avx512_4vnniw; - bool avx512_4fmaps; - bool hle; - bool rtm; - bool xtest; - bool mpx; - #if CPUINFO_ARCH_X86 - bool cmov; - bool cmpxchg8b; - #endif - bool cmpxchg16b; - bool clwb; - bool movbe; - #if CPUINFO_ARCH_X86_64 - bool lahf_sahf; - #endif - bool fs_gs_base; - bool lzcnt; - bool popcnt; - bool tbm; - bool bmi; - bool bmi2; - bool adx; - bool aes; - bool vaes; - bool pclmulqdq; - bool vpclmulqdq; - bool gfni; - bool rdrand; - bool rdseed; - bool sha; - bool rng; - bool ace; - bool ace2; - bool phe; - bool pmm; - bool lwp; - }; - - extern struct cpuinfo_x86_isa cpuinfo_isa; -#endif - -static inline bool cpuinfo_has_x86_rdtsc(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.rdtsc; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_rdtscp(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rdtscp; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_rdpid(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rdpid; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_clzero(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.clzero; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_mwait(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.mwait; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_mwaitx(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.mwaitx; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_fxsave(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.fxsave; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_xsave(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.xsave; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_fpu(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.fpu; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_mmx(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.mmx; - #endif - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_x86_mmx_plus(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.mmx_plus; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_3dnow(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.three_d_now; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_3dnow_plus(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.three_d_now_plus; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_3dnow_geode(void) { - #if CPUINFO_ARCH_X86_64 - return false; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return false; - #else - return cpuinfo_isa.three_d_now_geode; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_prefetch(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.prefetch; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_prefetchw(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.prefetchw; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_prefetchwt1(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.prefetchwt1; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_daz(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.daz; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse2(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse2; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse3(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse3; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_ssse3(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.ssse3; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse4_1(void) { - #if CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse4_1; - #endif - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.sse4_1; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse4_2(void) { - #if CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse4_2; - #endif - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.sse4_2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse4a(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.sse4a; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_misaligned_sse(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.misaligned_sse; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx; - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_x86_fma3(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.fma3; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_fma4(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.fma4; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_xop(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.xop; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_f16c(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.f16c; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx2(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512f(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512f; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512pf(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512pf; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512er(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512er; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512cd(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512cd; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512dq(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512dq; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512bw(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512bw; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vl(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vl; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512ifma(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512ifma; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vbmi(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vbmi; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vbmi2(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vbmi2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512bitalg(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512bitalg; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vpopcntdq(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vpopcntdq; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vnni(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vnni; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512_4vnniw(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512_4vnniw; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512_4fmaps(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512_4fmaps; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_hle(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.hle; - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_x86_rtm(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rtm; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_xtest(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.xtest; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_mpx(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.mpx; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_cmov(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.cmov; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_cmpxchg8b(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.cmpxchg8b; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_cmpxchg16b(void) { - #if CPUINFO_ARCH_X86_64 - return cpuinfo_isa.cmpxchg16b; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_clwb(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.clwb; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_movbe(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.movbe; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_lahf_sahf(void) { - #if CPUINFO_ARCH_X86 - return true; - #elif CPUINFO_ARCH_X86_64 - return cpuinfo_isa.lahf_sahf; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_lzcnt(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.lzcnt; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_popcnt(void) { - #if CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.popcnt; - #endif - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.popcnt; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_tbm(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.tbm; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_bmi(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.bmi; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_bmi2(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.bmi2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_adx(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.adx; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_aes(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.aes; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_vaes(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.vaes; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_pclmulqdq(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.pclmulqdq; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_vpclmulqdq(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.vpclmulqdq; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_gfni(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.gfni; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_rdrand(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rdrand; - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_x86_rdseed(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rdseed; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sha(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.sha; - #else - return false; - #endif -} - -#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - /* This structure is not a part of stable API. Use cpuinfo_has_arm_* functions instead. */ - struct cpuinfo_arm_isa { - #if CPUINFO_ARCH_ARM - bool thumb; - bool thumb2; - bool thumbee; - bool jazelle; - bool armv5e; - bool armv6; - bool armv6k; - bool armv7; - bool armv7mp; - bool idiv; - - bool vfpv2; - bool vfpv3; - bool d32; - bool fp16; - bool fma; - - bool wmmx; - bool wmmx2; - bool neon; - #endif - #if CPUINFO_ARCH_ARM64 - bool atomics; - #endif - bool rdm; - bool fp16arith; - bool jscvt; - bool fcma; - - bool aes; - bool sha1; - bool sha2; - bool pmull; - bool crc32; - }; - - extern struct cpuinfo_arm_isa cpuinfo_isa; -#endif - -static inline bool cpuinfo_has_arm_thumb(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.thumb; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_thumb2(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.thumb2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v5e(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv5e; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v6(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv6; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v6k(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv6k; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v7(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv7; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v7mp(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv7mp; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_idiv(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.idiv; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv2(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv3(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv3_d32(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.d32; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv3_fp16(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.fp16; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv3_fp16_d32(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.fp16 && cpuinfo_isa.d32; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv4(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.fma; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv4_d32(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.fma && cpuinfo_isa.d32; - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_arm_wmmx(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.wmmx; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_wmmx2(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.wmmx2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.neon; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon_fp16(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.neon && cpuinfo_isa.fp16; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon_fma(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.neon && cpuinfo_isa.fma; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_atomics(void) { - #if CPUINFO_ARCH_ARM64 - return cpuinfo_isa.atomics; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon_rdm(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.rdm; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon_fp16_arith(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.neon && cpuinfo_isa.fp16arith; - #elif CPUINFO_ARCH_ARM64 - return cpuinfo_isa.fp16arith; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_fp16_arith(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.fp16arith; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_jscvt(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.jscvt; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_fcma(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.fcma; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_aes(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.aes; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_sha1(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.sha1; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_sha2(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.sha2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_pmull(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.pmull; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_crc32(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.crc32; - #else - return false; - #endif -} - -const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_processors(void); -const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_cores(void); -const struct cpuinfo_cluster* CPUINFO_ABI cpuinfo_get_clusters(void); -const struct cpuinfo_package* CPUINFO_ABI cpuinfo_get_packages(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1i_caches(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1d_caches(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l2_caches(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l3_caches(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l4_caches(void); - -const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_processor(uint32_t index); -const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_core(uint32_t index); -const struct cpuinfo_cluster* CPUINFO_ABI cpuinfo_get_cluster(uint32_t index); -const struct 
cpuinfo_package* CPUINFO_ABI cpuinfo_get_package(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1i_cache(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1d_cache(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l2_cache(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l3_cache(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l4_cache(uint32_t index); - -uint32_t CPUINFO_ABI cpuinfo_get_processors_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_cores_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_clusters_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_packages_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l1i_caches_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l1d_caches_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l2_caches_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l3_caches_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l4_caches_count(void); - -const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_current_processor(void); -const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_current_core(void); - -// Patch to avoid "error: must use 'struct' tag to refer to type 'cpuinfo_processor'" -// in Nim code. Also we can't reuse the struct name for a typedef, -// as it would conflict with the extern struct in api.h -typedef struct cpuinfo_cache cpuinfo_cache_exported; -typedef struct cpuinfo_processor cpuinfo_processor_exported; -typedef struct cpuinfo_core cpuinfo_core_exported; -typedef struct cpuinfo_cluster cpuinfo_cluster_exported; -typedef struct cpuinfo_package cpuinfo_package_exported; - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* CPUINFO_H */ diff --git a/benchmarks/cpuinfo.nim b/benchmarks/cpuinfo.nim deleted file mode 100644 index 6411937..0000000 --- a/benchmarks/cpuinfo.nim +++ /dev/null @@ -1,371 +0,0 @@ -# Original cpuinfo.h header -# Copyright (c) 2017-2018 Facebook Inc. -# Copyright (C) 2012-2017 Georgia Institute of Technology -# Copyright (C) 2010-2012 Marat Dukhan -# -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# Nim wrapper - Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms. - -from strutils import split, rsplit -from os import DirSep - -const - curSrcFolder = currentSourcePath.rsplit(DirSep, 1)[0] - cpuinfoPath = curSrcFolder & DirSep & "cpuinfo" & DirSep - # We use a patched header as the original one doesn't typedef its structs, - # which leads to "error: must use 'struct' tag to refer to type 'cpuinfo_processor'" - # headerPath = cpuinfoPath & DirSep & "include" & DirSep & "cpuinfo.h" - headerPath = curSrcFolder & DirSep & "cpuinfo.h" - -########################################### -############### Public API ################ - -{.pragma:cpuinfo_type, header: headerPath, bycopy.} - -# Check compiler-defined consts in: -# - https://github.com/nim-lang/Nim/blob/devel/compiler/platform.nim - -type - CPUInfo_cache* {.importc: "cpuinfo_cache_exported", cpuinfo_type.} = object - size* {.importc.}: uint32 - associativity* {.importc.}: uint32 - sets* {.importc.}: uint32 - partitions* {.importc.}: uint32 - line_size* {.importc.}: uint32 - flags* {.importc.}: uint32 - processor_start* {.importc.}: uint32 - processor_count* {.importc.}: uint32 - - ProcCache* {.bycopy.} = object - l1i*: ptr CPUInfo_cache - l1d*: ptr CPUInfo_cache - l2*: ptr CPUInfo_cache - l3*: ptr CPUInfo_cache - l4*: ptr CPUInfo_cache - - CPUInfo_processor* {.importc: "cpuinfo_processor_exported", cpuinfo_type.} = object - smt_id* {.importc.}: uint32 - core* {.importc.}: ptr CPUInfo_core - cluster* {.importc.}: ptr CPUInfo_cluster - package* {.importc.}: ptr CPUInfo_package - cache* {.importc.}: ptr ProcCache - - CPUInfo_core* {.importc: "cpuinfo_core_exported", cpuinfo_type.} = object - processor_start* {.importc.}: uint32 - processor_count* {.importc.}: uint32 - core_id* {.importc.}: uint32 - cluster* {.importc.}: ptr CPUInfo_cluster - package* {.importc.}: ptr CPUInfo_package - vendor* {.importc.}: CPUInfo_vendor - uarch* {.importc.}: CPUInfo_uarch - when defined(i386) or defined(amd64): - cpuid* {.importc.}: uint32 - when defined(arm) or defined(arm64): - midr* {.importc.}: uint32 - frequency* {.importc.}: uint64 - - CPUInfo_cluster* {.importc: "cpuinfo_cluster_exported", cpuinfo_type.} = object - processor_start* {.importc.}: uint32 - processor_count* {.importc.}: uint32 - core_start* {.importc.}: uint32 - core_count* {.importc.}: uint32 - cluster_id* {.importc.}: uint32 - package* {.importc.}: ptr CPUInfo_package - vendor* {.importc.}: CPUInfo_vendor - uarch* {.importc.}: CPUInfo_uarch - when defined(i386) or defined(amd64): - cpuid* {.importc.}: uint32 - when defined(arm) or defined(arm64): - midr* {.importc.}: uint32 - frequency* {.importc.}: uint64 - - CPUInfo_package* {.importc: "cpuinfo_package_exported", cpuinfo_type.} = object - name* {.importc.}: array[48, char] - when defined(android) or defined(ios): - # Make sure iOS is defined - https://github.com/nim-lang/Nim/issues/9369 - gpu_name* {.importc.}: array[64, char] - processor_start* {.importc.}: uint32 - processor_count* {.importc.}: uint32 - core_start* {.importc.}: uint32 - core_count* {.importc.}: uint32 - cluster_start* {.importc.}: uint32 - cluster_count* {.importc.}: uint32 - - CPUInfo_vendor* {.size: sizeof(cint).} = enum - ## * Processor vendor is not known to the library, or the library failed to get vendor information from the OS.
- cpuinfo_vendor_unknown = 0, - ## Active vendors of modern CPUs - cpuinfo_vendor_intel = 1, - cpuinfo_vendor_amd = 2, - cpuinfo_vendor_arm = 3, - cpuinfo_vendor_qualcomm = 4, - cpuinfo_vendor_apple = 5, - cpuinfo_vendor_samsung = 6, - cpuinfo_vendor_nvidia = 7, - cpuinfo_vendor_mips = 8, - cpuinfo_vendor_ibm = 9, - cpuinfo_vendor_ingenic = 10, - cpuinfo_vendor_via = 11, - cpuinfo_vendor_cavium = 12, - cpuinfo_vendor_broadcom = 13, - cpuinfo_vendor_apm = 14, # Applied Micro Circuits Corporation (APM) - cpuinfo_vendor_huawei = 15, - cpuinfo_vendor_texas_instruments = 30, - cpuinfo_vendor_marvell = 31, - cpuinfo_vendor_rdc = 32, # RDC Semiconductor Co. - cpuinfo_vendor_dmp = 33, # DM&P Electronics Inc. - cpuinfo_vendor_motorola = 34, - cpuinfo_vendor_transmeta = 50, - cpuinfo_vendor_cyrix = 51, - cpuinfo_vendor_rise = 52, - cpuinfo_vendor_nsc = 53, # National Semiconductor - cpuinfo_vendor_sis = 54, # Silicon Integrated Systems - cpuinfo_vendor_nexgen = 55, - cpuinfo_vendor_umc = 56, # United Microelectronics Corporation - cpuinfo_vendor_dec = 57 # Digital Equipment Corporation - - CPUInfo_uarch* {.size: sizeof(cint).} = enum - cpuinfo_uarch_unknown = 0, ## Microarchitecture is unknown, or the library failed to get information about the microarchitecture from OS - cpuinfo_uarch_p5 = 0x00100100, ## Pentium and Pentium MMX microarchitecture. - cpuinfo_uarch_quark = 0x00100101, ## Intel Quark microarchitecture. - cpuinfo_uarch_p6 = 0x00100200, ## Pentium Pro, Pentium II, and Pentium III. - cpuinfo_uarch_dothan = 0x00100201, ## Pentium M. - cpuinfo_uarch_yonah = 0x00100202, ## Intel Core microarchitecture. - cpuinfo_uarch_conroe = 0x00100203, ## Intel Core 2 microarchitecture on 65 nm process. - cpuinfo_uarch_penryn = 0x00100204, ## Intel Core 2 microarchitecture on 45 nm process. - cpuinfo_uarch_nehalem = 0x00100205, ## Intel Nehalem and Westmere microarchitectures (Core i3/i5/i7 1st gen). - cpuinfo_uarch_sandy_bridge = 0x00100206, ## Intel Sandy Bridge microarchitecture (Core i3/i5/i7 2nd gen). - cpuinfo_uarch_ivy_bridge = 0x00100207, ## Intel Ivy Bridge microarchitecture (Core i3/i5/i7 3rd gen). - cpuinfo_uarch_haswell = 0x00100208, ## Intel Haswell microarchitecture (Core i3/i5/i7 4th gen). - cpuinfo_uarch_broadwell = 0x00100209, ## Intel Broadwell microarchitecture. - cpuinfo_uarch_sky_lake = 0x0010020A, ## Intel Sky Lake microarchitecture. - cpuinfo_uarch_kaby_lake = 0x0010020B, ## Intel Kaby Lake microarchitecture. - cpuinfo_uarch_willamette = 0x00100300, ## Pentium 4 with Willamette, Northwood, or Foster cores. - cpuinfo_uarch_prescott = 0x00100301, ## Pentium 4 with Prescott and later cores. - cpuinfo_uarch_bonnell = 0x00100400, ## Intel Atom on 45 nm process. - cpuinfo_uarch_saltwell = 0x00100401, ## Intel Atom on 32 nm process. - cpuinfo_uarch_silvermont = 0x00100402, ## Intel Silvermont microarchitecture (22 nm out-of-order Atom). - cpuinfo_uarch_airmont = 0x00100403, ## Intel Airmont microarchitecture (14 nm out-of-order Atom). - cpuinfo_uarch_knights_ferry = 0x00100500, ## Intel Knights Ferry HPC boards. - cpuinfo_uarch_knights_corner = 0x00100501, ## Intel Knights Corner HPC boards (aka Xeon Phi). - cpuinfo_uarch_knights_landing = 0x00100502, ## Intel Knights Landing microarchitecture (second-gen MIC). - cpuinfo_uarch_knights_hill = 0x00100503, ## Intel Knights Hill microarchitecture (third-gen MIC). - cpuinfo_uarch_knights_mill = 0x00100504, ## Intel Knights Mill Xeon Phi. - cpuinfo_uarch_xscale = 0x00100600, ## Intel/Marvell XScale series. 
- cpuinfo_uarch_k5 = 0x00200100, ## AMD K5. - cpuinfo_uarch_k6 = 0x00200101, ## AMD K6 and alike. - cpuinfo_uarch_k7 = 0x00200102, ## AMD Athlon and Duron. - cpuinfo_uarch_k8 = 0x00200103, ## AMD Athlon 64, Opteron 64. - cpuinfo_uarch_k10 = 0x00200104, ## AMD Family 10h (Barcelona, Istanbul, Magny-Cours). - cpuinfo_uarch_bulldozer = 0x00200105, ## AMD Bulldozer microarchitecture. Zambezi FX-series CPUs, Zurich, Valencia and Interlagos Opteron CPUs. - cpuinfo_uarch_piledriver = 0x00200106, ## AMD Piledriver microarchitecture. Vishera FX-series CPUs, Trinity and Richland APUs, Delhi, Seoul, Abu Dhabi Opteron CPUs. - cpuinfo_uarch_steamroller = 0x00200107, ## AMD Steamroller microarchitecture (Kaveri APUs). - cpuinfo_uarch_excavator = 0x00200108, ## AMD Excavator microarchitecture (Carrizo APUs). - cpuinfo_uarch_zen = 0x00200109, ## AMD Zen microarchitecture (Ryzen CPUs). - cpuinfo_uarch_geode = 0x00200200, ## NSC Geode and AMD Geode GX and LX. - cpuinfo_uarch_bobcat = 0x00200201, ## AMD Bobcat mobile microarchitecture. - cpuinfo_uarch_jaguar = 0x00200202, ## AMD Jaguar mobile microarchitecture. - cpuinfo_uarch_puma = 0x00200203, ## AMD Puma mobile microarchitecture. - cpuinfo_uarch_arm7 = 0x00300100, ## ARM7 series. - cpuinfo_uarch_arm9 = 0x00300101, ## ARM9 series. - cpuinfo_uarch_arm11 = 0x00300102, ## ARM 1136, ARM 1156, ARM 1176, or ARM 11MPCore. - cpuinfo_uarch_cortex_a5 = 0x00300205, ## ARM Cortex-A5. - cpuinfo_uarch_cortex_a7 = 0x00300207, ## ARM Cortex-A7. - cpuinfo_uarch_cortex_a8 = 0x00300208, ## ARM Cortex-A8. - cpuinfo_uarch_cortex_a9 = 0x00300209, ## ARM Cortex-A9. - cpuinfo_uarch_cortex_a12 = 0x00300212, ## ARM Cortex-A12. - cpuinfo_uarch_cortex_a15 = 0x00300215, ## ARM Cortex-A15. - cpuinfo_uarch_cortex_a17 = 0x00300217, ## ARM Cortex-A17. - cpuinfo_uarch_cortex_a32 = 0x00300332, ## ARM Cortex-A32. - cpuinfo_uarch_cortex_a35 = 0x00300335, ## ARM Cortex-A35. - cpuinfo_uarch_cortex_a53 = 0x00300353, ## ARM Cortex-A53. - cpuinfo_uarch_cortex_a55 = 0x00300355, ## ARM Cortex-A55. - cpuinfo_uarch_cortex_a57 = 0x00300357, ## ARM Cortex-A57. - cpuinfo_uarch_cortex_a72 = 0x00300372, ## ARM Cortex-A72. - cpuinfo_uarch_cortex_a73 = 0x00300373, ## ARM Cortex-A73. - cpuinfo_uarch_cortex_a75 = 0x00300375, ## ARM Cortex-A75. - cpuinfo_uarch_cortex_a76 = 0x00300376, ## ARM Cortex-A76. - cpuinfo_uarch_scorpion = 0x00400100, ## Qualcomm Scorpion. - cpuinfo_uarch_krait = 0x00400101, ## Qualcomm Krait. - cpuinfo_uarch_kryo = 0x00400102, ## Qualcomm Kryo. - cpuinfo_uarch_falkor = 0x00400103, ## Qualcomm Falkor. - cpuinfo_uarch_saphira = 0x00400104, ## Qualcomm Saphira. - cpuinfo_uarch_denver = 0x00500100, ## Nvidia Denver. - cpuinfo_uarch_denver2 = 0x00500101, ## Nvidia Denver 2. - cpuinfo_uarch_carmel = 0x00500102, ## Nvidia Carmel. - cpuinfo_uarch_mongoose_m1 = 0x00600100, ## Samsung Mongoose M1 (Exynos 8890 big cores). - cpuinfo_uarch_mongoose_m2 = 0x00600101, ## Samsung Mongoose M2 (Exynos 8895 big cores). - cpuinfo_uarch_meerkat_m3 = 0x00600102, ## Samsung Meerkat M3 (Exynos 9810 big cores). - cpuinfo_uarch_swift = 0x00700100, ## Apple A6 and A6X processors. - cpuinfo_uarch_cyclone = 0x00700101, ## Apple A7 processor. - cpuinfo_uarch_typhoon = 0x00700102, ## Apple A8 and A8X processor. - cpuinfo_uarch_twister = 0x00700103, ## Apple A9 and A9X processor. - cpuinfo_uarch_hurricane = 0x00700104, ## Apple A10 and A10X processor. - cpuinfo_uarch_monsoon = 0x00700105, ## Apple A11 processor (big cores). - cpuinfo_uarch_mistral = 0x00700106, ## Apple A11 processor (little cores).
- cpuinfo_uarch_thunderx = 0x00800100, ## Cavium ThunderX. - cpuinfo_uarch_thunderx2 = 0x00800200, ## Cavium ThunderX2 (originally Broadcom Vulkan). - cpuinfo_uarch_pj4 = 0x00900100, ## Marvell PJ4. - cpuinfo_uarch_brahma_b15 = 0x00A00100, ## Broadcom Brahma B15. - cpuinfo_uarch_brahma_b53 = 0x00A00101, ## Broadcom Brahma B53. - cpuinfo_uarch_xgene = 0x00B00100 ## Applied Micro X-Gene. - -{.pragma: cpuinfo_proc, importc, header: headerPath, cdecl.} - -proc cpuinfo_initialize(): bool {.cpuinfo_proc.} -proc cpuinfo_deinitialize() {.cpuinfo_proc, noconv.} # noconv for addQuitProc - -proc cpuinfo_get_processors*(): ptr CPUInfo_processor {.cpuinfo_proc.} -proc cpuinfo_get_cores*(): ptr CPUInfo_core {.cpuinfo_proc.} -proc cpuinfo_get_clusters*(): ptr CPUInfo_cluster {.cpuinfo_proc.} -proc cpuinfo_get_packages*(): ptr CPUInfo_package {.cpuinfo_proc.} -proc cpuinfo_get_l1i_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l1d_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l2_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l3_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l4_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_processor*(index: uint32): ptr CPUInfo_processor {.cpuinfo_proc.} -proc cpuinfo_get_core*(index: uint32): ptr CPUInfo_core {.cpuinfo_proc.} -proc cpuinfo_get_cluster*(index: uint32): ptr CPUInfo_cluster {.cpuinfo_proc.} -proc cpuinfo_get_package*(index: uint32): ptr CPUInfo_package {.cpuinfo_proc.} -proc cpuinfo_get_l1i_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l1d_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l2_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l3_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l4_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_processors_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_cores_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_clusters_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_packages_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l1i_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l1d_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l2_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l3_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l4_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_current_processor*(): ptr CPUInfo_processor {.cpuinfo_proc.} -proc cpuinfo_get_current_core*(): ptr CPUInfo_core {.cpuinfo_proc.} - -########################################### -################# C files ################# - -# clog dependency -{.passC: "-I" & cpuinfoPath & "deps/clog/include".} -{.compile: cpuinfoPath & "deps/clog/src/clog.c".} - -# Headers - use the patched header with typedefs -# Also for some reason we need to passC in the same line -# Otherwise "curSrcFolder" is ignored -{.passC: "-I" & cpuinfoPath & "src -I" & curSrcFolder.} - -when defined(linux): - {.passC: "-D_GNU_SOURCE".} - {.passL: "-lpthread".} - -template compile(path: static string): untyped = - # Path: the path from cpuinfo/src folder - const compiled_object = block: - var obj_name = "cpuinfo" - for subPath in path.split(DirSep): - obj_name &= "_" & subPath - obj_name &= ".o" - obj_name - # we need to use relative paths https://github.com/nim-lang/Nim/issues/9370 - {.compile:("./cpuinfo/src/" & path, compiled_object).} - -when defined(arm) or defined(arm64): - when 
defined(android): - compile"arm/android/gpu.c" - compile"arm/android/properties.c" - elif defined(linux): - compile"arm/linux/aarch32-isa.c" - compile"arm/linux/aarch64-isa.c" - compile"arm/linux/chipset.c" - compile"arm/linux/clusters.c" - compile"arm/linux/cpuinfo.c" - compile"arm/linux/hwcap.c" - compile"arm/linux/init.c" - compile"arm/linux/midr.c" - elif defined(iOS): # we don't support GNU Hurd ¯\_(ツ)_/¯ - compile"arm/mach/init.c" - # iOS GPU - # compile"gpu/gles-ios.m" # TODO: Obj-C compilation - compile"arm/cache.c" - compile"arm/tlb.c" - compile"arm/uarch.c" - # ARM GPU - compile"gpu/gles2.c" - -when defined(linux): - compile"linux/cpulist.c" - compile"linux/current.c" - compile"linux/gpu.c" - compile"linux/multiline.c" - compile"linux/processors.c" - compile"linux/smallfile.c" - -when defined(iOS) or defined(macos) or defined(macosx): # we don't support GNU Hurd ¯\_(ツ)_/¯ - compile"mach/topology.c" - -when defined(i386) or defined(amd64): - compile"x86/cache/descriptor.c" - compile"x86/cache/deterministic.c" - compile"x86/cache/init.c" - when defined(linux): - compile"x86/linux/cpuinfo.c" - compile"x86/linux/init.c" - elif defined(iOS) or defined(macos) or defined(macosx): - compile"x86/mach/init.c" - # compile"src/x86/nacl/isa.c" # TODO: NaCl support - compile"x86/info.c" - compile"x86/init.c" - compile"x86/isa.c" - compile"x86/name.c" - compile"x86/topology.c" - compile"x86/uarch.c" - compile"x86/vendor.c" - -compile"api.c" -compile"init.c" - -########################################### -################# Runtime ################# - -if not cpuinfo_initialize(): - raise newException(LibraryError, "Could not initialize the cpuinfo module") -addQuitProc(cpuinfo_deinitialize) - -{.pragma: cpuinfo, cdecl, header: headerPath.} -func cpuinfo_has_x86_sse*(): bool {.cpuinfo.} -func cpuinfo_has_x86_sse2*(): bool {.cpuinfo.} -func cpuinfo_has_x86_sse3*(): bool {.cpuinfo.} -func cpuinfo_has_x86_sse4_1*(): bool {.cpuinfo.} -func cpuinfo_has_x86_avx*(): bool {.cpuinfo.} -func cpuinfo_has_x86_avx2*(): bool {.cpuinfo.} -func cpuinfo_has_x86_avx512f*(): bool {.cpuinfo.} - -func cpuinfo_has_x86_fma3*(): bool {.cpuinfo.} diff --git a/benchmarks/matmul/README.md b/benchmarks/matmul/README.md deleted file mode 100644 index 0fc6f7e..0000000 --- a/benchmarks/matmul/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Benchmarks versus State-of-the-Art Matrix Multiplication - -These benchmarks evaluate threading performance versus state-of-the-art -implementations of matrix multiplication. - -Those implementations, commonly known as GEMM (GEneralized Matrix Multiplication), -are part of a BLAS library, BLAS being a standard interface for vector and matrix linear algebra. - -Common implementations include: -- OpenBLAS (Assembly) -- Intel MKL (Assembly) -- Apple Accelerate -- BLIS -- ARM Performance Libraries - -which are highly tuned to exploit CPU L1 and L2 caches, prefetching, -SIMD vectorization and core parallelism. - -Memory locality has a significant impact on performance, -and threading frameworks should minimize cache misses and memory footprint. - -Note that the common naive benchmarking of matrix multiplication -with a triple for-loop (sketched below) can be 100x slower than optimized implementations -on decently sized matrices. This performance gap grows at an N^3 rate, assuming -we multiply two NxN matrices. - -[Laser](https://github.com/numforge/laser) implements a state-of-the-art -GEMM that reaches performance similar to OpenBLAS in pure Nim; parallelism -is obtained through OpenMP. 
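For reference, the naive triple-for-loop baseline mentioned above looks roughly like the following sketch (illustrative only; `naiveGemm` is a hypothetical name, not part of the benchmark code):

```nim
# Textbook O(M*N*K) matrix multiplication over row-major buffers.
# This is the baseline that tuned GEMM implementations beat by ~100x.
proc naiveGemm[T: SomeNumber](M, N, K: int; A, B: seq[T]; C: var seq[T]) =
  for i in 0 ..< M:            # A is M x K
    for j in 0 ..< N:          # B is K x N
      var acc = T(0)
      for k in 0 ..< K:
        acc += A[i*K + k] * B[k*N + j]
      C[i*N + j] = acc         # C is M x N
```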
- -We reuse the Laser implementation and switch the backend to Weave's scheduler -to measure overhead on a high-performance computing workload. - -Laser code as of Aug. 24, 2019 (https://github.com/numforge/laser/tree/af191c086b4a98c49049ecf18f5519dc6856cc77) - -`laser_gemm_backend` contains backend routines with no parallelism. - -`laser_utils` contains routines that are used across all Laser primitives. diff --git a/benchmarks/matmul/laser_gemm/gemm.nim b/benchmarks/matmul/laser_gemm/gemm.nim deleted file mode 100644 index a761120..0000000 --- a/benchmarks/matmul/laser_gemm/gemm.nim +++ /dev/null @@ -1,511 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ../../cpuinfo, - ../laser_utils/[compiler_optim_hints, openmp], - ../laser_gemm_backend/[ - gemm_tiling, gemm_utils, - gemm_ukernel_dispatch - ], - ./gemm_packing - -withCompilerOptimHints() - -# ############################################################ - # - # Optimized GEMM (Generalized Matrix-Multiplication) - # - # ############################################################ - -# Features -# - Arbitrary stride support -# - Efficient implementation (within 90% of the speed of OpenBLAS, more tuning expected) -# - Parallel, scaling linearly with the number of cores -# -# Future -# - Implementation extended to integers -# - ARM Neon optimisation -# - Small matrix multiply optimisation -# - Pre-packing when computing repeatedly with the same matrix -# - Batched matrix multiplication - -# Terminology -# - M, Matrix: Both dimensions are large or unknown -# - P, Panel: one of the dimensions is small -# - B, Block: both dimensions are small -# -# - GEMM: GEneralized Matrix-Matrix multiplication -# - GEPP: GEneralized Panel-Panel multiplication -# - GEBP: GEneralized Block-Panel multiplication (macrokernel) -# - GEBB: GEneralized Block-Block multiplication (microkernel) -# ... - -# ############################################################ -# -# GEBP Macrokernel -# -# ############################################################ - -proc gebp_mkernel*[T; ukernel: static MicroKernel]( - mc, nc, kc: int, - alpha: T, packA, packB: ptr UncheckedArray[T], - beta: T, - mcncC: MatrixView[T] - ) = - ## Macro kernel, multiplies: - ## - a block A[mc, kc] by a panel B[kc, N] - - # Since nr is small, this is a good place to parallelize - # See: Anatomy of High-Performance Many-Threaded Matrix Multiplication - # Smith et al - # - http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf - - # ⚠ We need to ensure that loop variables and pointers - # are private to each thread - - # Nim doesn't support arbitrary increment with OpenMP - # So we store indexing/edge case data in tiles - const - MR = ukernel.extract_mr - NR = ukernel.extract_nr - PT = ukernel.extract_pt - - # ##################################### - # 4. for jr = 0,...,nc−1 in steps of nr - for jr in `||`(0, nc-1, NR, "taskloop"): - let nr = min(nc - jr, NR) # C[ic:ic+mc, jc+jr:jc+jr+nr] - - # ################################### - # 5. 
for ir = 0,...,m−1 in steps of mr - for ir in countup(0, mc-1, MR): - let mr = min(mc - ir, MR) - let c_aux = mcncC.stride(ir, jr) # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr] - - let upanel_b = packB + jr*kc - prefetch(upanel_b, Read, ModerateTemporalLocality) - let upanel_a = packA + ir*kc - prefetch(upanel_a, Read, ModerateTemporalLocality) - - if nr == NR and mr == MR: - # General case - gebb_ukernel[T, ukernel]( # GEBB microkernel + epilogue - kc, # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr] = - alpha, upanel_a, upanel_b, # αA[ic+ir:ic+ir+mr, pc:pc+kc] * - beta, c_aux # B[pc:pc+kc, jc+jr:jc+jr+nr] + - ) # βC[ic:ic+mc, jc:jc+nc] - else: - # Matrix edges - gebb_ukernel_edge[T, ukernel]( # GEBB microkernel + epilogue - mr, nr, kc, # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr] = - alpha, upanel_a, upanel_b, # αA[ic+ir:ic+ir+mr, pc:pc+kc] * - beta, c_aux # B[pc:pc+kc, jc+jr:jc+jr+nr] + - ) # βC[ic:ic+mc, jc:jc+nc] - -# ########################################################################################### -# -# GEMM Internal Implementation -# -# ########################################################################################### - -proc gemm_impl[T; ukernel: static MicroKernel]( - M, N, K: int, - alpha: T, vA: MatrixView[T], vB: MatrixView[T], - beta: T, vC: MatrixView[T], - tiles: Tiles[T] - ) = - - # #################################################################### - # Loop partitioning - # - We parallelize around ic loop (partitions M dimension) - # - and jr loop (partitions N dimension) - # - # Currently the first loop nc = N is not partitioned. - # According to BLIS paper, it should be partitioned at socket level. - # This can be done with OpenMP using - # - # omp_set_nested(1); - # n_sockets = omp_get_num_places(); - # #pragma omp parallel num_threads(n_sockets) proc_bind(spread) - # { - # n_procs = omp_get_place_num_procs(omp_get_num_places()); - # #pragma omp parallel num_threads(n_procs) proc_bind(close) - # doStuff(); - # } - - # Hyperthreading will pollute the L1, L2 caches and the TLB - # as we intentionally choose parameters so that about - # half of the core caches is taken by micropanels of A and B. - # But somehow fixing num_threads to anything other than my number of logical threads - # kills my perf (and even also OpenBLAS when it's run at the same time) - - const PT = ukernel.extract_pt - let parallelize = M*N*K > PT*PT*PT - # let nb_threads = cpuinfo_get_cores_count() # get physical cores - - # #################################################################### - # 1. for jc = 0,...,n−1 in steps of nc - let nc = N # B[0:K, jc:jc+nc] - # C[0:M, jc:jc+nc] - # ###################################### - # 2. for pc = 0,...,k−1 in steps of kc - for pc in countup(0, K-1, tiles.kc): - prefetch(tiles.b, Write, LowTemporalLocality) - let kc = min(K - pc, tiles.kc) # Deal with edges # A[0:M, pc:pc+kc] - - let kcncB = vB.stride(pc, 0) # B[pc:pc+kc, jc:jc+nc] - pack_B_kc_nc[T, ukernel](tiles.b, kc, nc, kcncB) # PackB panel [kc, nc] (nc is large or unknown) - - # First time writing to C, we scale it, otherwise accumulate - let beta = if pc == 0: beta else: 1.T - - omp_parallel_if(parallelize): - # #################################### - # 3. 
for ic = 0,...,m−1 in steps of mc - omp_for(icb, tiles.ic_num_tasks, use_simd=false, nowait=true): - let packA = tiles.a + icb * tiles.upanelA_size - prefetch(packA, Write, LowTemporalLocality) - let ic = icb * tiles.mc - let mc = min(M-ic, tiles.mc) # C[ic:ic+mc, jc:jc+nc] - - let mckcA = vA.stride(ic, pc) # A[ic:ic+mc, pc:pc+kc] - pack_A_mc_kc[T, ukernel](packA, mc, kc, mckcA) # PackA block [mc, kc] - - gebp_mkernel[T, ukernel]( # GEBP macrokernel: - mc, nc, kc, # C[ic:ic+mc, jc:jc+nc] = - alpha, packA, tiles.b, # αA[ic:ic+mc, pc:pc+kc] * B[pc:pc+kc, jc:jc+nc] + - beta, vC.stride(ic, 0) # βC[ic:ic+mc, jc:jc+nc] - ) - -# ############################################################ -# -# Exported function and dispatch with CPU runtime detection -# -# ############################################################ - -proc gemm_strided*[T: SomeNumber]( - M, N, K: int, - alpha: T, - A: ptr T, - rowStrideA, colStrideA: int, - B: ptr T, - rowStrideB, colStrideB: int, - beta: T, - C: ptr T, - rowStrideC, colStrideC: int) = - - # TODO: shortcut alpha = 0 or K = 0 - # TODO: elementwise epilogue fusion like relu/tanh/sigmoid - # TODO: shortcut for small gemm - - # Create a view to abstract deling with strides - # and passing those in each proc - let vA = A.toMatrixView(rowStrideA, colStrideA) - let vB = B.toMatrixView(rowStrideB, colStrideB) - let vC = C.toMatrixView(rowStrideC, colStrideC) - - # Cache hierarchy: - # - block C: mr*nr registers - # - block B: kc*nr L1 cache - # - block A: mc*kc L2 cache - # - panel B: kc*nc L3 cache - - template dispatch(cpu_features: static CPUFeatureX86): untyped{.dirty.} = - template apply(ukernel: MicroKernel): untyped {.dirty.} = - let tiles = ukernel.newTiles(T, M, N, K) - gemm_impl[T, ukernel]( - M, N, K, - alpha, vA, vB, - beta, vC, - tiles - ) - return - if colStrideC == 1: - const ukernel = cpu_features.x86_ukernel(T, true) - apply(ukernel) - else: - const ukernel = cpu_features.x86_ukernel(T, false) - apply(ukernel) - - when defined(i386) or defined(amd64): - when T is float32: - if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512) - elif cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA) - elif cpuinfo_has_x86_avx(): dispatch(x86_AVX) - elif cpuinfo_has_x86_sse(): dispatch(x86_SSE) - elif T is float64: - if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512) - elif cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA) - elif cpuinfo_has_x86_avx(): dispatch(x86_AVX) - elif cpuinfo_has_x86_sse2(): dispatch(x86_SSE2) - elif T is int32 or T is uint32: - if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512) - elif cpuinfo_has_x86_avx2(): dispatch(x86_AVX2) - elif cpuinfo_has_x86_sse41(): dispatch(x86_SSE4_1) - elif cpuinfo_has_x86_sse2(): dispatch(x86_SSE2) - elif T is int64: - if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512) - elif cpuinfo_has_x86_sse2(): dispatch(x86_SSE2) - dispatch(x86_Generic) - -# ############################################################ -# -# Private tests -# -# ############################################################ - -when isMainModule: - # Tests - block: - let a = [[1.0, 2, 3], - [1.0, 1, 1], - [1.0, 1, 1]] - - let b = [[1.0, 1], - [1.0, 1], - [1.0, 1]] - - let ab = [[6.0, 6], - [3.0, 3], - [3.0, 3]] - - var res_ab: array[3, array[2, float]] - gemm_strided( - 3, 2, 3, - 1.0, a[0][0].unsafeAddr, 3, 1, - b[0][0].unsafeAddr, 2, 1, - 0.0, res_ab[0][0].addr, 2, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - let a = [[1.0, 2, 3], - [4.0, 5, 6], - [7.0, 8, 9]] - - let b = 
[[1.0, 1], - [1.0, 1], - [1.0, 1]] - - let ab = [[ 6.0, 6], - [15.0, 15], - [24.0, 24]] - - var res_ab: array[3, array[2, float]] - gemm_strided( - 3, 2, 3, - 1.0, a[0][0].unsafeAddr, 3, 1, - b[0][0].unsafeAddr, 2, 1, - 0.0, res_ab[0][0].addr, 2, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - let a = [[1.0,2,3], - [4.0,5,6]] - - let b = [[7.0, 8], - [9.0, 10], - [11.0,12]] - - let ab = [[ 58.0, 64], - [139.0,154]] - - var res_ab: array[2, array[2, float]] - gemm_strided( - 2, 2, 3, - 1.0, a[0][0].unsafeAddr, 3, 1, - b[0][0].unsafeAddr, 2, 1, - 0.0, res_ab[0][0].addr, 2, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - # example from http://www.intmath.com/matrices-determinants/matrix-multiplication-examples.php - echo "\n## (M x K) * (K x N) with M < N" - let a = [[-2,-3,-1], - [ 3, 0, 4]] - let b = [[ 1, 5, 2,-1], - [-3, 0, 3, 4], - [ 6,-2, 7,-4]] - - let ab = [[ 1,-8,-20, -6], - [27, 7, 34,-19]] - - var res_ab: array[2, array[4, int]] - gemm_strided( - 2, 4, 3, - 1, a[0][0].unsafeAddr, 3, 1, - b[0][0].unsafeAddr, 4, 1, - 0, res_ab[0][0].addr, 4, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - # from http://www.calcul.com/show/calculator/matrix-multiplication_;5;5;5;5?matrix1=[[%225%22,%226%22,%225%22,%228%22],[%228%22,%222%22,%228%22,%228%22],[%220%22,%225%22,%224%22,%220%22],[%224%22,%220%22,%225%22,%226%22],[%224%22,%225%22,%220%22,%223%22]]&matrix2=[[%225%22,%223%22,%226%22,%220%22],[%225%22,%222%22,%223%22,%223%22],[%228%22,%228%22,%222%22,%220%22],[%227%22,%227%22,%220%22,%220%22]]&operator=* - echo "\n## (M x K) * (K x N) with M > N and M > block-size (4x4)" - let a = [[5,6,5,8], - [8,2,8,8], - [0,5,4,0], - [4,0,5,6], - [4,5,0,3]] - let b = [[5,3,6,0], - [5,2,3,3], - [8,8,2,0], - [7,7,0,0]] - - let ab = [[151,123,58,18], - [170,148,70, 6], - [ 57, 42,23,15], - [102, 94,34, 0], - [ 66, 43,39,15]] - - var res_ab: array[5, array[4, int]] - gemm_strided( - 5, 4, 4, - 1, a[0][0].unsafeAddr, 4, 1, - b[0][0].unsafeAddr, 4, 1, - 0, res_ab[0][0].addr, 4, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - let a = [[2, 4, 3, 1, 3, 1, 3, 1], - [4, 3, 2, 4, 1, 0, 0, 0]] - - - let b = [[2, 2], - [2, 1], - [0, 3], - [0, 1], - [0, 2], - [4, 3], - [3, 3], - [2, 1]] - - let ab = [[27,37], - [14,23]] - - var res_ab: array[2, array[2, int]] - gemm_strided( - 2, 2, 8, - 1, a[0][0].unsafeAddr, 8, 1, - b[0][0].unsafeAddr, 2, 1, - 0, res_ab[0][0].addr, 2, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - let a = [[2, 1], - [1, 3], - [2, 1], - [1, 0], - [3, 4], - [2, 4], - [3, 1], - [4, 0]] - - - let b = [[2, 2, 0, 4, 0, 0, 4, 2], - [2, 1, 2, 1, 2, 4, 4, 1]] - - let ab = [[ 6, 5, 2, 9, 2, 4, 12, 5], - [ 8, 5, 6, 7, 6, 12, 16, 5], - [ 6, 5, 2, 9, 2, 4, 12, 5], - [ 2, 2, 0, 4, 0, 0, 4, 2], - [14, 10, 8, 16, 8, 16, 28, 10], - [12, 8, 8, 12, 8, 16, 24, 8], - [ 8, 7, 2, 13, 2, 4, 16, 7], - [ 8, 8, 0, 16, 0, 0, 16, 8]] - - var res_ab: array[8, array[8, int]] - gemm_strided( - 8, 8, 2, - 1, a[0][0].unsafeAddr, 2, 1, - b[0][0].unsafeAddr, 8, 1, - 0, res_ab[0][0].addr, 8, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - # from 
http://www.calcul.com/show/calculator/matrix-multiplication?matrix1=[[%222%22,%224%22,%223%22,%221%22,%223%22,%221%22,%223%22,%221%22],[%221%22,%222%22,%221%22,%221%22,%222%22,%220%22,%224%22,%223%22],[%222%22,%220%22,%220%22,%223%22,%220%22,%224%22,%224%22,%221%22],[%221%22,%221%22,%224%22,%220%22,%223%22,%221%22,%223%22,%220%22],[%223%22,%224%22,%221%22,%221%22,%224%22,%222%22,%223%22,%224%22],[%222%22,%224%22,%220%22,%222%22,%223%22,%223%22,%223%22,%224%22],[%223%22,%220%22,%220%22,%223%22,%221%22,%224%22,%223%22,%221%22],[%224%22,%223%22,%222%22,%224%22,%221%22,%220%22,%220%22,%220%22]]&matrix2=[[%222%22,%222%22,%220%22,%224%22,%220%22,%220%22,%224%22,%222%22],[%222%22,%220%22,%220%22,%221%22,%221%22,%221%22,%223%22,%221%22],[%220%22,%222%22,%222%22,%220%22,%222%22,%222%22,%223%22,%223%22],[%220%22,%220%22,%221%22,%220%22,%224%22,%222%22,%224%22,%221%22],[%220%22,%220%22,%221%22,%223%22,%224%22,%222%22,%224%22,%222%22],[%224%22,%223%22,%224%22,%221%22,%224%22,%224%22,%220%22,%223%22],[%223%22,%223%22,%220%22,%222%22,%221%22,%222%22,%223%22,%223%22],[%222%22,%221%22,%222%22,%221%22,%222%22,%224%22,%224%22,%221%22]]&operator=* - echo "\n## (N x N) * (N x N) with N multiple of block size" - - let a = [[2, 4, 3, 1, 3, 1, 3, 1], - [1, 2, 1, 1, 2, 0, 4, 3], - [2, 0, 0, 3, 0, 4, 4, 1], - [1, 1, 4, 0, 3, 1, 3, 0], - [3, 4, 1, 1, 4, 2, 3, 4], - [2, 4, 0, 2, 3, 3, 3, 4], - [3, 0, 0, 3, 1, 4, 3, 1], - [4, 3, 2, 4, 1, 0, 0, 0]] - - - let b = [[2, 2, 0, 4, 0, 0, 4, 2], - [2, 0, 0, 1, 1, 1, 3, 1], - [0, 2, 2, 0, 2, 2, 3, 3], - [0, 0, 1, 0, 4, 2, 4, 1], - [0, 0, 1, 3, 4, 2, 4, 2], - [4, 3, 4, 1, 4, 4, 0, 3], - [3, 3, 0, 2, 1, 2, 3, 3], - [2, 1, 2, 1, 2, 4, 4, 1]] - - let ab = [[27,23,16,29,35,32,58,37], - [24,19,11,23,26,30,49,27], - [34,29,21,21,34,34,36,32], - [17,22,15,21,28,25,40,33], - [39,27,23,40,45,46,72,41], - [41,26,25,34,47,48,65,38], - [33,28,22,26,37,34,41,33], - [14,12, 9,22,27,17,51,23]] - - var res_ab: array[8, array[8, int]] - gemm_strided( - 8, 8, 8, - 1, a[0][0].unsafeAddr, 8, 1, - b[0][0].unsafeAddr, 8, 1, - 0, res_ab[0][0].addr, 8, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" diff --git a/benchmarks/matmul/laser_gemm/gemm_packing.nim b/benchmarks/matmul/laser_gemm/gemm_packing.nim deleted file mode 100644 index ceed386..0000000 --- a/benchmarks/matmul/laser_gemm/gemm_packing.nim +++ /dev/null @@ -1,94 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -# Due to issue with "static MicroKernel" as parameter -# as of 0.19.9 we pass it as a generic param -# - 1. undeclared identifier mr/nr, when accessing ukernel -# - 2. object constructor needs and object type when workaround first issue with macro - -import - ../laser_utils/compiler_optim_hints, - ../laser_utils/align_unroller, - ../laser_gemm_backend/[gemm_utils, gemm_tiling] - -withCompilerOptimHints() - -# ############################################################ -# -# Packing A -# -# ############################################################ - -proc pack_A_mc_kc*[T; ukernel: static MicroKernel]( - packedA: ptr UncheckedArray[T], - mc, kc: int, - A: MatrixView[T]) = - ## Packs panel [kc, mc] into buffer à (size ~half-L2 cache) - ## Pads if needed - ## Note that A is of shape [M, K] so it is transposed. 
- ## - ## Concretely the outer dimension of packed matrices - ## is k so that C[i, j] = A[i, k] * B[k, j] - ## does not require strided access - let buffer{.restrict.} = assume_aligned packedA - const MR = ukernel.extract_mr() - let unroll_stop = mc.round_step_down(MR) - - # 1. Pack m matrices of size kc*mr, m = mc/mr - {.emit:""" - for (int i = 0; i < `unroll_stop`; i+=`MR`) - for (int k = 0; k < `kc`; k++) - for (int ii = 0; ii < `MR`; ii++) - `buffer`[i*`kc`+k*`MR`+ii] = `A`.buffer[(i+ii)*`A`.rowStride + k*`A`.colStride]; - """.} - - # 2. Process the tail - let remainder = mc - unroll_stop - if remainder > 0: - let offBuf = buffer + kc*unroll_stop - for k in 0 ..< kc: - for i in 0 ..< remainder: - offBuf[k*MR + i] = A[unroll_stop+i, k] - for i in remainder ..< MR: # Pad with 0 if packing over the edge - offBuf[k*MR + i] = 0.T - -# ############################################################ -# -# Packing B -# -# ############################################################ - -proc pack_B_kc_nc*[T; ukernel: static MicroKernel]( - packedB: ptr UncheckedArray[T], - kc, nc: int, - B: MatrixView[T]) = - ## Packs panel [kc, nc] for ~B (half-L1 cache) - ## Pads if needed - ## - ## Concretely the outer dimension of packed matrices - ## is k so that C[i, j] = A[i, k] * B[k, j] - ## does not require strided access - let buffer{.restrict.} = assume_aligned packedB - const NR = ukernel.extract_nr() - let unroll_stop = nc.round_step_down(NR) - - # 1. Pack n matrices of size kc*nr, n = nc/nr - {.emit:""" - #pragma omp parallel for - for (int j = 0; j < `unroll_stop`; j+=`NR`) - for (int k = 0; k < `kc`; k++) - for (int jj = 0; jj < `NR`; jj++) - `buffer`[j*`kc`+k*`NR`+jj] = `B`.buffer[k*`B`.rowStride + (j+jj)*`B`.colStride]; - """.} - - # 2. Process the tail - let remainder = nc - unroll_stop - if remainder > 0: - let offBuf = buffer + kc*unroll_stop - for k in 0 ..< kc: - for j in 0 ..< remainder: - offBuf[k*NR + j] = B[k, unroll_stop+j] - for j in remainder ..< NR: # Pad with 0 if packing over the edge - offBuf[k*NR + j] = 0.T diff --git a/benchmarks/matmul/laser_gemm/gemm_prepacked.nim b/benchmarks/matmul/laser_gemm/gemm_prepacked.nim deleted file mode 100644 index 65850bd..0000000 --- a/benchmarks/matmul/laser_gemm/gemm_prepacked.nim +++ /dev/null @@ -1,525 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ../../cpuinfo, - ../laser_utils/[compiler_optim_hints, openmp, align_unroller], - ../laser_utils/laser_gemm_backend/[ - gemm_tiling, gemm_utils, gemm_packing, - gemm_ukernel_dispatch, gemm - ] - -withCompilerOptimHints() - -# ############################################################ -# -# GEMM Prepacked Matrices A and B -# -# ############################################################ - -template dispatch( - return_void: static bool, - func_call: untyped): untyped {.dirty.} = - ## Warning: statements after dispatch are unreachable - template dispatch_opt(cpu_features: static CPUFeatureX86): untyped {.dirty.} = - ## Dispatch depending on detected CPU features. 
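 - # For example (informative note, assuming x86-64): on a CPU with FMA3 - # such as Haswell, the float32 ladder below selects dispatch_opt(x86_AVX_FMA) - # and instantiates the call with the AVX+FMA micro-kernel; on an SSE-only - # CPU it falls back to dispatch_opt(x86_SSE), and anything else lands on - # dispatch_opt(x86_Generic).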
- type A = T # workaround "Cannot evaluate at compile-time" - # c_unit_stride is not relevant here - const ukernel = cpu_features.x86_ukernel(A, c_unit_stride = false) - - when return_void: - func_call - return - else: - return func_call - - when defined(i386) or defined(amd64): - when T is float32: - if cpuinfo_has_x86_avx512f(): dispatch_opt(x86_AVX512) - elif cpuinfo_has_x86_fma3(): dispatch_opt(x86_AVX_FMA) - elif cpuinfo_has_x86_avx(): dispatch_opt(x86_AVX) - elif cpuinfo_has_x86_sse(): dispatch_opt(x86_SSE) - elif T is float64: - if cpuinfo_has_x86_avx512f(): dispatch_opt(x86_AVX512) - elif cpuinfo_has_x86_fma3(): dispatch_opt(x86_AVX_FMA) - elif cpuinfo_has_x86_avx(): dispatch_opt(x86_AVX) - elif cpuinfo_has_x86_sse2(): dispatch_opt(x86_SSE2) - elif T is int32 or T is uint32: - if cpuinfo_has_x86_avx512f(): dispatch_opt(x86_AVX512) - elif cpuinfo_has_x86_avx2(): dispatch_opt(x86_AVX2) - elif cpuinfo_has_x86_sse41(): dispatch_opt(x86_SSE4_1) - elif cpuinfo_has_x86_sse2(): dispatch_opt(x86_SSE2) - elif T is int64: - if cpuinfo_has_x86_avx512f(): dispatch_opt(x86_AVX512) - elif cpuinfo_has_x86_sse2(): dispatch_opt(x86_SSE2) - dispatch_opt(x86_Generic) - -# ############################################################ -# -# Packing B -# -# ############################################################ - -func gemm_prepackB_mem_required_impl*( - ukernel: static MicroKernel, - T: typedesc, - M, N, K: int): int = - - let (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - const NR = ukernel.nr - - let pc_num_iter = get_num_tiles(K, KC) - let upanelB_size = KC * round_step_up(NC, NR) - - result = T.sizeof * upanelB_size * pc_num_iter - -func gemm_prepackB_mem_required*( - T: type, - M, N, K: int): int = - ## Returns the amount of memory that needs to be preallocated - ## to pack matrix B. - - dispatch(return_void = false): - gemm_prepackB_mem_required_impl( - ukernel, T, M, N, K - ) - -proc gemm_prepackB_impl[T; ukernel: static MicroKernel]( - dst: ptr UncheckedArray[T], - M, N, K: int, - vB: MatrixView[T] - ) = - - let (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - let pc_num_iter = get_num_tiles(K, KC) - let upanelB_size = KC * round_step_up(NC, ukernel.nr) - for pcb in 0||(pc_num_iter-1): - let packB = dst + pcb * upanelB_size - prefetch(packB, Write, LowTemporalLocality) - - let pc = pcb * KC - let kc = min(K - pc, KC) - let kcncB = vB.stride(pc, 0) - - # Note: pack_B also creates a parallel region - # this will cause issues if omp_get_nested = 1 - pack_B_kc_nc[T, ukernel]( - packB, - kc, NC, kcncB - ) - -proc gemm_prepackB*[T]( - dst_packedB: ptr (T or UncheckedArray[T]), - M, N, K: int, - src_B: ptr T, rowStrideB, colStrideB: int) = - ## Prepack matrix B of shape KxN - ## and strides rowStrideB and colStrideB - ## for matrix multiplication. - ## dst_packedB must be 64-byte aligned. - ## - ## For optimal performance, packing is machine- and architecture-dependent, - ## i.e. it depends on detected features like AVX and the number of cores, - ## and may depend on your machine's cache sizes in the future. - ## The packed buffer is therefore unsafe to store or serialize. 
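 - ## - ## A hypothetical usage sketch (buffer allocation elided; `packedB`/`srcB` - ## are illustrative names): - ## let bytes = gemm_prepackB_mem_required(float32, M, N, K) - ## # allocate `bytes` bytes of 64-byte aligned storage as packedB, then: - ## gemm_prepackB(packedB, M, N, K, srcB, N, 1) # pack a row-major KxN B once - ## # packedB can now be reused across many gemm_packed calls sharing B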
- - doAssert (cast[int](dst_packedB) and 63) == 0, "The destination pointer must be 64-byte aligned" - - let vB = src_B.toMatrixView(rowStrideB, colStrideB) - let dst = cast[ptr UncheckedArray[T]](dst_packedB) - - dispatch(return_void = true): - gemm_prepackB_impl[T, ukernel]( - dst, - M, N, K, - vB - ) - -# ############################################################ - # - # Packing A - # - # ############################################################ - -func gemm_prepackA_mem_required_impl*( - ukernel: static MicroKernel, - T: typedesc, - M, N, K: int): int = - - let (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - const MR = ukernel.mr - - let pc_num_iter = get_num_tiles(K, KC) - let ic_num_iter = get_num_tiles(M, MC) - let upanelA_size = KC * round_step_up(MC, MR) - - result = T.sizeof * upanelA_size * pc_num_iter * ic_num_iter - -func gemm_prepackA_mem_required*( - T: typedesc, - M, N, K: int): int = - ## Returns the amount of memory that needs to be preallocated - ## to pack matrix A. - - dispatch(return_void = false): - gemm_prepackA_mem_required_impl( - ukernel, T, M, N, K - ) - -proc gemm_prepackA_impl[T; ukernel: static MicroKernel]( - dst: ptr UncheckedArray[T], - M, N, K: int, - vA: MatrixView[T] - ) = - - let (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - const MR = ukernel.mr - - let pc_num_iter = get_num_tiles(K, KC) - let ic_num_iter = get_num_tiles(M, MC) - let upanelA_size = KC * round_step_up(MC, MR) - - for pcb in 0||(pc_num_iter-1): - let pc = pcb * KC - let kc = min(K - pc, KC) - - for icb in 0 ..< ic_num_iter: - let packA = dst + pc*pc_num_iter + icb*upanelA_size - prefetch(packA, Write, LowTemporalLocality) - let ic = icb * MC - let mc = min(M-ic, MC) - - let mckcA = vA.stride(ic, pc) - pack_A_mc_kc[T, ukernel](packA, mc, kc, mckcA) - -proc gemm_prepackA*[T]( - dst_packedA: ptr (T or UncheckedArray[T]), - M, N, K: int, - src_A: ptr T, rowStrideA, colStrideA: int) = - ## Prepack matrix A of shape MxK - ## and strides rowStrideA and colStrideA - ## for matrix multiplication. - ## dst_packedA must be 64-byte aligned. - ## - ## For optimal performance, packing is machine- and architecture-dependent, - ## i.e. it depends on detected features like AVX and the number of cores, - ## and may depend on your machine's cache sizes in the future. - ## The packed buffer is therefore unsafe to store or serialize. - - doAssert (cast[int](dst_packedA) and 63) == 0, "The destination pointer must be 64-byte aligned" - - let vA = src_A.toMatrixView(rowStrideA, colStrideA) - let dst = cast[ptr UncheckedArray[T]](dst_packedA) - - dispatch(return_void = true): - gemm_prepackA_impl[T, ukernel]( - dst, - M, N, K, - vA - ) - -# ############################################################ - # - # Prepacked GEMM - # - # ############################################################ - -proc gemm_packed_impl[T]( - ukernel: static MicroKernel, - M, N, K: int, - alpha: T, packedA, packedB: ptr (T or UncheckedArray[T]), - beta: T, vC: MatrixView[T] - ) = - - withCompilerOptimHints() - - const - MR = ukernel.mr - NR = ukernel.nr - PT = ukernel.pt - - let - parallelize = M*N*K > PT*PT*PT - - (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - pc_num_iter = get_num_tiles(K, KC) - ic_num_iter = get_num_tiles(M, MC) - - upanelB_size = KC * round_step_up(NC, NR) - upanelA_size = KC * round_step_up(MC, MR) - - - # ###################################### - # 2. 
for pc = 0,...,k−1 in steps of kc - for pcb in 0 ..< pc_num_iter: - let packedB{.restrict.} = cast[ptr UncheckedArray[T]](packedB + pcb * upanelB_size) - let pc = pcb * KC - let kc = min(K - pc, KC) - - # First time writing to C, we scale it, otherwise accumulate - let beta = if pc == 0: beta else: 1.T - - omp_parallel_if(parallelize): - # #################################### - # 3. for ic = 0,...,m−1 in steps of mc - omp_for(icb, ic_num_iter, use_simd=false, nowait=true): - let packedA{.restrict.} = cast[ptr UncheckedArray[T]](packedA + icb * upanelA_size) - let ic = icb * MC - let mc = min(M-ic, MC) - - gebp_mkernel[T, ukernel]( - mc, NC, kc, - alpha, packedA, packedB, - beta, vc.stride(ic, 0) - ) - -proc gemm_packed*[T: SomeNumber]( - M, N, K: int, - alpha: T, - packedA: ptr (T or UncheckedArray[T]), - packedB: ptr (T or UncheckedArray[T]), - beta: T, - C: ptr (T or UncheckedArray[T]), - rowStrideC, colStrideC: int) = - - let vC = C.toMatrixView(rowStrideC, colStrideC) - - dispatch(return_void = true): - # TODO - dispatch specialization when C is unit strided - ukernel.gemm_packed_impl( - M, N, K, - alpha, packedA, packedB, - beta, vC - ) - -# ############################################################ -# -# Private tests -# -# ############################################################ - -when isMainModule: - - import - ../../tensor/[allocator, datatypes, initialization], - strformat - - proc toPtr*[T](t: Tensor[T]): ptr T = - cast[ptr T](t.unsafe_raw_data) - - proc `$`[T](t: Tensor[T]): string = - var tmp = newSeq[T](t.size) - copyMem(tmp[0].addr, cast[ptr T](t.unsafe_raw_data), t.size * sizeof(T)) - result = $tmp - - proc pack_and_test[M, N, K: static int; T]( - a: array[M, array[K, T]], - b: array[K, array[N, T]], - ab: array[M, array[N, T]] - ) = - echo "M: ", M - echo "N: ", N - echo "K: ", K - echo fmt"A [{M}x{K}] * B[{K}x{N}] -> C[{M}x{N}]" - let packedA_size = gemm_prepackA_mem_required(T, M, N, K) - var packA = newTensor[T](packedA_size) - gemm_prepackA( - packA.toPtr, - M, N, K, - a[0][0].unsafeAddr, - K, 1 - ) - # echo packA - - let packedB_size = gemm_prepackB_mem_required(T, M, N, K) - var packB = newTensor[T](packedB_size) - gemm_prepackB( - packB.toPtr, - M, N, K, - b[0][0].unsafeAddr, - N, 1 - ) - # echo packB - - var res_ab: array[M, array[N, T]] - gemm_packed( - M, N, K, - T(1), packA.toPtr, packB.toPtr, - T(0), res_ab[0][0].addr, N, 1 - ) - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - # Tests - block: - let a = [[1.0, 2, 3], - [4.0, 5, 6], - [7.0, 8, 9]] - - let b = [[1.0, 2, 3], - [4.0, 5, 6], - [7.0, 8, 9]] - - let ab = [[30.0, 36, 42], - [66.0, 81, 96], - [102.0, 126, 150]] - - pack_and_test(a, b, ab) - - block: - let a = [[1.0, 2, 3], - [1.0, 1, 1], - [1.0, 1, 1]] - - let b = [[1.0, 1], - [1.0, 1], - [1.0, 1]] - - let ab = [[6.0, 6], - [3.0, 3], - [3.0, 3]] - - pack_and_test(a, b, ab) - - block: - let a = [[1.0, 2, 3], - [4.0, 5, 6], - [7.0, 8, 9]] - - let b = [[1.0, 1], - [1.0, 1], - [1.0, 1]] - - let ab = [[ 6.0, 6], - [15.0, 15], - [24.0, 24]] - - pack_and_test(a, b, ab) - - block: - let a = [[1.0,2,3], - [4.0,5,6]] - - let b = [[7.0, 8], - [9.0, 10], - [11.0,12]] - - let ab = [[ 58.0, 64], - [139.0,154]] - - pack_and_test(a, b, ab) - - block: - # example from http://www.intmath.com/matrices-determinants/matrix-multiplication-examples.php - echo "\n## (M x K) * (K x N) with M < N" - let a = [[-2,-3,-1], - [ 3, 0, 4]] - let b = [[ 1, 5, 2,-1], - [-3, 0, 3, 4], - [ 6,-2, 7,-4]] - - let ab = [[ 1,-8,-20, -6], - [27, 7, 34,-19]] - - pack_and_test(a, 
b, ab) - - block: - # from http://www.calcul.com/show/calculator/matrix-multiplication_;5;5;5;5?matrix1=[[%225%22,%226%22,%225%22,%228%22],[%228%22,%222%22,%228%22,%228%22],[%220%22,%225%22,%224%22,%220%22],[%224%22,%220%22,%225%22,%226%22],[%224%22,%225%22,%220%22,%223%22]]&matrix2=[[%225%22,%223%22,%226%22,%220%22],[%225%22,%222%22,%223%22,%223%22],[%228%22,%228%22,%222%22,%220%22],[%227%22,%227%22,%220%22,%220%22]]&operator=* - echo "\n## (M x K) * (K x N) with M > N and M > block-size (4x4)" - let a = [[5,6,5,8], - [8,2,8,8], - [0,5,4,0], - [4,0,5,6], - [4,5,0,3]] - let b = [[5,3,6,0], - [5,2,3,3], - [8,8,2,0], - [7,7,0,0]] - - let ab = [[151,123,58,18], - [170,148,70, 6], - [ 57, 42,23,15], - [102, 94,34, 0], - [ 66, 43,39,15]] - - pack_and_test(a, b, ab) - - block: - let a = [[2, 4, 3, 1, 3, 1, 3, 1], - [4, 3, 2, 4, 1, 0, 0, 0]] - - - let b = [[2, 2], - [2, 1], - [0, 3], - [0, 1], - [0, 2], - [4, 3], - [3, 3], - [2, 1]] - - let ab = [[27,37], - [14,23]] - - pack_and_test(a, b, ab) - - block: - let a = [[2, 1], - [1, 3], - [2, 1], - [1, 0], - [3, 4], - [2, 4], - [3, 1], - [4, 0]] - - - let b = [[2, 2, 0, 4, 0, 0, 4, 2], - [2, 1, 2, 1, 2, 4, 4, 1]] - - let ab = [[ 6, 5, 2, 9, 2, 4, 12, 5], - [ 8, 5, 6, 7, 6, 12, 16, 5], - [ 6, 5, 2, 9, 2, 4, 12, 5], - [ 2, 2, 0, 4, 0, 0, 4, 2], - [14, 10, 8, 16, 8, 16, 28, 10], - [12, 8, 8, 12, 8, 16, 24, 8], - [ 8, 7, 2, 13, 2, 4, 16, 7], - [ 8, 8, 0, 16, 0, 0, 16, 8]] - - pack_and_test(a, b, ab) - - block: - # from http://www.calcul.com/show/calculator/matrix-multiplication?matrix1=[[%222%22,%224%22,%223%22,%221%22,%223%22,%221%22,%223%22,%221%22],[%221%22,%222%22,%221%22,%221%22,%222%22,%220%22,%224%22,%223%22],[%222%22,%220%22,%220%22,%223%22,%220%22,%224%22,%224%22,%221%22],[%221%22,%221%22,%224%22,%220%22,%223%22,%221%22,%223%22,%220%22],[%223%22,%224%22,%221%22,%221%22,%224%22,%222%22,%223%22,%224%22],[%222%22,%224%22,%220%22,%222%22,%223%22,%223%22,%223%22,%224%22],[%223%22,%220%22,%220%22,%223%22,%221%22,%224%22,%223%22,%221%22],[%224%22,%223%22,%222%22,%224%22,%221%22,%220%22,%220%22,%220%22]]&matrix2=[[%222%22,%222%22,%220%22,%224%22,%220%22,%220%22,%224%22,%222%22],[%222%22,%220%22,%220%22,%221%22,%221%22,%221%22,%223%22,%221%22],[%220%22,%222%22,%222%22,%220%22,%222%22,%222%22,%223%22,%223%22],[%220%22,%220%22,%221%22,%220%22,%224%22,%222%22,%224%22,%221%22],[%220%22,%220%22,%221%22,%223%22,%224%22,%222%22,%224%22,%222%22],[%224%22,%223%22,%224%22,%221%22,%224%22,%224%22,%220%22,%223%22],[%223%22,%223%22,%220%22,%222%22,%221%22,%222%22,%223%22,%223%22],[%222%22,%221%22,%222%22,%221%22,%222%22,%224%22,%224%22,%221%22]]&operator=* - echo "\n## (N x N) * (N x N) with N multiple of block size" - - let a = [[2, 4, 3, 1, 3, 1, 3, 1], - [1, 2, 1, 1, 2, 0, 4, 3], - [2, 0, 0, 3, 0, 4, 4, 1], - [1, 1, 4, 0, 3, 1, 3, 0], - [3, 4, 1, 1, 4, 2, 3, 4], - [2, 4, 0, 2, 3, 3, 3, 4], - [3, 0, 0, 3, 1, 4, 3, 1], - [4, 3, 2, 4, 1, 0, 0, 0]] - - - let b = [[2, 2, 0, 4, 0, 0, 4, 2], - [2, 0, 0, 1, 1, 1, 3, 1], - [0, 2, 2, 0, 2, 2, 3, 3], - [0, 0, 1, 0, 4, 2, 4, 1], - [0, 0, 1, 3, 4, 2, 4, 2], - [4, 3, 4, 1, 4, 4, 0, 3], - [3, 3, 0, 2, 1, 2, 3, 3], - [2, 1, 2, 1, 2, 4, 4, 1]] - - let ab = [[27,23,16,29,35,32,58,37], - [24,19,11,23,26,30,49,27], - [34,29,21,21,34,34,36,32], - [17,22,15,21,28,25,40,33], - [39,27,23,40,45,46,72,41], - [41,26,25,34,47,48,65,38], - [33,28,22,26,37,34,41,33], - [14,12, 9,22,27,17,51,23]] - - pack_and_test(a, b, ab) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_tiling.nim b/benchmarks/matmul/laser_gemm_backend/gemm_tiling.nim 
deleted file mode 100644 index 6d6c885..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_tiling.nim +++ /dev/null @@ -1,344 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -# ############################################################ -# -# Cache and register optimizations -# -# ############################################################ - -# Papers: -# [1] Anatomy of High-Performance Matrix Multiplication (Revised) -# Kazushige Goto, Robert A. Van de Geijn -# - http://www.cs.utexas.edu/~flame/pubs/GotoTOMS_revision.pdf -# -# [2] Anatomy of High-Performance Many-Threaded Matrix Multiplication -# Smith et al -# - http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf -# -# [3] Automating the Last-Mile for High Performance Dense Linear Algebra -# Veras et al -# - https://arxiv.org/pdf/1611.08035.pdf -# -# [4] GEMM: From Pure C to SSE Optimized Micro Kernels -# Michael Lehn -# - http://apfel.mathematik.uni-ulm.de/~lehn/sghpc/gemm/index.html -# -# Laser wiki - GEMM optimization resources -# - https://github.com/numforge/laser/wiki/GEMM-optimization-resources - -import - ../../cpuinfo, - ../laser_utils/[ - compiler_optim_hints, - memory, align_unroller, - ], - typetraits, macros, - ./gemm_utils - -# ############################################################ -# -# Microkernel (µkernel) -# -# ############################################################ - -# We have to take vectorisation into account -# so that the microkernel can be processed with vectorized intrinsics. -# -# Caux [mr, nr] must stay in registers. -# - mr ~= nr is optimal to amortize register load cost -# - some registers must be left to prefetch Ã and ~B (PackedA and PackedB) -# - nr >= (flops/cycle) / (bytes/cycle) * sizeof(element) -# -# For example Haswell is capable of -# - 32 single-precision FLOPs/cycle -# - 32 bytes/cycle store and 64 bytes/cycle load (store C, load A and B) -# -# so nr >= 32/32 * 4 -# For that number of FLOPs it must schedule -# 2xFMAs, consuming 16 single-precision floats, -# so mr*nr >= 16 - -type - MicroKernel* = object - mr*, nr*: int - cpu_simd*: CPUFeatureX86 - nb_scalars*: int # Ideally MicroKernel should be generic over T - nb_vecs_nr*: int - c_unit_stride*: bool # We can use SIMD for the epilogue if C has a unit stride - pt*: int # Parallelization threshold - - # TODO: ARM support - # - https://github.com/nim-lang/Nim/issues/9679 - # - https://github.com/nim-lang/Nim/issues/9678 - - CPUFeatureX86* = enum - x86_Generic, - x86_SSE, - x86_SSE2, - x86_SSE4_1, - x86_AVX, - x86_AVX_FMA, - x86_AVX2, - x86_AVX512 - # Note that Skylake-SP Xeon Bronze, Silver and Gold 5XXX - # only have a single AVX512 port and AVX2 can be faster - # due to AVX512 downclocking - - X86_FeatureMap = array[CPUFeatureX86, int] - -const X86_vecwidth_float: X86_FeatureMap = [ - x86_Generic: 1, - x86_SSE: 128 div 8, - x86_SSE2: 128 div 8, - x86_SSE4_1: 128 div 8, - x86_AVX: 256 div 8, - x86_AVX_FMA: 256 div 8, - x86_AVX2: 256 div 8, - x86_AVX512: 512 div 8 -] - -const X86_vecwidth_int: X86_FeatureMap = [ - x86_Generic: 1, - x86_SSE: 1, - x86_SSE2: 128 div 8, - x86_SSE4_1: 128 div 8, - x86_AVX: 128 div 8, # AVX1 has no 256-bit integer ops, not even addition - x86_AVX_FMA: 128 div 8, - x86_AVX2: 256 div 8, - x86_AVX512: 512 div 8 -] - -# Register constraints and micro-kernel tuning -# - To issue 2xFMAs in parallel we need 
to use 2x SIMD registers -# - We want to hold C of size MR * NR completely in SIMD registers as well -# as each value is reused k times during accumulation C[i, j] += A[i, k] * B[k, j] -# - We should have enough SIMD registers left to hold -# the corresponding sections of A and B (at least 4, 2xA and 2xB for FMAs) -# -# On x86-64 X SIMD registers that can issue 2xFMAs per cycle: -# - NbVecs is 2 minimum -# - RegsPerVec = 2 * NbVecs => 4 minimum (for A and for B) -# - NR = NbVecs * NbScalarsPerSIMD -# - C: MR*NR and uses MR*NbVecs SIMD registers -# - MR*NbVecs + RegsPerVec <= X -# -> MR*NbVecs + 2 * NbVecs <= X -# -> (MR+2) * NbVecs <= X -# -# Some solutions: -# - AVX with 16 registers: -# - MR = 6, NbVecs = 2 -# FP32: 8xFP32 per SIMD --> NR = 2x8 -# ukernel = 6x16 -# FP64, ukernel = 6x8 -# - MR = 2, NbVecs = 4 -# FP32: 8xFP32 per SIMD --> NR = 4x8 -# ukernel = 2x32 -# FP64, ukernel = 2x16 -# - AVX512 with 32 registers -# - MR = 6, NbVecs = 4 -# FP32 ukernel = 6x64 -# FP64 ukernel = 6x32 -# - MR = 2, NbVecs = 8 -# FP32 ukernel = 2x128 -# FP64 ukernel = 2x64 -# - MR = 14, NbVecs = 2 -# FP32 ukernel = 14x32 -# FP64 ukernel = 14x16 -when defined(amd64): # 64-bit - # MR configuration - rows of à in micro kernel - # 16 General purpose registers - const X86_regs: X86_FeatureMap = [ - x86_Generic: 2, - x86_SSE: 6, - x86_SSE2: 6, - x86_SSE4_1: 6, - x86_AVX: 6, - x86_AVX_FMA: 6, - x86_AVX2: 6, - x86_AVX512: 14 - ] - - # NR configuration - Nb of ~B SIMD vectors - # We will also keep as many rows of à in SIMD registers at the same time - const NbVecs: X86_FeatureMap = [ - x86_Generic: 1, - x86_SSE: 2, # 16 XMM registers - x86_SSE2: 2, - x86_SSE4_1: 2, - x86_AVX: 2, # 16 YMM registers - x86_AVX_FMA: 2, - x86_AVX2: 2, - x86_AVX512: 2 # 32 ZMM registers - ] -else: # 32-bit - # MR configuration - registers for the rows of à - # 8 General purpose registers - const X86_regs: X86_FeatureMap = [ - x86_Generic: 2, - x86_SSE: 2, - x86_SSE2: 2, - x86_SSE4_1: 2, - x86_AVX: 2, - x86_AVX_FMA: 2, - x86_AVX2: 2, - x86_AVX512: 2 - ] - - # NR configuration - Nb of ~B SIMD vectors - const NbVecs: X86_FeatureMap = [ - x86_Generic: 1, - x86_SSE: 2, # 8 XMM registers - x86_SSE2: 2, - x86_SSE4_1: 2, - x86_AVX: 2, # 8 YMM registers - x86_AVX_FMA: 2, - x86_AVX2: 2, - x86_AVX512: 2 # 8 ZMM registers - ] - -func x86_ukernel*(cpu: CPUFeatureX86, T: typedesc, c_unit_stride: bool): MicroKernel = - result.cpu_simd = cpu - result.c_unit_stride = c_unit_stride - result.pt = 128 - when T is SomeFloat: - result.nb_scalars = max(1, X86_vecwidth_float[cpu] div T.sizeof) - elif T is SomeInteger: # Integers - result.nb_scalars = max(1, X86_vecwidth_int[cpu] div T.sizeof) - else: - {.error: "Unsupported type: " & T.type.name.} - - # The inner microkernel loop does: - # AB[m][n] = A[m] * B[n] - # So n should be the vector size - # if most matrices are row-Major. - # This avoids dealing with transpose - # in the inner loop and untranspose in the epilogue - - result.mr = X86_regs[cpu] # 2~6 registers for the rows of à - result.nb_vecs_nr = NbVecs[cpu] # SIMD vectors of B - result.nr = result.nb_vecs_nr * result.nb_scalars - -############################################# -# Workaround "undeclared identifier mr or nr" -# for some reason the compiler cannot access fields in -# the static MicroKernel. 
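 - -# Worked example (informative, derived from the tables above on x86-64): -# x86_AVX_FMA.x86_ukernel(float32, c_unit_stride = true) gives -# nb_scalars = 32 bytes / sizeof(float32) = 8 floats per YMM vector -# mr = X86_regs[x86_AVX_FMA] = 6 -# nr = NbVecs[x86_AVX_FMA] * nb_scalars = 2 * 8 = 16 -# i.e. the 6x16 float32 AVX micro-kernel listed among the solutions above.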
- -macro extract_mr*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.mr -macro extract_nr*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.nr -macro extract_cpu_simd*(ukernel: static MicroKernel): untyped = - let simd = ukernel.cpu_simd - result = quote do: CPUFeatureX86(`simd`) -macro extract_nb_scalars*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.nb_scalars -macro extract_nb_vecs_nr*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.nb_vecs_nr -macro extract_c_unit_stride*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.c_unit_stride -macro extract_pt*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.pt - - -# ############################################################ -# -# Loop tiling -# -# ############################################################ - -# multithreading info in [2] and https://github.com/flame/blis/blob/master/docs/Multithreading.md - -type Tiles*[T] = ref object - a*: ptr UncheckedArray[T] - b*: ptr UncheckedArray[T] - mc*, nc*, kc*: int - - # Multithreaded panels - ic_num_tasks*: int # For private L1-L2 and shared L3 - upanelA_size*: int # Each thread uses a different upanel of A - - # Allocation data - a_alloc_mem: pointer - b_alloc_mem: pointer - # The Tiles data structure takes 64-byte = 1 cache-line - - -proc deallocTiles[T](tiles: Tiles[T]) = - if not tiles.a_alloc_mem.isNil: - deallocShared tiles.a_alloc_mem - if not tiles.b_alloc_mem.isNil: - deallocShared tiles.b_alloc_mem - -func get_num_tiles*(dim_size, tile_size: int): int {.inline.} = - ## Get the number of tiles along a dimension depending on the tile size - (dim_size + tile_size - 1) div tile_size - -func partitionMNK*( - ukernel: static MicroKernel, - T: typedesc, - M, N, K: Natural, - ): tuple[mc, nc, kc: int] = - - result.nc = N # We don't partition over N - - # ## Panel sizes - # - TLB constraint - # TA ̃ + 2(TBj + TCj)≤T - # Note: optimizing the similar problem mckc/(2mc+2kc) - # under the constraint that mckc ≤ K is the problem - # of maximizing the area of a rectangle - # while minimizing the perimeter, - # - # Goto paper [1] section 6.3: choosing kc - # - kc should be as large as possible to amortize the mr*nr updates of Cj - # - Elements from Bj [kc, nr] must remain in L1 cache. 
- # - kc * nr should occupy less than half the L1 cache - # so that à and Caux do not evict element of Bj - # - à [kc, mc] should occupy - # a considerable fraction of the L2 cache - # In our experience optimal choice is so that "kc" float64 occupy half a page - # -> a page is 4096 bytes = 512 float64 -> half a page = 256 - - # Goto paper [1] section 6.3: choosing mc - # - mc*kc should fill a considerable part of (1) the memory addressable - # by the TLB and (2) the L2 cache - # In practice mc is chosen so that A occupies about half the smaller of (1) and (2) - - - # TODO: heuristics to compute the size - result.mc = min( 768 div T.sizeof, M) - result.kc = min(2048 div T.sizeof, K) - -proc newTiles*( - ukernel: static MicroKernel, - T: typedesc, - M, N, K: Natural, - ): Tiles[T] = - # BLIS paper [2] section II Figure 2: - # - kc * nr in L1 cache µkernel - # - mc * kc in L2 cache à - # - kc * nc in L3 cache ~B (no L3 in Xeon Phi ¯\_(ツ)_/¯) - new result, deallocTiles[T] - const - nr = ukernel.nr - mr = ukernel.mr - - (result.mc, result.nc, result.kc) = ukernel.partitionMNK(T, M, N, K) - - # Parallel config - # Ic loop parallel means that each thread will share a panel B and pack a different A - result.ic_num_tasks = get_num_tiles(M, result.mc) - - # Packing - # During packing the max size is unroll_stop*kc+kc*LR, LR = MR or NR - result.upanelA_size = result.kc*round_step_up(result.mc, mr) - let bufA_size = T.sizeof * result.upanelA_size * result.ic_num_tasks - let bufB_size = T.sizeof * result.kc*round_step_up(result.nc, nr) - - result.a_alloc_mem = allocShared(bufA_size + 63) - result.b_alloc_mem = allocShared(bufB_size + 63) - result.a = assume_aligned align_raw_data(T, result.a_alloc_mem) - result.b = assume_aligned align_raw_data(T, result.b_alloc_mem) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx.nim deleted file mode 100644 index b074d74..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx.nim +++ /dev/null @@ -1,44 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. 
- -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -template float32x8_muladd_unfused(a, b, c: m256): m256 = - mm256_add_ps(mm256_mul_ps(a, b), c) - -template float64x4_muladd_unfused(a, b, c: m256d): m256d = - mm256_add_pd(mm256_mul_pd(a, b), c) - -ukernel_generator( - x86_AVX, - typ = float32, - vectype = m256, - nb_scalars = 8, - simd_setZero = mm256_setzero_ps, - simd_broadcast_value = mm256_set1_ps, - simd_load_aligned = mm256_load_ps, - simd_load_unaligned = mm256_loadu_ps, - simd_store_unaligned = mm256_storeu_ps, - simd_mul = mm256_mul_ps, - simd_add = mm256_add_ps, - simd_fma = float32x8_muladd_unfused - ) - -ukernel_generator( - x86_AVX, - typ = float64, - vectype = m256d, - nb_scalars = 4, - simd_setZero = mm256_setzero_pd, - simd_broadcast_value = mm256_set1_pd, - simd_load_aligned = mm256_load_pd, - simd_load_unaligned = mm256_loadu_pd, - simd_store_unaligned = mm256_storeu_pd, - simd_mul = mm256_mul_pd, - simd_add = mm256_add_pd, - simd_fma = float64x4_muladd_unfused - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx2.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx2.nim deleted file mode 100644 index d1d7b15..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx2.nim +++ /dev/null @@ -1,35 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -template int32x8_muladd_unfused_avx2(a, b, c: m256i): m256i = - mm256_add_epi32(mm256_mullo_epi32(a, b), c) - -template int32x8_loada(mem_addr: ptr int32): m256i = - mm256_load_si256(cast[ptr m256i](mem_addr)) - -template int32x8_loadu(mem_addr: ptr int32): m256i = - mm256_loadu_si256(cast[ptr m256i](mem_addr)) - -template int32x8_storeu(mem_addr: ptr int32, a: m256i) = - mm256_storeu_si256(cast[ptr m256i](mem_addr), a) - -ukernel_generator( - x86_AVX2, - typ = int32, - vectype = m256i, - nb_scalars = 8, - simd_setZero = mm256_setzero_si256, - simd_broadcast_value = mm256_set1_epi32, - simd_load_aligned = int32x8_loada, - simd_load_unaligned = int32x8_loadu, - simd_store_unaligned = int32x8_storeu, - simd_mul = mm256_mullo_epi32, - simd_add = mm256_add_epi32, - simd_fma = int32x8_muladd_unfused_avx2 - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx512.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx512.nim deleted file mode 100644 index d52beab..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx512.nim +++ /dev/null @@ -1,74 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. 
- -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -ukernel_generator( - x86_AVX512, - typ = float32, - vectype = m512, - nb_scalars = 16, - simd_setZero = mm512_setzero_ps, - simd_broadcast_value = mm512_set1_ps, - simd_load_aligned = mm512_load_ps, - simd_load_unaligned = mm512_loadu_ps, - simd_store_unaligned = mm512_storeu_ps, - simd_mul = mm512_mul_ps, - simd_add = mm512_add_ps, - simd_fma = mm512_fmadd_ps - ) - -ukernel_generator( - x86_AVX512, - typ = float64, - vectype = m512d, - nb_scalars = 8, - simd_setZero = mm512_setzero_pd, - simd_broadcast_value = mm512_set1_pd, - simd_load_aligned = mm512_load_pd, - simd_load_unaligned = mm512_loadu_pd, - simd_store_unaligned = mm512_storeu_pd, - simd_mul = mm512_mul_pd, - simd_add = mm512_add_pd, - simd_fma = mm512_fmadd_pd - ) - -template int32x16_muladd_unfused_avx512(a, b, c: m512i): m512i = - mm512_add_epi32(mm512_mullo_epi32(a, b), c) - -ukernel_generator( - x86_AVX512, - typ = int32, - vectype = m512i, - nb_scalars = 16, - simd_setZero = mm512_setzero_si512, - simd_broadcast_value = mm512_set1_epi32, - simd_load_aligned = mm512_load_si512, - simd_load_unaligned = mm512_loadu_si512, - simd_store_unaligned = mm512_storeu_si512, - simd_mul = mm512_mullo_epi32, - simd_add = mm512_add_epi32, - simd_fma = int32x16_muladd_unfused_avx512 - ) - -template int64x8_muladd_unfused_avx512(a, b, c: m512i): m512i = - mm512_add_epi64(mm512_mullo_epi64(a, b), c) - -ukernel_generator( - x86_AVX512, - typ = int64, - vectype = m512i, - nb_scalars = 8, - simd_setZero = mm512_setzero_si512, - simd_broadcast_value = mm512_set1_epi64, - simd_load_aligned = mm512_load_si512, - simd_load_unaligned = mm512_loadu_si512, - simd_store_unaligned = mm512_storeu_si512, - simd_mul = mm512_mullo_epi64, - simd_add = mm512_add_epi64, - simd_fma = int64x8_muladd_unfused_avx512 - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx_fma.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx_fma.nim deleted file mode 100644 index 112a040..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx_fma.nim +++ /dev/null @@ -1,38 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. 
- -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -ukernel_generator( - x86_AVX_FMA, - typ = float32, - vectype = m256, - nb_scalars = 8, - simd_setZero = mm256_setzero_ps, - simd_broadcast_value = mm256_set1_ps, - simd_load_aligned = mm256_load_ps, - simd_load_unaligned = mm256_loadu_ps, - simd_store_unaligned = mm256_storeu_ps, - simd_mul = mm256_mul_ps, - simd_add = mm256_add_ps, - simd_fma = mm256_fmadd_ps - ) - -ukernel_generator( - x86_AVX_FMA, - typ = float64, - vectype = m256d, - nb_scalars = 4, - simd_setZero = mm256_setzero_pd, - simd_broadcast_value = mm256_set1_pd, - simd_load_aligned = mm256_load_pd, - simd_load_unaligned = mm256_loadu_pd, - simd_store_unaligned = mm256_storeu_pd, - simd_mul = mm256_mul_pd, - simd_add = mm256_add_pd, - simd_fma = mm256_fmadd_pd, - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_dispatch.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_dispatch.nim deleted file mode 100644 index 7499b93..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_dispatch.nim +++ /dev/null @@ -1,125 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - macros, - ../laser_utils/compiler_optim_hints, - ./gemm_tiling, ./gemm_utils, - ./gemm_ukernel_generic, - ./gemm_ukernel_sse, - ./gemm_ukernel_sse2, - ./gemm_ukernel_sse4_1, - ./gemm_ukernel_avx, - ./gemm_ukernel_avx_fma, - ./gemm_ukernel_avx2, - ./gemm_ukernel_avx512 - -{.experimental: "dynamicBindSym".} - -# ############################################################ -# -# Dispatch with runtime cpu detection -# -# ############################################################ - -template dispatch_common {.dirty.} = - let simd = ukernel.cpu_simd - let MR = ukernel.mr - let nb_scalars = ukernel.nb_scalars - - result = newStmtList() - - # 1. Prefetch packedB (used in microkernel) - # and C (used in epilogue update) - result.add quote do: - prefetch(`packedB`, Read, LowTemporalLocality) - prefetch(`packedB` + `nb_scalars`, Read, LowTemporalLocality) - for i in 0 ..< `MR`: - prefetch(`vC`[i, 0].addr, Write, HighTemporalLocality) - - # 2. Dispatch according to type and SIMD support - let symT = getTypeInst(alpha) - - -macro dispatch_general( - ukernel: static MicroKernel, - kc: int, - alpha: typed, packedA, packedB: ptr UncheckedArray[typed], - beta: typed, vC: MatrixView[typed] - ): untyped = - - dispatch_common() - - # 2.1. No SIMD case - if simd == x86_Generic: - result.add quote do: - gebb_ukernel_fallback[`symT`, ukernel]( # Hack: ukernel is generic from the calling proc - `kc`, - `alpha`, `packedA`, `packedB`, - `beta`, `vC` - ) - return - - # 2.2. 
SIMD case - let simdTag = $simd - let ukernel_name = bindSym("gebb_ukernel_" & $symT & "_" & simdTag) - result.add quote do: - `ukernel_name`[ukernel]( # Hack: ukernel is generic from the calling proc - `kc`, - `alpha`, `packedA`, `packedB`, - `beta`, `vC` - ) - -proc gebb_ukernel*[T; ukernel: static MicroKernel]( - kc: int, - alpha: T, packedA, packedB: ptr UncheckedArray[T], - beta: T, vC: MatrixView[T] - ){.inline.} = - - ukernel.dispatch_general(kc, alpha, packedA, packedB, beta, vC) - - -# ############################################################ -# -# Exported proc -# -# ############################################################ - -macro dispatch_edge( - ukernel: static MicroKernel, - mr, nr, kc: int, - alpha: typed, packedA, packedB: ptr UncheckedArray[typed], - beta: typed, vC: MatrixView[typed] - ): untyped = - - dispatch_common() - - # 2.1. No SIMD case - if simd == x86_Generic: - result.add quote do: - gebb_ukernel_edge_fallback[`symT`, ukernel]( # Hack: ukernel is generic from the calling proc - `mr`, `nr`, `kc`, - `alpha`, `packedA`, `packedB`, - `beta`, `vC` - ) - return - - # 2.2. SIMD case - let simdTag = $simd - let ukernel_name = bindSym("gebb_ukernel_edge_" & $symT & "_" & simdTag) - result.add quote do: - `ukernel_name`[ukernel]( # Hack: ukernel is generic from the calling proc - `mr`, `nr`, `kc`, - `alpha`, `packedA`, `packedB`, - `beta`, `vC` - ) - -proc gebb_ukernel_edge*[T; ukernel: static MicroKernel]( - mr, nr, kc: int, - alpha: T, packedA, packedB: ptr UncheckedArray[T], - beta: T, vC: MatrixView[T] - ){.inline.} = - - ukernel.dispatch_edge(mr, nr, kc, alpha, packedA, packedB, beta, vC) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generator.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generator.nim deleted file mode 100644 index f15c6a5..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generator.nim +++ /dev/null @@ -1,250 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ../laser_utils/compiler_optim_hints, - ../laser_utils/simd, - ./gemm_tiling, ./gemm_utils, - ./gemm_ukernel_generic, - macros - -# ############################################################ -# -# SIMD implementation generator -# -# ############################################################ - -# Macro ukernel_generator should be invoked in different files so that specific -# flags like "-mavx -mfma" are isolated. 
-# Add the corresponding compilation flags to "nim.cfg" - -# ############################################################# - -template ukernel_simd_proc(ukernel_name, epilogue_name: NimNode, edge: bool) {.dirty.} = - if edge: - result.add quote do: - proc `ukernel_name`*[ukernel: static MicroKernel]( - mr, nr, kc: int, - alpha: `T`, packedA, packedB: ptr UncheckedArray[`T`], - beta: `T`, vC: MatrixView[`T`] - ) = - - let AB{.align_variable.} = ukernel_simd_impl( - ukernel, `V`, packedA, packedB, kc, - `simd_setZero`, `simd_load_aligned`, `simd_broadcast_value`, `simd_fma` - ) - const - is_c_unit_stride = ukernel.extract_c_unit_stride() - MR = ukernel.extract_mr() - NR = ukernel.extract_nr() - - gebb_ukernel_edge_epilogue( - alpha, to_ptr(AB, MR, NR, `T`), - beta, vC, mr, nr - ) - else: - result.add quote do: - proc `ukernel_name`*[ukernel: static MicroKernel]( - kc: int, - alpha: `T`, packedA, packedB: ptr UncheckedArray[`T`], - beta: `T`, vC: MatrixView[`T`] - ) = - let AB{.align_variable.} = ukernel_simd_impl( - ukernel, `V`, packedA, packedB, kc, - `simd_setZero`, `simd_load_aligned`, `simd_broadcast_value`, `simd_fma` - ) - const - is_c_unit_stride = ukernel.extract_c_unit_stride() - MR = ukernel.extract_mr() - NR = ukernel.extract_nr() - - # when is_c_unit_stride: - # `epilogue_name`(alpha, AB, beta, vC) - # else: - gebb_ukernel_epilogue_fallback( - alpha, to_ptr(AB, MR, NR, `T`), - beta, vC) - -# ############################################################# - -template epilogue() {.dirty.} = - result.add quote do: - proc `epilogue_name`[MR, NbVecs: static int]( - alpha: `T`, AB: array[MR, array[NbVecs, `V`]], - beta: `T`, vC: MatrixView[`T`] - ) = - template C(i,j: int): untyped {.dirty.} = - vC.buffer[i*vC.rowStride + j*`nb_scalars`] - - if beta == 0.`T`: - for i in 0 ..< MR: - for j in 0 ..< NbVecs: - `simd_store_unaligned`(C(i,j).addr, `simd_setZero`()) - elif beta != 1.`T`: - let beta_vec = `simd_broadcast_value`(beta) - for i in 0 ..< MR: - for j in 0 ..< NbVecs: - `simd_store_unaligned`(C(i,j).addr, `simd_mul`(beta_vec, C(i,j).addr.`simd_load_unaligned`)) - - if alpha == 1.`T`: - for i in 0 ..< MR: - for j in 0 ..< NbVecs: - `simd_store_unaligned`(C(i,j).addr, `simd_add`(AB[i][j], C(i,j).addr.`simd_load_unaligned`)) - else: - let alpha_vec = `simd_broadcast_value`(alpha) - for i in 0 ..< MR: - for j in 0 ..< NbVecs: - `simd_store_unaligned`(C(i,j).addr, `simd_fma`(alpha_vec, AB[i][j], C(i,j).addr.`simd_load_unaligned`)) - -# ############################################################# - -macro ukernel_generator*( - simd: static CPUFeatureX86, - typ: untyped, - vectype: untyped, - nb_scalars: static int, - simd_setZero: untyped, - simd_broadcast_value: untyped, - simd_load_aligned: untyped, - simd_load_unaligned: untyped, - simd_store_unaligned: untyped, - simd_mul: untyped, - simd_add: untyped, - simd_fma: untyped, - ): untyped = - - let T = newIdentNode($typ) - let V = newIdentNode($vectype) - let epilogue_name = newIdentNode("gebb_ukernel_epilogue_" & $T & "_" & $simd) - result = newStmtList() - - # 1. Generate the epilogue function - epilogue() - - # 2. 
Generate the microkernels for the general and edge cases
-  block:
-    let ukernel_name = newIdentNode("gebb_ukernel_" & $T & "_" & $simd)
-    ukernel_simd_proc(ukernel_name, epilogue_name, edge = false)
-  block:
-    let ukernel_name = newIdentNode("gebb_ukernel_edge_" & $T & "_" & $simd)
-    ukernel_simd_proc(ukernel_name, epilogue_name, edge = true)
-
-# ############################################################
-#
-#             Actual SIMD implementation
-#
-# ############################################################
-
-macro ukernel_simd_impl*(
-      ukernel: static MicroKernel, V: untyped, A, B: untyped, kc: int,
-      simd_setZero, simd_load_aligned, simd_broadcast_value, simd_fma: untyped
-    ): untyped =
-
-
-  let MR = ukernel.mr
-  let NR = ukernel.nr
-
-  if false: # Debug implementation
-    result = quote do:
-      var AB{.align_variable.}: array[`MR`, array[`NR`, float64]]
-      var A {.restrict.} = assume_aligned packedA # [kc, mc] by chunks of mr
-      var B {.restrict.} = assume_aligned packedB # [kc, nc] by chunks of nr
-
-      for k in 0 ..< kc:
-        prefetch(B[(k+1)*`NR`].addr, Read, LowTemporalLocality)
-        for i in 0 ..< `MR`:
-          for j in 0 ..< `NR`:
-            AB[i][j] += A[k*`MR`+i] * B[k*`NR`+j]
-      AB
-
-  else: # Vectorized implementation
-    result = newStmtList()
-
-    ## ukernel config
-    let
-      MR = ukernel.mr
-      NR = ukernel.nr
-      NbVecs = ukernel.nb_vecs_nr # == NR div NbScalars
-      NbScalars = ukernel.nb_scalars
-
-    ## Registers
-    # We keep all C in registers MR*NR size occupying MR*NbVecs
-    # We keep NbVecs slivers of A and B for C updates
-    var
-      rA: seq[NimNode]           # array[NbVecs, V]
-      rB: seq[NimNode]           # array[NbVecs, V]
-      rAB = nnkBracket.newTree() # array[MR, array[NbVecs, V]]
-    for jj in 0 ..< NbVecs:
-      rA.add genSym(nskVar, "A" & $jj)
-      rB.add genSym(nskVar, "B" & $jj)
-    for i in 0 ..< MR:
-      var rABi = nnkBracket.newTree()
-      for j in 0 ..< NbVecs:
-        rABi.add genSym(nskVar, "AB" & $i & "__" & $j)
-      rAB.add rABi
-
-    ## Declare
-    var declBody = newStmtList()
-    for a in rA:
-      declBody.add quote do:
-        var `a`{.noinit.}: `V`
-    for b in rB:
-      declBody.add quote do:
-        var `b`{.noinit.}: `V`
-    for i in 0 ..< MR:
-      for j in 0 ..< NbVecs:
-        let ab = rAB[i][j]
-        declBody.add quote do:
-          var `ab` = `simd_setZero`()
-
-    let k = genSym(nskForVar)
-
-    ## Prefetch
-    var prefetchBody = newStmtList()
-    for jj in 0 ..< NbVecs:
-      prefetchBody.add quote do:
-        prefetch(`B`[(`k`+1)*`NR`+`jj`*`NbScalars`].addr, Read, LowTemporalLocality)
-
-    ## Load
-    var loadBody = newStmtList()
-    for jj in 0 ..< NbVecs:
-      let b = rB[jj]
-      loadBody.add quote do:
-        `b` = `simd_load_aligned`(`B`[`k`*`NR`+`jj`*`NbScalars`].addr)
-
-    ## Interleaved broadcast and FMA
-    var bcast_fma = newStmtList()
-    block:
-      let a0 = rA[0]
-      bcast_fma.add quote do:
-        `a0` = `simd_broadcast_value`(`A`[`k`*`MR`])
-
-    for i in 0 ..< MR:
-      # broadcast next iteration
-      let next_register_idx = (i+1) mod NbVecs
-      let a_next = rA[next_register_idx]
-      bcast_fma.add quote do:
-        # At the edge: `i`+1 = MR so equivalent to loading A[(k+1)*MR]
-        `a_next` = `simd_broadcast_value`(`A`[`k`*`MR`+(`i`+1)])
-
-      # select the register holding the current broadcast of A
-      let a = rA[i mod NbVecs]
-
-      # Do FMA on the current one
-      for jj in 0 ..< NbVecs:
-        let b = rB[jj]
-        let AB = rAB[i][jj]
-        bcast_fma.add quote do:
-          `AB` = `simd_fma`(`a`, `b`, `AB`)
-
-    ## Assemble:
-    result = quote do:
-      `declBody`
-      for `k` in 0 ..< `kc`:
-        `loadBody`
-        `prefetchBody`
-        `bcast_fma`
-      ## Write registers to a MR/NR array
-      `rAB`
diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generic.nim 
b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generic.nim
deleted file mode 100644
index ad13189..0000000
--- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generic.nim
+++ /dev/null
@@ -1,138 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-# Generic microkernel for matrix multiplication
-
-import
-  ../../cpuinfo,
-  ../laser_utils/compiler_optim_hints,
-  ./gemm_tiling, ./gemm_utils,
-  macros
-
-withCompilerOptimHints()
-
-# ############################################################
-#
-#          Generic GEBB microkernel implementation
-#
-# ############################################################
-
-template ukernel_generic_impl*(){.dirty.} =
-  const
-    MR = ukernel.extract_mr()
-    NR = ukernel.extract_nr()
-    simd = ukernel.extract_cpu_simd
-
-  var AB{.align_variable.}: array[MR, array[NR, T]]
-  var A {.restrict.} = assume_aligned packedA # [kc, mc] by chunks of mr
-  var B {.restrict.} = assume_aligned packedB # [kc, nc] by chunks of nr
-
-  for k in 0 ..< kc:
-    prefetch(B[(k+1)*NR].addr, Read, LowTemporalLocality)
-    for i in 0 ..< MR:
-      for j in `||`(0, NR-1, "simd"):
-        AB[i][j] += A[k*MR+i] * B[k*NR+j]
-
-# ############################################################
-#
-#                 Fallback Generic version
-#
-# ############################################################
-#
-# Cases
-# 1. C *= β, starting default
-# 2. C =  AB, if β = 0 and α = 1
-# 3. C = αAB, if β = 0 and α ≠ 1
-# 4. C +=  AB, if α = 1
-# 5. C += αAB, if α ≠ 1
-#
-# TODO: Fused operations like relu/sigmoid/tanh
-#       should be done here as well
-
-proc gebb_ukernel_epilogue_fallback*[MR, NR: static int, T](
-      alpha: T, AB: ptr array[MR, array[NR, T]],
-      beta: T, vC: MatrixView[T]
-    ){.inline.} =
-
-  let pAB{.restrict.} = assume_aligned cast[ptr array[MR, array[NR, T]]](AB[0][0].unsafeAddr)
-
-  if beta == 0.T:
-    for i in 0 ..< MR:
-      for j in 0 ..< NR:
-        vC[i, j] = 0.T
-  elif beta != 1.T:          # C *= β
-    for i in 0 ..< MR:
-      for j in 0 ..< NR:
-        vC[i, j] *= beta
-
-  if alpha == 1.T:           # C += AB
-    for i in 0 ..< MR:
-      for j in 0 ..< NR:
-        vC[i, j] += pAB[i][j]
-  else:                      # C += αAB
-    for i in 0 ..< MR:
-      for j in 0 ..< NR:
-        vC[i, j] += alpha * pAB[i][j]
-
-  # TODO: Fused operations like relu/sigmoid/tanh
-  #       should be done here as well
-
-proc gebb_ukernel_fallback*[T; ukernel: static MicroKernel](
-      kc: int,
-      alpha: T, packedA, packedB: ptr UncheckedArray[T],
-      beta: T, vC: MatrixView[T]
-    ) =
-  ukernel_generic_impl()
-
-  const is_c_unit_stride = ukernel.extract_c_unit_stride
-  gebb_ukernel_epilogue_fallback(alpha, to_ptr(AB, MR, NR, T), beta, vC)
-
-# ############################################################
-#
-#                      Matrix edges
-#
-# ############################################################
-
-func gebb_ukernel_edge_epilogue*[MR, NR: static int, T](
-      alpha: T, AB: ptr array[MR, array[NR, T]],
-      beta: T, vC: MatrixView[T],
-      mr, nr: int # Tail to process
-    ){.inline.} =
-
-  let pAB{.restrict.} = assume_aligned cast[ptr array[MR, array[NR, T]]](AB[0][0].unsafeAddr)
-
-  if beta == 0.T:
-    if alpha == 1.T:         # C = AB
-      for i in 0 ..< mr:
-        for j in 0 ..< nr:
-          vC[i, j] = pAB[i][j]
-    else:                    # C = αAB
-      for i in 0 ..< mr:
-        for j in 0 ..< nr:
-          vC[i, j] = alpha * pAB[i][j]
-  else:                      # C *= β
-    for i in 0 ..< mr:
-      for j in 0 ..< nr:
-        vC[i, j] *= beta
-
-    if alpha == 1.T:         # C += AB
-      for i in 0 ..< mr:
-        for j in 0 ..< nr:
-          
vC[i, j] += pAB[i][j] - else: # C += αAB - for i in 0 ..< mr: - for j in 0 ..< nr: - vC[i, j] += alpha * pAB[i][j] - - # TODO: Fused operations like relu/sigmoid/tanh - # should be done here as well - -proc gebb_ukernel_edge_fallback*[T; ukernel: static MicroKernel]( - mr, nr, kc: int, - alpha: T, packedA, packedB: ptr UncheckedArray[T], - beta: T, vC: MatrixView[T] - ) = - ukernel_generic_impl() - gebb_ukernel_edge_epilogue(alpha, to_ptr(AB, MR, NR, T), beta, vC, mr, nr) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse.nim deleted file mode 100644 index acdf500..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse.nim +++ /dev/null @@ -1,26 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -template float32x4_muladd_unfused(a, b, c: m128): m128 = - mm_add_ps(mm_mul_ps(a, b), c) - -ukernel_generator( - x86_SSE, - typ = float32, - vectype = m128, - nb_scalars = 4, - simd_setZero = mm_setzero_ps, - simd_broadcast_value = mm_set1_ps, - simd_load_aligned = mm_load_ps, - simd_load_unaligned = mm_loadu_ps, - simd_store_unaligned = mm_storeu_ps, - simd_mul = mm_mul_ps, - simd_add = mm_add_ps, - simd_fma = float32x4_muladd_unfused - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse2.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse2.nim deleted file mode 100644 index a5d47fa..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse2.nim +++ /dev/null @@ -1,129 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -template float64x2_muladd_unfused(a, b, c: m128d): m128d = - mm_add_pd(mm_mul_pd(a, b), c) - -ukernel_generator( - x86_SSE2, - typ = float64, - vectype = m128d, - nb_scalars = 2, - simd_setZero = mm_setzero_pd, - simd_broadcast_value = mm_set1_pd, - simd_load_aligned = mm_load_pd, - simd_load_unaligned = mm_loadu_pd, - simd_store_unaligned = mm_storeu_pd, - simd_mul = mm_mul_pd, - simd_add = mm_add_pd, - simd_fma = float64x2_muladd_unfused - ) - -####################################### -# -# Int32: hack to unroll scalar code -# -####################################### - -# This is faster than using the fallback for mm_mullo_epi32 -# in laser/primitives/private/sse2_utils - -# Note that we are quite limited in registers with scalar code -# as those are competing with all loop controls, conditions, ... 
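To make the register remark above concrete before the definitions that follow: a 2-element array stands in for a SIMD register, so each "lane" gets its own independent dependency chain that the C compiler can keep in a register. A self-contained sketch of the idea behind the Int32x2/Int64x2 fallbacks below (illustrative only; Pair and fma_pair are hypothetical names, not part of the patch):

type Pair = array[2, int32]

template fma_pair(a, b, c: Pair): Pair =
  # both lanes are independent, mirroring the fallbacks below
  [c[0] + a[0]*b[0], c[1] + a[1]*b[1]]

when isMainModule:
  doAssert fma_pair([1'i32, 2], [3'i32, 4], [5'i32, 6]) == [8'i32, 14]
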
-
-type Int32x2 = array[2, int32]
-
-func setZero_int32_sse2_fallback(): Int32x2 {.inline.} =
-  discard
-
-template set1_int32_sse2_fallback(a: int32): Int32x2 =
-  [a, a]
-
-func load_int32_sse2_fallback(mem_addr: ptr int32): Int32x2 {.inline.}=
-  let p = cast[ptr UncheckedArray[int32]](mem_addr)
-  [p[0], p[1]]
-
-func store_int32_sse2_fallback(mem_addr: ptr int32, a: Int32x2) {.inline.}=
-  let p = cast[ptr UncheckedArray[int32]](mem_addr)
-  p[0] = a[0]
-  p[1] = a[1]
-
-template add_int32_sse2_fallback(a, b: Int32x2): Int32x2 =
-  [a[0] + b[0], a[1] + b[1]]
-
-template mul_int32_sse2_fallback(a, b: Int32x2): Int32x2 =
-  [a[0] * b[0], a[1] * b[1]]
-
-template fma_int32_sse2_fallback(a, b, c: Int32x2): Int32x2 =
-  ## By mistake I once had c[0] instead of c[1] here, and it was twice as fast
-  [c[0] + a[0]*b[0], c[1] + a[1]*b[1]]
-
-ukernel_generator(
-      x86_SSE2,
-      typ = int32,
-      vectype = Int32x2,
-      nb_scalars = 2,
-      simd_setZero = setZero_int32_sse2_fallback,
-      simd_broadcast_value = set1_int32_sse2_fallback,
-      simd_load_aligned = load_int32_sse2_fallback,
-      simd_load_unaligned = load_int32_sse2_fallback,
-      simd_store_unaligned = store_int32_sse2_fallback,
-      simd_mul = mul_int32_sse2_fallback,
-      simd_add = add_int32_sse2_fallback,
-      simd_fma = fma_int32_sse2_fallback
-    )
-
-
-#######################################
-#
-# Int64: hack to unroll scalar code
-#
-#######################################
-
-type Int64x2 = array[2, int64]
-
-func setZero_int64_sse2_fallback(): Int64x2 {.inline.} =
-  discard
-
-template set1_int64_sse2_fallback(a: int64): Int64x2 =
-  [a, a]
-
-func load_int64_sse2_fallback(mem_addr: ptr int64): Int64x2 {.inline.}=
-  let p = cast[ptr UncheckedArray[int64]](mem_addr)
-  [p[0], p[1]]
-
-func store_int64_sse2_fallback(mem_addr: ptr int64, a: Int64x2) {.inline.}=
-  let p = cast[ptr UncheckedArray[int64]](mem_addr)
-  p[0] = a[0]
-  p[1] = a[1]
-
-template add_int64_sse2_fallback(a, b: Int64x2): Int64x2 =
-  [a[0] + b[0], a[1] + b[1]]
-
-template mul_int64_sse2_fallback(a, b: Int64x2): Int64x2 =
-  [a[0] * b[0], a[1] * b[1]]
-
-template fma_int64_sse2_fallback(a, b, c: Int64x2): Int64x2 =
-  ## By mistake I once had c[0] instead of c[1] here, and it was twice as fast
-  [c[0] + a[0]*b[0], c[1] + a[1]*b[1]]
-
-ukernel_generator(
-      x86_SSE2,
-      typ = int64,
-      vectype = Int64x2,
-      nb_scalars = 2,
-      simd_setZero = setZero_int64_sse2_fallback,
-      simd_broadcast_value = set1_int64_sse2_fallback,
-      simd_load_aligned = load_int64_sse2_fallback,
-      simd_load_unaligned = load_int64_sse2_fallback,
-      simd_store_unaligned = store_int64_sse2_fallback,
-      simd_mul = mul_int64_sse2_fallback,
-      simd_add = add_int64_sse2_fallback,
-      simd_fma = fma_int64_sse2_fallback
-    )
diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse4_1.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse4_1.nim
deleted file mode 100644
index f7c724e..0000000
--- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse4_1.nim
+++ /dev/null
@@ -1,35 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
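The kernel below is gated on x86_SSE4_1 because _mm_mullo_epi32 (lane-wise 32-bit multiply keeping the low half) only appeared with SSE4.1; SSE2 offers just mm_mullo_epi16 and mm_mul_epu32, which is why the scalar int32 fallback above exists. A scalar model of the semantics assumed here (editorial illustration; mullo_epi32_model is a hypothetical name):

func mullo_epi32_model(a, b: array[4, int32]): array[4, int32] =
  for i in 0 ..< 4:
    # full product modulo 2^32, keeping the low 32 bits of each lane
    result[i] = cast[int32](cast[uint32](a[i]) * cast[uint32](b[i]))
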
-
-import
-  ./gemm_ukernel_generator, ./gemm_tiling,
-  ../laser_utils/simd
-
-template int32x4_muladd_unfused_sse4_1(a, b, c: m128i): m128i =
-  mm_add_epi32(mm_mullo_epi32(a, b), c)
-
-template int32x4_loada(mem_addr: ptr int32): m128i =
-  mm_load_si128(cast[ptr m128i](mem_addr))
-
-template int32x4_loadu(mem_addr: ptr int32): m128i =
-  mm_loadu_si128(cast[ptr m128i](mem_addr))
-
-template int32x4_storeu(mem_addr: ptr int32, a: m128i) =
-  mm_storeu_si128(cast[ptr m128i](mem_addr), a)
-
-ukernel_generator(
-      x86_SSE4_1,
-      typ = int32,
-      vectype = m128i,
-      nb_scalars = 4,
-      simd_setZero = mm_setzero_si128,
-      simd_broadcast_value = mm_set1_epi32,
-      simd_load_aligned = int32x4_loada,
-      simd_load_unaligned = int32x4_loadu,
-      simd_store_unaligned = int32x4_storeu,
-      simd_mul = mm_mullo_epi32,
-      simd_add = mm_add_epi32,
-      simd_fma = int32x4_muladd_unfused_sse4_1
-    )
diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_utils.nim b/benchmarks/matmul/laser_gemm_backend/gemm_utils.nim
deleted file mode 100644
index 82ceb79..0000000
--- a/benchmarks/matmul/laser_gemm_backend/gemm_utils.nim
+++ /dev/null
@@ -1,60 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-# ############################################################
-#
-#                  Pointer arithmetics
-#
-# ############################################################
-
-# Warning: with pointer arithmetic, be careful not to pass a `var ptr`
-# to a function, as `var` parameters are passed by hidden pointer in Nim
-# and the wrong pointer would be modified. Templates are fine.
-
-func `+`*(p: ptr, offset: int): type(p) {.inline.}=
-  ## Pointer increment
-  {.emit: "`result` = `p` + `offset`;".}
-
-# ############################################################
-#
-#  Conversion of the AB auxiliary matrix from SIMD to scalar
-#
-# ############################################################
-import ../laser_utils/compiler_optim_hints
-
-template to_ptr*(AB: typed, MR, NR: static int, T: typedesc): untyped =
-  assume_aligned cast[ptr array[MR, array[NR, T]]](AB[0][0].unsafeaddr)
-
-# ############################################################
-#
-#                      Matrix View
-#
-# ############################################################
-
-type
-  MatrixView*[T] = object
-    buffer*: ptr UncheckedArray[T]
-    rowStride*, colStride*: int
-
-func toMatrixView*[T](data: ptr T, rowStride, colStride: int): MatrixView[T] {.inline.} =
-  result.buffer = cast[ptr UncheckedArray[T]](data)
-  result.rowStride = rowStride
-  result.colStride = colStride
-
-template `[]`*[T](view: MatrixView[T], row, col: Natural): T =
-  ## Access like a 2D matrix
-  view.buffer[row * view.rowStride + col * view.colStride]
-
-template `[]=`*[T](view: MatrixView[T], row, col: Natural, value: T) =
-  ## Access like a 2D matrix
-  view.buffer[row * view.rowStride + col * view.colStride] = value
-
-func stride*[T](view: MatrixView[T], row, col: Natural): MatrixView[T]{.inline.}=
-  ## Returns a new view whose origin is offset by `row` rows and `col` columns
-  result.buffer = cast[ptr UncheckedArray[T]](
-    addr view.buffer[row*view.rowStride + col*view.colStride]
-  )
-  result.rowStride = view.rowStride
-  result.colStride = view.colStride
diff --git a/benchmarks/matmul/laser_utils/align_unroller.nim b/benchmarks/matmul/laser_utils/align_unroller.nim
deleted file mode 100644
index 6b861f0..0000000
--- 
a/benchmarks/matmul/laser_utils/align_unroller.nim
+++ /dev/null
@@ -1,41 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-func round_step_down*(x: Natural, step: static Natural): int {.inline.} =
-  ## Round the input to the previous multiple of "step"
-  when (step and (step - 1)) == 0:
-    # Step is a power of 2. (If the compiler cannot prove that x > 0, it cannot do this optimization itself.)
-    result = x and not(step - 1)
-  else:
-    result = x - x mod step
-
-func round_step_up*(x: Natural, step: static Natural): int {.inline.} =
-  ## Round the input to the next multiple of "step"
-  when (step and (step - 1)) == 0:
-    # Step is a power of 2. (If the compiler cannot prove that x > 0, it cannot do this optimization itself.)
-    result = (x + step - 1) and not(step - 1)
-  else:
-    result = ((x + step - 1) div step) * step
-
-when isMainModule:
-  doAssert round_step_up(10, 4) == 12
-  doAssert round_step_up(10, 8) == 16
-  doAssert round_step_up(65, 64) == 128
-  doAssert round_step_up(1, 3) == 3
-  doAssert round_step_up(19, 24) == 24
-  doAssert round_step_up(8, 4) == 8
-  doAssert round_step_up(64, 64) == 64
-  doAssert round_step_up(24, 24) == 24
-  doAssert round_step_up(3, 3) == 3
-
-  doAssert round_step_down(10, 4) == 8
-  doAssert round_step_down(10, 8) == 8
-  doAssert round_step_down(65, 64) == 64
-  doAssert round_step_down(1, 3) == 0
-  doAssert round_step_down(19, 24) == 0
-  doAssert round_step_down(8, 4) == 8
-  doAssert round_step_down(64, 64) == 64
-  doAssert round_step_down(24, 24) == 24
-  doAssert round_step_down(3, 3) == 3
diff --git a/benchmarks/matmul/laser_utils/compiler_optim_hints.nim b/benchmarks/matmul/laser_utils/compiler_optim_hints.nim
deleted file mode 100644
index 451ffea..0000000
--- a/benchmarks/matmul/laser_utils/compiler_optim_hints.nim
+++ /dev/null
@@ -1,149 +0,0 @@
-# Laser & Arraymancer
-# Copyright (c) 2017-2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-const LASER_MEM_ALIGN*{.intdefine.} = 64
-static:
-  assert LASER_MEM_ALIGN != 0, "Alignment " & $LASER_MEM_ALIGN & " must not be 0"
-  assert (LASER_MEM_ALIGN and (LASER_MEM_ALIGN - 1)) == 0, "Alignment " & $LASER_MEM_ALIGN & " must be a power of 2"
-
-template withCompilerOptimHints*() =
-  # See https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html
-  # and https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#Common-Variable-Attributes
-
-  # Variable is created aligned by LASER_MEM_ALIGN.
-  # This is useful to ensure an object can be loaded
-  # in a minimal number of cache line loads.
-  # For example, the stack part of tensors are 128 bytes and can be loaded in 2 cache lines
-  # but would require 3 loads if they are misaligned.
-  {.pragma: align_variable, codegenDecl: "$# $# __attribute__((aligned(" & $LASER_MEM_ALIGN & ")))".}
-
-  # Variable. Pointer does not alias any existing valid pointers.
-  when not defined(vcc):
-    {.pragma: restrict, codegenDecl: "$# __restrict__ $#".}
-  else:
-    {.pragma: restrict, codegenDecl: "$# __restrict $#".}
-
-const withBuiltins = defined(gcc) or defined(clang) or defined(icc)
-
-type
-  PrefetchRW* {.size: cint.sizeof.} = enum
-    Read = 0
-    Write = 1
-  PrefetchLocality* {.size: cint.sizeof.} = enum
-    NoTemporalLocality = 0 # Data can be discarded from CPU cache after access
-    LowTemporalLocality = 1
-    ModerateTemporalLocality = 2
-    HighTemporalLocality = 3 # Data should be left in all levels of cache possible
-    # Translation
-    # 0 - use no cache eviction level
-    # 1 - L1 cache eviction level
-    # 2 - L2 cache eviction level
-    # 3 - L1 and L2 cache eviction level
-
-when withBuiltins:
-  proc builtin_assume_aligned(data: pointer, alignment: csize): pointer {.importc: "__builtin_assume_aligned", noDecl.}
-  proc builtin_prefetch(data: pointer, rw: PrefetchRW, locality: PrefetchLocality) {.importc: "__builtin_prefetch", noDecl.}
-
-when defined(cpp):
-  proc static_cast[T: ptr](input: pointer): T
-    {.importcpp: "static_cast<'0>(@)".}
-
-template assume_aligned*[T](data: ptr T, alignment: static int = LASER_MEM_ALIGN): ptr T =
-  when defined(cpp) and withBuiltins: # builtin_assume_aligned returns void pointers, this does not compile in C++, they must all be typed
-    static_cast[ptr T](builtin_assume_aligned(data, alignment))
-  elif withBuiltins:
-    cast[ptr T](builtin_assume_aligned(data, alignment))
-  else:
-    data
-
-template prefetch*[T](
-            data: ptr (T or UncheckedArray[T]),
-            rw: static PrefetchRW = Read,
-            locality: static PrefetchLocality = HighTemporalLocality) =
-  ## Prefetch examples:
-  ##   - https://scripts.mit.edu/~birge/blog/accelerating-code-using-gccs-prefetch-extension/
-  ##   - https://stackoverflow.com/questions/7327994/prefetching-examples
-  ##   - https://lemire.me/blog/2018/04/30/is-software-prefetching-__builtin_prefetch-useful-for-performance/
-  ##   - https://www.naftaliharris.com/blog/2x-speedup-with-one-line-of-code/
-  when withBuiltins:
-    builtin_prefetch(data, rw, locality)
-  else:
-    discard
-
-template pragma_ivdep() =
-  ## Tell the compiler to ignore unproven loop dependencies
-  ## such as "a[i] = a[i + k] * c;" if k is unknown, as it introduces a loop
-  ## dependency if it's negative
-  ## https://software.intel.com/en-us/node/524501
-  ##
-  ## Placeholder
-  # We don't expose that as it only works on C for loops. Nim only generates while loops,
-  # except when using OpenMP. But the OpenMP "simd" already achieves the same as ivdep.
-  when defined(gcc):
-    {.emit: "#pragma GCC ivdep".}
-  else: # Supported on ICC and Cray
-    {.emit: "#pragma ivdep".}
-
-template withCompilerFunctionHints() =
-  ## Not exposed, Nim codegen will declare them as normal C functions.
-  ## This interferes with N_NIMCALL, N_LIB_PRIVATE, N_INLINE and also
-  ## creates duplicate symbols when a function called by a hot or pure function
-  ## is public and inline (because hot and pure cascade to all functions called),
-  ## and they cannot be stacked easily: of (hot, pure) only the last applies
-
-  # Function. Returned pointer is aligned to LASER_MEM_ALIGN
-  {.pragma: aligned_ptr_result, codegenDecl: "__attribute__((assume_aligned(" & $LASER_MEM_ALIGN & ")) $# $#$#".}
-
-  # Function. Returned pointer cannot alias any other valid pointer and no pointers to valid object occur in any
-  # storage pointed to.
-  {.pragma: malloc, codegenDecl: "__attribute__((malloc)) $# $#$#".}
-
-  # Function. Creates one or more function versions that can process multiple arguments using SIMD.
-  # Ignored when -fopenmp is used and within an OpenMP simd loop
-  {.pragma: simd, codegenDecl: "__attribute__((simd)) $# $#$#".}
-
-  # Function. Indicates hot and cold paths. Ignored when using profile guided optimization.
-  {.pragma: hot, codegenDecl: "__attribute__((hot)) $# $#$#".}
-  {.pragma: cold, codegenDecl: "__attribute__((cold)) $# $#$#".}
-
-  # ## pure and const
-  # ## Affect Common Sub-expression Elimination, Dead Code Elimination and loop optimization.
-  # See
-  #   - https://lwn.net/Articles/285332/
-  #   - http://benyossef.com/helping-the-compiler-help-you/
-  #
-  # Function. The function only accesses its input params and global variables state.
-  # It does not modify any global; calling it multiple times with the same params
-  # and global variables will produce the same result.
-  {.pragma: gcc_pure, codegenDecl: "__attribute__((pure)) $# $#$#".}
-  #
-  # Function. The function only accesses its input params and calling it multiple times
-  # with the same params will produce the same result.
-  # Warning ⚠:
-  #   Pointer inputs must not be dereferenced to read the memory pointed to.
-  #   In Nim stack arrays are passed by pointers and big stack data structures
-  #   are passed by reference as well. I.e. Result unknown.
-  {.pragma: gcc_const, codegenDecl: "__attribute__((const)) $# $#$#".}
-
-  # We don't define per-function fast-math, GCC attribute optimize is broken:
-  # --> https://gcc.gnu.org/ml/gcc/2009-10/msg00402.html
-  #
-  # Workarounds for floating point latency, for algorithms like sum,
-  # should be done manually.
-  #
-  # See : https://stackoverflow.com/questions/39095993/does-each-floating-point-operation-take-the-same-time
-  # and https://www.agner.org/optimize/vectorclass.pdf "Using multiple accumulators"
-  #
-  # FP addition has a latency of 3~5 clock cycles, i.e. the result cannot be reused for that much time.
-  # But the throughput is 1 FP add per clock cycle (and even 2 per clock cycle for Skylake)
-  # So we need to use extra accumulators to fully utilize the FP throughput despite FP latency.
-  # On Skylake, all FP latencies are 4: https://www.agner.org/optimize/blog/read.php?i=415
-  #
-  # Note that this is per CPU core; each core needs its own "global CPU accumulator" to combat
-  # false sharing when multithreading.
-  #
-  # This wouldn't be needed with fast-math because the compiler would consider FP addition
-  # associative and create intermediate variables as needed to exploit this throughput.
-
diff --git a/benchmarks/matmul/laser_utils/memory.nim b/benchmarks/matmul/laser_utils/memory.nim
deleted file mode 100644
index 7b84e81..0000000
--- a/benchmarks/matmul/laser_utils/memory.nim
+++ /dev/null
@@ -1,20 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
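The multiple-accumulator note that closes compiler_optim_hints.nim above deserves a concrete sketch: splitting a reduction across several independent accumulators breaks the FP dependency chain and hides the roughly 4-cycle add latency without fast-math. Editorial illustration only; sum4 is a hypothetical name, not part of the patch:

func sum4[T: SomeFloat](x: openArray[T]): T =
  # four independent dependency chains keep the FP adder busy every cycle
  var acc: array[4, T]
  var i = 0
  while i + 4 <= x.len:
    for j in 0 ..< 4:
      acc[j] += x[i + j]
    i += 4
  while i < x.len:  # leftover tail
    acc[0] += x[i]
    inc i
  result = (acc[0] + acc[1]) + (acc[2] + acc[3])
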
-
-import ./compiler_optim_hints, typetraits
-
-func align_raw_data*(T: typedesc, p: pointer): ptr UncheckedArray[T] =
-  static: assert T.supportsCopyMem
-  withCompilerOptimHints()
-
-  let address = cast[ByteAddress](p)
-  let aligned_ptr{.restrict.} = block: # We cannot directly apply restrict to the default "result"
-    let remainder = address and (LASER_MEM_ALIGN - 1) # modulo LASER_MEM_ALIGN (power of 2)
-    if remainder == 0:
-      assume_aligned cast[ptr UncheckedArray[T]](address)
-    else:
-      let offset = LASER_MEM_ALIGN - remainder
-      assume_aligned cast[ptr UncheckedArray[T]](address +% offset)
-  return aligned_ptr
diff --git a/benchmarks/matmul/laser_utils/openmp.nim b/benchmarks/matmul/laser_utils/openmp.nim
deleted file mode 100644
index 092dfaa..0000000
--- a/benchmarks/matmul/laser_utils/openmp.nim
+++ /dev/null
@@ -1,386 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-# ###############################################################
-# Compile-time name mangling for OpenMP thresholds
-# Workaround https://github.com/nim-lang/Nim/issues/9365
-# and https://github.com/nim-lang/Nim/issues/9366
-import random
-from strutils import toHex
-
-var mangling_rng {.compileTime.} = initRand(0x1337DEADBEEF)
-var current_suffix {.compileTime.} = ""
-
-proc omp_suffix*(genNew: static bool = false): string {.compileTime.} =
-  ## genNew:
-  ##   if false, return the last suffix
-  ##   else return a fresh one
-  # This is exported because you cannot bind the symbol early enough
-  # for exportc
-
-  if genNew:
-    current_suffix = mangling_rng.rand(high(uint32)).toHex
-  result = current_suffix
-
-# ################################################################
-# Tuning
-
-when defined(openmp):
-  {.passC: "-fopenmp".}
-  {.passL: "-fopenmp".}
-
-  {.pragma: omp, header:"omp.h".}
-
-  proc omp_set_num_threads*(x: cint) {.omp.}
-  proc omp_get_num_threads*(): cint {.omp.}
-  proc omp_get_max_threads*(): cint {.omp.} # This takes hyperthreading into account
-  proc omp_get_thread_num*(): cint {.omp.}
-  proc omp_set_nested*(x: cint) {.omp.}
-  proc omp_get_nested*(): cint {.omp.}
-
-else:
-  template omp_set_num_threads*(x: cint) = discard
-  template omp_get_num_threads*(): cint = 1
-  template omp_get_max_threads*(): cint = 1
-  template omp_get_thread_num*(): cint = 0
-  template omp_set_nested*(x: cint) = discard
-  template omp_get_nested*(): cint = cint 0
-
-# TODO tuning for architectures
-# https://github.com/zy97140/omp-benchmark-for-pytorch
-# https://github.com/zy97140/omp-benchmark-for-pytorch/blob/master/benchmark-data/IntelR-XeonR-CPU-E5-2669-v4.md
-# https://github.com/zy97140/omp-benchmark-for-pytorch/blob/master/benchmark-data/IntelR-XeonR-Platinum-8180-CPU.md
-
-
-const OMP_MEMORY_BOUND_GRAIN_SIZE*{.intdefine.} = 1024
-  ## This is the minimum amount of work per physical core
-  ## for memory-bound processing.
-  ##   - "copy" and "addition" are considered memory-bound
-  ##   - "float division" can be considered 2x~4x more complex
-  ##     and should be scaled down accordingly
-  ##   - "exp" and "sin" operations are compute-bound and
-  ##     there is a perf boost even when processing
-  ##     only 1000 items on 28 cores
-  ##
-  ## Launching 2 threads per core (HyperThreading) is probably desirable:
-  ##   - https://medium.com/data-design/destroying-the-myth-of-number-of-threads-number-of-physical-cores-762ad3919880
-  ##
-  ## Raising the following parameters can have the following impact:
-  ##   - number of sockets: higher, more overhead per memory fetch
-  ##   - number of memory channels: lower, less overhead per memory fetch
-  ##   - RAM speed: lower, less overhead per memory fetch
-  ##   - Private L2 cache: higher, feed more data per CPU
-  ##   - Hyperthreading and cache associativity
-  ##   - Cores, shared L3 cache: Memory contention
-  ##
-  ## Note that setting num_threads manually might impact performance negatively:
-  ##   - http://studio.myrian.fr/openmp-et-num_threads/
-  ##     > 2x2ms overhead when changing num_threads from 16->6->16
-
-const OMP_NON_CONTIGUOUS_SCALE_FACTOR*{.intdefine.} = 4
-  ## Due to striding computation, we can use a lower grainsize
-  ## for non-contiguous tensors
-
-# ################################################################
-
-template attachGC*(): untyped =
-  ## If you are allocating reference types, sequences or strings
-  ## in a parallel section, you need to attach and detach
-  ## a GC for each thread. Those should be thread-local temporaries.
-  ##
-  ## This attaches the GC.
-  ##
-  ## Note: this creates strange error messages
-  ##       when --threads is not on: https://github.com/nim-lang/Nim/issues/9489
-  if(omp_get_thread_num()!=0):
-    setupForeignThreadGc()
-
-template detachGC*(): untyped =
-  ## If you are allocating reference types, sequences or strings
-  ## in a parallel section, you need to attach and detach
-  ## a GC for each thread. Those should be thread-local temporaries.
-  ##
-  ## This detaches the GC.
-  ##
-  ## Note: this creates strange error messages
-  ##       when --threads is not on: https://github.com/nim-lang/Nim/issues/9489
-  if(omp_get_thread_num()!=0):
-    teardownForeignThreadGc()
-
-template omp_parallel*(body: untyped): untyped =
-  ## Starts an OpenMP parallel section
-  ##
-  ## Don't forget to use attachGC and detachGC if you are allocating
-  ## sequences, strings, or reference types.
-  ## Those should be thread-local temporaries.
-  {.emit: "#pragma omp parallel".}
-  block: body
-
-template omp_parallel_if*(condition: bool, body: untyped) =
-  let predicate = condition # Make symbol valid and ensure it's lvalue
-  {.emit: "#pragma omp parallel if (`predicate`)".}
-  block: body
-
-template omp_for*(
-    index: untyped,
-    length: Natural,
-    use_simd, nowait: static bool,
-    body: untyped
-  ) =
-  ## OpenMP for loop (not parallel)
-  ##
-  ## This must be used in an `omp_parallel` block
-  ## for parallelization.
-  ##
-  ## Inputs:
-  ##   - `index`, the iteration index, similar to
-  ##     for `index` in 0 ..< length:
-  ##       doSomething(`index`)
-  ##   - `length`, the number of elements to iterate on
-  ##   - `use_simd`, instruct the compiler to unroll the loops for `simd` use.
-  ##     For example, for float32:
-  ##     for i in 0..<16:
-  ##       x[i] += y[i]
-  ##     will be unrolled to take 128, 256 or 512-bit to use SSE, AVX or AVX512.
-  ##     for 256-bit AVX:
-  ##     for i in countup(0, 15, 8): # Step 8 by 8
-  ##       x[i]   += y[i]
-  ##       x[i+1] += y[i+1]
-  ##       x[i+2] += y[i+2]
-  ##       ...
-  const omp_annotation = block:
-    "for " &
-      (when use_simd: "simd " else: "") &
-      (when nowait: "nowait " else: "")
-  for `index`{.inject.} in `||`(0, length-1, omp_annotation):
-    block: body
-
-template omp_parallel_for*(
-    index: untyped,
-    length: Natural,
-    omp_grain_size: static Natural,
-    use_simd: static bool,
-    body: untyped
-  ) =
-  ## Parallel for loop
-  ##
-  ## Do not forget to use attachGC and detachGC if you are allocating
-  ## sequences, strings, or reference types.
-  ## Those should be thread-local temporaries.
-  ##
-  ## Inputs:
-  ##   - `index`, the iteration index, similar to
-  ##     for `index` in 0 ..< length:
-  ##       doSomething(`index`)
-  ##   - `length`, the number of elements to iterate on
-  ##   - `omp_grain_size`, the minimal amount of work per thread. If below,
-  ##     we don't start threads. Note that we always start as many hardware threads
-  ##     as available, as starting a varying number of threads over the lifetime
-  ##     of the program will add overhead.
-  ##   - `use_simd`, instruct the compiler to unroll the loops for `simd` use.
-  ##     For example, for float32:
-  ##     for i in 0..<16:
-  ##       x[i] += y[i]
-  ##     will be unrolled to take 128, 256 or 512-bit to use SSE, AVX or AVX512.
-  ##     for 256-bit AVX:
-  ##     for i in countup(0, 15, 8): # Step 8 by 8
-  ##       x[i]   += y[i]
-  ##       x[i+1] += y[i+1]
-  ##       x[i+2] += y[i+2]
-  ##       ...
-  when not defined(openmp):
-    ## When OpenMP is not defined we use this simple loop as fallback
-    ## This way, the compiler will still be provided "simd" vectorization hints
-    when use_simd:
-      const omp_annotation = "parallel for simd"
-    else:
-      const omp_annotation = "parallel for"
-    for `index`{.inject.} in `||`(0, length-1, omp_annotation):
-      block: body
-  else:
-    let omp_size = length # make sure if length is computed it's only done once
-
-    const # Workaround to expose a unique symbol in C. TODO Pending OpenMP interpolation: https://github.com/nim-lang/Nim/issues/9365
-      omp_condition_csym = "omp_condition_" & omp_suffix(genNew = true)
-    let omp_condition {.exportc: "omp_condition_" & # We cannot use csym directly in exportc
-      omp_suffix().} = omp_grain_size * omp_get_max_threads() < omp_size
-
-    const omp_annotation = block:
-      "parallel for " &
-        (when use_simd: "simd " else: "") &
-        "if(" & $omp_condition_csym & ")"
-
-    for `index`{.inject.} in `||`(0, omp_size - 1, omp_annotation):
-      block: body
-
-template omp_parallel_for_default*(
-    index: untyped,
-    length: Natural,
-    body: untyped
-  ) =
-  ## This will be renamed omp_parallel_for once
-  ## https://github.com/nim-lang/Nim/issues/9414 is solved.
-  ## Compared to omp_parallel_for the following are set by default
-  ##   - omp_grain_size:
-  ##     The default `OMP_MEMORY_BOUND_GRAIN_SIZE` is suitable for
-  ##     contiguous copy or add operations. It's 1024 and can be changed
-  ##     by passing `-d:OMP_MEMORY_BOUND_GRAIN_SIZE=123456` during compilation.
-  ##     A value of 1 will always parallelize the loop.
-  ##   - simd is used by default
-  omp_parallel_for(
-    index,
-    length,
-    omp_grain_size = OMP_MEMORY_BOUND_GRAIN_SIZE,
-    use_simd = true,
-    body
-  )
-
-template omp_chunks*(
-    omp_size: Natural, #{lvalue} # TODO parameter constraint, pending https://github.com/nim-lang/Nim/issues/9620
-    chunk_offset, chunk_size: untyped,
-    body: untyped): untyped =
-  ## Internal template
-  ## This is the chunking part of omp_parallel_chunks.
-  ## omp_size should be an lvalue (an assigned value) and not
-  ## the result of a routine, otherwise the routine and its side effects will be called multiple times
-
-  # The following simple chunking scheme can lead to severe load imbalance
-  #
-  # `chunk_offset`{.inject.} = chunk_size * thread_id
-  # `chunk_size`{.inject.} =  if thread_id < nb_chunks - 1: chunk_size
-  #                           else: omp_size - chunk_offset
-  #
-  # For example dividing 40 items on 12 threads will lead to
-  # a base_chunk_size of 40/12 = 3 so work on the first 11 threads
-  # will be 3 * 11 = 33, and the remainder 7 on the last thread.
-  let
-    nb_chunks = omp_get_num_threads()
-    base_chunk_size = omp_size div nb_chunks
-    remainder = omp_size mod nb_chunks
-    thread_id = omp_get_thread_num()
-
-  # Instead of dividing 40 work items on 12 cores into:
-  # 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
-  # the following scheme will divide into
-  # 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
-  #
-  # This is compliant with OpenMP spec (page 60)
-  # http://www.openmp.org/mp-documents/openmp-4.5.pdf
-  # "When no chunk_size is specified, the iteration space is divided into chunks
-  # that are approximately equal in size, and at most one chunk is distributed to
-  # each thread. The size of the chunks is unspecified in this case."
-  # ---> chunks are the same ±1
-
-  var `chunk_offset`{.inject.}, `chunk_size`{.inject.}: Natural
-  if thread_id < remainder:
-    chunk_offset = (base_chunk_size + 1) * thread_id
-    chunk_size = base_chunk_size + 1
-  else:
-    chunk_offset = base_chunk_size * thread_id + remainder
-    chunk_size = base_chunk_size
-
-  block: body
-
-template omp_parallel_chunks*(
-    length: Natural,
-    chunk_offset, chunk_size: untyped,
-    omp_grain_size: static Natural,
-    body: untyped): untyped =
-  ## Create a chunk for each thread. You can use:
-  ## `for index in chunk_offset ..< chunk_offset + chunk_size:` or
-  ## `zeroMem(foo[chunk_offset].addr, chunk_size)`
-  ##
-  ## Splits the input `length` into chunks and does a parallel loop
-  ## on each chunk. The number of chunks depends on the number of cores at runtime.
-  ## `chunk_offset` and `chunk_size` should be passed as undeclared identifiers.
-  ## Within the template scope they will contain the start offset and the length
-  ## of the current thread chunk. I.e. their value is thread-specific.
-  ##
-  ## Use omp_get_thread_num() to get the current thread number
-  ##
-  ## This is useful for non-contiguous processing as a replacement to omp_parallel_for
-  ## or when operating on (contiguous) ranges for example for memset or memcpy.
-  ##
-  ## Do not forget to use attachGC and detachGC if you are allocating
-  ## sequences, strings, or reference types.
-  ## Those should be thread-local temporaries.
-  when not defined(openmp):
-    const `chunk_offset`{.inject.} = 0
-    let `chunk_size`{.inject.} = length
-    block: body
-  else:
-    let omp_size = length # make sure if length is computed it's only done once
-    let over_threshold = omp_grain_size * omp_get_max_threads() < omp_size
-
-    omp_parallel_if(over_threshold):
-      omp_chunks(omp_size, chunk_offset, chunk_size, body)
-
-template omp_parallel_chunks_default*(
-    length: Natural,
-    chunk_offset, chunk_size: untyped,
-    body: untyped): untyped =
-  ## This will be renamed omp_parallel_chunks once
-  ## https://github.com/nim-lang/Nim/issues/9414 is solved.
-  ## Compared to omp_parallel_chunks the following are set by default
-  ##   - omp_grain_size:
-  ##     The default `OMP_MEMORY_BOUND_GRAIN_SIZE` is suitable for
-  ##     contiguous copy or add operations. It's 1024 and can be changed
-  ##     by passing `-d:OMP_MEMORY_BOUND_GRAIN_SIZE=123456` during compilation.
-  ##     A value of 1 will always parallelize the loop.
-  omp_parallel_chunks(
-    length,
-    chunk_offset, chunk_size,
-    omp_grain_size = OMP_MEMORY_BOUND_GRAIN_SIZE,
-    body
-  )
-
-template omp_critical*(body: untyped): untyped =
-  {.emit: "#pragma omp critical".}
-  block: body
-
-template omp_master*(body: untyped): untyped =
-  {.emit: "#pragma omp master".}
-  block: body
-
-template omp_single*(body: untyped): untyped =
-  {.emit: "#pragma omp single".}
-  block: body
-
-template omp_single_nowait*(body: untyped): untyped =
-  {.emit: "#pragma omp single nowait".}
-  block: body
-
-template omp_barrier*(): untyped =
-  {.emit: "#pragma omp barrier".}
-
-template omp_task*(annotation: static string, body: untyped): untyped =
-  {.emit: "#pragma omp task " & annotation.}
-  block: body
-
-template omp_taskwait*(): untyped =
-  {.emit: "#pragma omp taskwait".}
-
-template omp_taskloop*(
-    index: untyped,
-    length: Natural,
-    annotation: static string,
-    body: untyped
-  ) =
-  ## OpenMP taskloop
-  const omp_annotation = "taskloop " & annotation
-  for `index`{.inject.} in `||`(0, length-1, omp_annotation):
-    block: body
-
-import macros
-macro omp_flush*(variables: varargs[untyped]): untyped =
-  var listvars = "("
-  for i, variable in variables:
-    if i == 0:
-      listvars.add "`" & $variable & "`"
-    else:
-      listvars.add ",`" & $variable & "`"
-  listvars.add ')'
-  result = quote do:
-    {.emit: "#pragma omp flush " & `listvars`.}
diff --git a/benchmarks/matmul/laser_utils/simd.nim b/benchmarks/matmul/laser_utils/simd.nim
deleted file mode 100644
index 52f84f2..0000000
--- a/benchmarks/matmul/laser_utils/simd.nim
+++ /dev/null
@@ -1,441 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
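A minimal usage sketch for the omp_parallel_for template defined in openmp.nim above (editorial illustration, not part of the patch; assumes compilation with -d:openmp, otherwise the serial fallback branch is taken):

var x, y: array[10_000, float32]
omp_parallel_for(i, x.len, omp_grain_size = 1024, use_simd = true):
  x[i] += y[i]
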
-
-when defined(i386) or defined(amd64):
-  # SIMD throughput and latency:
-  #   - https://software.intel.com/sites/landingpage/IntrinsicsGuide/
-  #   - https://www.agner.org/optimize/instruction_tables.pdf
-
-  # Reminder: x86 is little-endian, order is [low part, high part]
-  # Documentation at https://software.intel.com/sites/landingpage/IntrinsicsGuide/
-
-  when defined(vcc):
-    {.pragma: x86_type, byCopy, header:"<intrin.h>".}
-    {.pragma: x86, noDecl, header:"<intrin.h>".}
-  else:
-    {.pragma: x86_type, byCopy, header:"<x86intrin.h>".}
-    {.pragma: x86, noDecl, header:"<x86intrin.h>".}
-  type
-    m128* {.importc: "__m128", x86_type.} = object
-      raw: array[4, float32]
-    m128d* {.importc: "__m128d", x86_type.} = object
-      raw: array[2, float64]
-    m128i* {.importc: "__m128i", x86_type.} = object
-      raw: array[16, byte]
-    m256* {.importc: "__m256", x86_type.} = object
-      raw: array[8, float32]
-    m256d* {.importc: "__m256d", x86_type.} = object
-      raw: array[4, float64]
-    m256i* {.importc: "__m256i", x86_type.} = object
-      raw: array[32, byte]
-    m512* {.importc: "__m512", x86_type.} = object
-      raw: array[16, float32]
-    m512d* {.importc: "__m512d", x86_type.} = object
-      raw: array[8, float64]
-    m512i* {.importc: "__m512i", x86_type.} = object
-      raw: array[64, byte]
-    mmask16* {.importc: "__mmask16", x86_type.} = distinct uint16
-    mmask64* {.importc: "__mmask64", x86_type.} = distinct uint64
-
-  # ############################################################
-  #
-  #                 SSE - float32 - packed
-  #
-  # ############################################################
-
-  func mm_setzero_ps*(): m128 {.importc: "_mm_setzero_ps", x86.}
-  func mm_set1_ps*(a: float32): m128 {.importc: "_mm_set1_ps", x86.}
-  func mm_load_ps*(aligned_mem_addr: ptr float32): m128 {.importc: "_mm_load_ps", x86.}
-  func mm_loadu_ps*(data: ptr float32): m128 {.importc: "_mm_loadu_ps", x86.}
-  func mm_store_ps*(mem_addr: ptr float32, a: m128) {.importc: "_mm_store_ps", x86.}
-  func mm_storeu_ps*(mem_addr: ptr float32, a: m128) {.importc: "_mm_storeu_ps", x86.}
-  func mm_add_ps*(a, b: m128): m128 {.importc: "_mm_add_ps", x86.}
-  func mm_sub_ps*(a, b: m128): m128 {.importc: "_mm_sub_ps", x86.}
-  func mm_mul_ps*(a, b: m128): m128 {.importc: "_mm_mul_ps", x86.}
-  func mm_max_ps*(a, b: m128): m128 {.importc: "_mm_max_ps", x86.}
-  func mm_min_ps*(a, b: m128): m128 {.importc: "_mm_min_ps", x86.}
-  func mm_or_ps*(a, b: m128): m128 {.importc: "_mm_or_ps", x86.}
-
-  # ############################################################
-  #
-  #                 SSE - float32 - scalar
-  #
-  # ############################################################
-
-  func mm_load_ss*(aligned_mem_addr: ptr float32): m128 {.importc: "_mm_load_ss", x86.}
-  func mm_add_ss*(a, b: m128): m128 {.importc: "_mm_add_ss", x86.}
-  func mm_max_ss*(a, b: m128): m128 {.importc: "_mm_max_ss", x86.}
-  func mm_min_ss*(a, b: m128): m128 {.importc: "_mm_min_ss", x86.}
-
-  func mm_cvtss_f32*(a: m128): float32 {.importc: "_mm_cvtss_f32", x86.}
-    ## Extract the low part of the input
-    ## Input:
-    ##   { A0, A1, A2, A3 }
-    ## Result:
-    ##   A0
-
-  func mm_movehl_ps*(a, b: m128): m128 {.importc: "_mm_movehl_ps", x86.}
-    ## Input:
-    ##   { A0, A1, A2, A3 }, { B0, B1, B2, B3 }
-    ## Result:
-    ##   { B2, B3, A2, A3 }
-  func mm_movelh_ps*(a, b: m128): m128 {.importc: "_mm_movelh_ps", x86.}
-    ## Input:
-    ##   { A0, A1, A2, A3 }, { B0, B1, B2, B3 }
-    ## Result:
-    ##   { A0, A1, B0, B1 }
-
-  # ############################################################
-  #
-  #                 SSE2 - float64 - packed
-  #
-  # ############################################################
-
-  func mm_setzero_pd*(): 
m128d {.importc: "_mm_setzero_pd", x86.}
-  func mm_set1_pd*(a: float64): m128d {.importc: "_mm_set1_pd", x86.}
-  func mm_load_pd*(aligned_mem_addr: ptr float64): m128d {.importc: "_mm_load_pd", x86.}
-  func mm_loadu_pd*(mem_addr: ptr float64): m128d {.importc: "_mm_loadu_pd", x86.}
-  func mm_store_pd*(mem_addr: ptr float64, a: m128d) {.importc: "_mm_store_pd", x86.}
-  func mm_storeu_pd*(mem_addr: ptr float64, a: m128d) {.importc: "_mm_storeu_pd", x86.}
-  func mm_add_pd*(a, b: m128d): m128d {.importc: "_mm_add_pd", x86.}
-  func mm_sub_pd*(a, b: m128d): m128d {.importc: "_mm_sub_pd", x86.}
-  func mm_mul_pd*(a, b: m128d): m128d {.importc: "_mm_mul_pd", x86.}
-
-  # ############################################################
-  #
-  #                 SSE2 - integer - packed
-  #
-  # ############################################################
-
-  func mm_setzero_si128*(): m128i {.importc: "_mm_setzero_si128", x86.}
-  func mm_set1_epi8*(a: int8 or uint8): m128i {.importc: "_mm_set1_epi8", x86.}
-  func mm_set1_epi16*(a: int16 or uint16): m128i {.importc: "_mm_set1_epi16", x86.}
-  func mm_set1_epi32*(a: int32 or uint32): m128i {.importc: "_mm_set1_epi32", x86.}
-  func mm_set1_epi64x*(a: int64 or uint64): m128i {.importc: "_mm_set1_epi64x", x86.}
-  func mm_load_si128*(mem_addr: ptr m128i): m128i {.importc: "_mm_load_si128", x86.}
-  func mm_loadu_si128*(mem_addr: ptr m128i): m128i {.importc: "_mm_loadu_si128", x86.}
-  func mm_storeu_si128*(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_storeu_si128", x86.}
-  func mm_add_epi8*(a, b: m128i): m128i {.importc: "_mm_add_epi8", x86.}
-  func mm_add_epi16*(a, b: m128i): m128i {.importc: "_mm_add_epi16", x86.}
-  func mm_add_epi32*(a, b: m128i): m128i {.importc: "_mm_add_epi32", x86.}
-  func mm_add_epi64*(a, b: m128i): m128i {.importc: "_mm_add_epi64", x86.}
-
-  func mm_or_si128*(a, b: m128i): m128i {.importc: "_mm_or_si128", x86.}
-  func mm_and_si128*(a, b: m128i): m128i {.importc: "_mm_and_si128", x86.}
-  func mm_slli_epi64*(a: m128i, imm8: cint): m128i {.importc: "_mm_slli_epi64", x86.}
-    ## Shift 2xint64 left
-  func mm_srli_epi64*(a: m128i, imm8: cint): m128i {.importc: "_mm_srli_epi64", x86.}
-    ## Shift 2xint64 right
-  func mm_srli_epi32*(a: m128i, count: int32): m128i {.importc: "_mm_srli_epi32", x86.}
-  func mm_slli_epi32*(a: m128i, count: int32): m128i {.importc: "_mm_slli_epi32", x86.}
-
-  func mm_mullo_epi16*(a, b: m128i): m128i {.importc: "_mm_mullo_epi16", x86.}
-    ## Multiply element-wise 2 vectors of 8 16-bit ints
-    ## into intermediate 8 32-bit ints, and keep the low 16-bit parts
-
-  func mm_shuffle_epi32*(a: m128i, imm8: cint): m128i {.importc: "_mm_shuffle_epi32", x86.}
-    ## Shuffle 32-bit integers in a according to the control in imm8
-    ## Formula is in big endian representation
-    ## a = {a3, a2, a1, a0}
-    ## dst = {d3, d2, d1, d0}
-    ## imm8 = {bits76, bits54, bits32, bits10}
-    ## d0 will take the value a[bits10],
-    ## d1 the value a[bits32], and so on
-
-  func mm_mul_epu32*(a: m128i, b: m128i): m128i {.importc: "_mm_mul_epu32", x86.}
-    ## From a = {a1_hi, a1_lo, a0_hi, a0_lo} with a1 and a0 being 64-bit numbers
-    ## and   b = {b1_hi, b1_lo, b0_hi, b0_lo}
-    ##
-    ## Result = {a1_lo * b1_lo, a0_lo * b0_lo}.
-
-  func mm_set_epi32*(e3, e2, e1, e0: cint): m128i {.importc: "_mm_set_epi32", x86.}
-    ## Initialize m128i with {e3, e2, e1, e0} (big endian order)
-    ## Storing it will yield [e0, e1, e2, e3]
-
-  func mm_castps_si128*(a: m128): m128i {.importc: "_mm_castps_si128", x86.}
-    ## Cast a float32x4 vector into a 128-bit int vector with the same bit pattern
-  func mm_castsi128_ps*(a: m128i): m128 {.importc: "_mm_castsi128_ps", x86.}
-    ## Cast a 128-bit int vector into a float32x4 vector with the same bit pattern
-  func mm_cvtps_epi32*(a: m128): m128i {.importc: "_mm_cvtps_epi32", x86.}
-    ## Convert a float32x4 to an int32x4
-  func mm_cvtepi32_ps*(a: m128i): m128 {.importc: "_mm_cvtepi32_ps", x86.}
-    ## Convert an int32x4 to a float32x4
-
-  func mm_cmpgt_epi32*(a, b: m128i): m128i {.importc: "_mm_cmpgt_epi32", x86.}
-    ## Compare a greater than b element-wise:
-    ## each 32-bit lane is all ones if true, all zeroes otherwise
-
-  func mm_cvtsi128_si32*(a: m128i): cint {.importc: "_mm_cvtsi128_si32", x86.}
-    ## Copy the low part of a to int32
-
-  func mm_extract_epi16*(a: m128i, imm8: cint): cint {.importc: "_mm_extract_epi16", x86.}
-    ## Extract an int16 from a, selected with imm8,
-    ## and store it in the lower part of destination (padded with zeroes)
-
-  func mm_movemask_epi8*(a: m128i): int32 {.importc: "_mm_movemask_epi8", x86.}
-    ## Returns the most significant bit
-    ## of each 8-bit element in `a`
-
-  # ############################################################
-  #
-  #                  SSE3 - float32
-  #
-  # ############################################################
-
-  func mm_movehdup_ps*(a: m128): m128 {.importc: "_mm_movehdup_ps", x86.}
-    ## Duplicates high parts of the input
-    ## Input:
-    ##   { A0, A1, A2, A3 }
-    ## Result:
-    ##   { A1, A1, A3, A3 }
-  func mm_moveldup_ps*(a: m128): m128 {.importc: "_mm_moveldup_ps", x86.}
-    ## Duplicates low parts of the input
-    ## Input:
-    ##   { A0, A1, A2, A3 }
-    ## Result:
-    ##   { A0, A0, A2, A2 }
-
-  # ############################################################
-  #
-  #                  SSE4.1 - integer - packed
-  #
-  # ############################################################
-
-  func mm_mullo_epi32*(a, b: m128i): m128i {.importc: "_mm_mullo_epi32", x86.}
-    ## Multiply element-wise 2 vectors of 4 32-bit ints
-    ## into intermediate 4 64-bit ints, and keep the low 32-bit parts
-
-  # ############################################################
-  #
-  #                  AVX - float32 - packed
-  #
-  # ############################################################
-
-  func mm256_setzero_ps*(): m256 {.importc: "_mm256_setzero_ps", x86.}
-  func mm256_set1_ps*(a: float32): m256 {.importc: "_mm256_set1_ps", x86.}
-  func mm256_load_ps*(aligned_mem_addr: ptr float32): m256 {.importc: "_mm256_load_ps", x86.}
-  func mm256_loadu_ps*(mem_addr: ptr float32): m256 {.importc: "_mm256_loadu_ps", x86.}
-  func mm256_store_ps*(mem_addr: ptr float32, a: m256) {.importc: "_mm256_store_ps", x86.}
-  func mm256_storeu_ps*(mem_addr: ptr float32, a: m256) {.importc: "_mm256_storeu_ps", x86.}
-  func mm256_add_ps*(a, b: m256): m256 {.importc: "_mm256_add_ps", x86.}
-  func mm256_mul_ps*(a, b: m256): m256 {.importc: "_mm256_mul_ps", x86.}
-  func mm256_sub_ps*(a, b: m256): m256 {.importc: "_mm256_sub_ps", x86.}
-
-  func mm256_and_ps*(a, b: m256): m256 {.importc: "_mm256_and_ps", x86.}
-    ## Bitwise and
-  func mm256_or_ps*(a, b: m256): m256 {.importc: "_mm256_or_ps", x86.}
-    ## Bitwise or
-
-  func mm256_min_ps*(a, b: m256): m256 {.importc: "_mm256_min_ps", x86.}
-  func mm256_max_ps*(a, b: m256): m256 {.importc: "_mm256_max_ps", x86.}
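  # --- Editor's sketch (not part of the original file) -----------------
  # How the move/shuffle bindings compose into a horizontal sum of the
  # 4 float32 lanes of an m128 (uses the SSE3 mm_movehdup_ps above):
  func sum_m128(v: m128): float32 {.inline.} =
    let hi  = mm_movehl_ps(v, v)      # { A2, A3, A2, A3 }
    let lo  = mm_add_ps(v, hi)        # { A0+A2, A1+A3, _, _ }
    let odd = mm_movehdup_ps(lo)      # { A1+A3, A1+A3, _, _ }
    mm_cvtss_f32(mm_add_ss(lo, odd))  # A0+A1+A2+A3
  # ----------------------------------------------------------------------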
-  func mm256_castps256_ps128*(a: m256): m128 {.importc: "_mm256_castps256_ps128", x86.}
-    ## Returns the lower part of a m256 in a m128
-  func mm256_extractf128_ps*(v: m256, m: cint{lit}): m128 {.importc: "_mm256_extractf128_ps", x86.}
-    ## Extracts the low part (m = 0) or high part (m = 1) of a m256 into a m128
-    ## m must be a literal
-
-  # ############################################################
-  #
-  #                  AVX - float64 - packed
-  #
-  # ############################################################
-
-  func mm256_setzero_pd*(): m256d {.importc: "_mm256_setzero_pd", x86.}
-  func mm256_set1_pd*(a: float64): m256d {.importc: "_mm256_set1_pd", x86.}
-  func mm256_load_pd*(aligned_mem_addr: ptr float64): m256d {.importc: "_mm256_load_pd", x86.}
-  func mm256_loadu_pd*(mem_addr: ptr float64): m256d {.importc: "_mm256_loadu_pd", x86.}
-  func mm256_store_pd*(mem_addr: ptr float64, a: m256d) {.importc: "_mm256_store_pd", x86.}
-  func mm256_storeu_pd*(mem_addr: ptr float64, a: m256d) {.importc: "_mm256_storeu_pd", x86.}
-  func mm256_add_pd*(a, b: m256d): m256d {.importc: "_mm256_add_pd", x86.}
-  func mm256_mul_pd*(a, b: m256d): m256d {.importc: "_mm256_mul_pd", x86.}
-
-  # ############################################################
-  #
-  #                  AVX + FMA - float32/64 - packed
-  #
-  # ############################################################
-
-  func mm256_fmadd_ps*(a, b, c: m256): m256 {.importc: "_mm256_fmadd_ps", x86.}
-    ## Fused multiply-add a * b + c with a single rounding
-  func mm256_fmadd_pd*(a, b, c: m256d): m256d {.importc: "_mm256_fmadd_pd", x86.}
-    ## Fused multiply-add a * b + c with a single rounding
-
-  # ############################################################
-  #
-  #                  AVX - integers - packed
-  #
-  # ############################################################
-
-  func mm256_setzero_si256*(): m256i {.importc: "_mm256_setzero_si256", x86.}
-  func mm256_set1_epi8*(a: int8 or uint8): m256i {.importc: "_mm256_set1_epi8", x86.}
-  func mm256_set1_epi16*(a: int16 or uint16): m256i {.importc: "_mm256_set1_epi16", x86.}
-  func mm256_set1_epi32*(a: int32 or uint32): m256i {.importc: "_mm256_set1_epi32", x86.}
-  func mm256_set1_epi64x*(a: int64 or uint64): m256i {.importc: "_mm256_set1_epi64x", x86.}
-  func mm256_load_si256*(aligned_mem_addr: ptr m256i): m256i {.importc: "_mm256_load_si256", x86.}
-  func mm256_loadu_si256*(mem_addr: ptr m256i): m256i {.importc: "_mm256_loadu_si256", x86.}
-  func mm256_storeu_si256*(mem_addr: ptr m256i, a: m256i) {.importc: "_mm256_storeu_si256", x86.}
-
-  func mm256_castps_si256*(a: m256): m256i {.importc: "_mm256_castps_si256", x86.}
-    ## Cast a float32x8 vector into a 256-bit int vector with the same bit pattern
-  func mm256_castsi256_ps*(a: m256i): m256 {.importc: "_mm256_castsi256_ps", x86.}
-    ## Cast a 256-bit int vector into a float32x8 vector with the same bit pattern
-  func mm256_cvtps_epi32*(a: m256): m256i {.importc: "_mm256_cvtps_epi32", x86.}
-    ## Convert a float32x8 to an int32x8
-  func mm256_cvtepi32_ps*(a: m256i): m256 {.importc: "_mm256_cvtepi32_ps", x86.}
-    ## Convert an int32x8 to a float32x8
-
-  # ############################################################
-  #
-  #                  AVX2 - integers - packed
-  #
-  # ############################################################
-
-  func mm256_add_epi8*(a, b: m256i): m256i {.importc: "_mm256_add_epi8", x86.}
-  func mm256_add_epi16*(a, b: m256i): m256i {.importc: "_mm256_add_epi16", x86.}
-  func mm256_add_epi32*(a, b: m256i): m256i {.importc: "_mm256_add_epi32", x86.}
-  func mm256_add_epi64*(a, b: m256i): m256i {.importc: "_mm256_add_epi64", x86.}
-
-  func mm256_and_si256*(a, b: m256i): m256i {.importc: "_mm256_and_si256", x86.}
-    ## Bitwise and
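  # --- Editor's sketch (not part of the original file) -----------------
  # A dot product using the FMA binding above, reduced with
  # mm256_castps256_ps128 / mm256_extractf128_ps and the sum_m128 sketch
  # from the SSE section; assumes len is a multiple of 8.
  proc dot8(x, y: ptr UncheckedArray[float32], len: int): float32 =
    var acc = mm256_setzero_ps()
    var i = 0
    while i < len:
      acc = mm256_fmadd_ps(mm256_loadu_ps(x[][i].addr),
                           mm256_loadu_ps(y[][i].addr), acc)
      i += 8
    let lo = mm256_castps256_ps128(acc)    # lanes 0..3
    let hi = mm256_extractf128_ps(acc, 1)  # lanes 4..7
    result = sum_m128(mm_add_ps(lo, hi))
  # ----------------------------------------------------------------------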
-  func mm256_srli_epi64*(a: m256i, imm8: cint): m256i {.importc: "_mm256_srli_epi64", x86.}
-    ## Logical right shift of 4xint64
-
-  func mm256_mullo_epi16*(a, b: m256i): m256i {.importc: "_mm256_mullo_epi16", x86.}
-    ## Multiply element-wise 2 vectors of 16 16-bit ints
-    ## into intermediate 16 32-bit ints, and keep the low 16-bit parts
-
-  func mm256_mullo_epi32*(a, b: m256i): m256i {.importc: "_mm256_mullo_epi32", x86.}
-    ## Multiply element-wise 2 vectors of 8x 32-bit ints
-    ## into intermediate 8x 64-bit ints, and keep the low 32-bit parts
-
-  func mm256_shuffle_epi32*(a: m256i, imm8: cint): m256i {.importc: "_mm256_shuffle_epi32", x86.}
-    ## Shuffle 32-bit integers in a according to the control in imm8
-    ## Formula is in big endian representation
-    ##   a = {hi[a7, a6, a5, a4], lo[a3, a2, a1, a0]}
-    ##   dst = {d7, d6, d5, d4, d3, d2, d1, d0}
-    ##   imm8 = {bits76, bits54, bits32, bits10}
-    ##   d0 will refer a.lo[bits10]
-    ##   d1 will refer a.lo[bits32]
-    ##   ...
-    ##   d4 will refer a.hi[bits10]
-    ##   d5 will refer a.hi[bits32]
-
-  func mm256_mul_epu32*(a: m256i, b: m256i): m256i {.importc: "_mm256_mul_epu32", x86.}
-    ## From a = {a3_hi, a3_lo, a2_hi, a2_lo, a1_hi, a1_lo, a0_hi, a0_lo}
-    ## with a3, a2, a1, a0 being 64-bit numbers
-    ## and  b = {b3_hi, b3_lo, b2_hi, b2_lo, b1_hi, b1_lo, b0_hi, b0_lo}
-    ##
-    ## Result = {a3_lo * b3_lo, a2_lo * b2_lo, a1_lo * b1_lo, a0_lo * b0_lo}.
-    ## This is an extended precision multiplication 32x32 -> 64
-
-  func mm256_movemask_epi8*(a: m256i): int32 {.importc: "_mm256_movemask_epi8", x86.}
-    ## Returns the most significant bit
-    ## of each 8-bit element in `a`
-
-  func mm256_cmpgt_epi32*(a, b: m256i): m256i {.importc: "_mm256_cmpgt_epi32", x86.}
-    ## Compare a greater than b element-wise:
-    ## each 32-bit lane is all ones if true, all zeroes otherwise
-
-  func mm256_srli_epi32*(a: m256i, count: int32): m256i {.importc: "_mm256_srli_epi32", x86.}
-    ## Shift 8xint32 right
-  func mm256_slli_epi32*(a: m256i, count: int32): m256i {.importc: "_mm256_slli_epi32", x86.}
-    ## Shift 8xint32 left
-
-  func mm_i32gather_epi32*(m: ptr (uint32 or int32), i: m128i, s: int32): m128i {.importc: "_mm_i32gather_epi32", x86.}
-    ## Gather 4 int32: each lane k loads the int32 at byte offset i[k]*s
-    ## from m (the scale s is in bytes, 4 for contiguous int32 tables)
-  func mm256_i32gather_epi32*(m: ptr (uint32 or int32), i: m256i, s: int32): m256i {.importc: "_mm256_i32gather_epi32", x86.}
-    ## Gather 8 int32, same convention as mm_i32gather_epi32
-
-  # ############################################################
-  #
-  #                  AVX512 - float32 - packed
-  #
-  # ############################################################
-
-  func mm512_setzero_ps*(): m512 {.importc: "_mm512_setzero_ps", x86.}
-  func mm512_set1_ps*(a: float32): m512 {.importc: "_mm512_set1_ps", x86.}
-  func mm512_load_ps*(aligned_mem_addr: ptr float32): m512 {.importc: "_mm512_load_ps", x86.}
-  func mm512_loadu_ps*(mem_addr: ptr float32): m512 {.importc: "_mm512_loadu_ps", x86.}
-  func mm512_store_ps*(mem_addr: ptr float32, a: m512) {.importc: "_mm512_store_ps", x86.}
-  func mm512_storeu_ps*(mem_addr: ptr float32, a: m512) {.importc: "_mm512_storeu_ps", x86.}
-  func mm512_add_ps*(a, b: m512): m512 {.importc: "_mm512_add_ps", x86.}
-  func mm512_sub_ps*(a, b: m512): m512 {.importc: "_mm512_sub_ps", x86.}
-  func mm512_mul_ps*(a, b: m512): m512 {.importc: "_mm512_mul_ps", x86.}
-  func mm512_fmadd_ps*(a, b, c: m512): m512 {.importc: "_mm512_fmadd_ps", x86.}
-    ## Fused multiply-add a * b + c with a single rounding
-
-  func mm512_min_ps*(a, b: m512): m512 {.importc: "_mm512_min_ps", x86.}
-  func mm512_max_ps*(a, b: m512): m512 {.importc: "_mm512_max_ps", x86.}
-
-  func mm512_or_ps*(a, b: m512): m512 {.importc: "_mm512_or_ps", x86.}
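  # --- Editor's sketch (not part of the original file) -----------------
  # Gathering 8 int32 values table[idx[k]] with the AVX2 binding above;
  # the scale argument is 4 because int32 table elements are 4 bytes apart.
  proc gather8(table: ptr UncheckedArray[int32], idx: ptr m256i): m256i =
    result = mm256_i32gather_epi32(table[][0].addr, mm256_loadu_si256(idx), 4)
  # ----------------------------------------------------------------------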
-
-  # ############################################################
-  #
-  #                  AVX512 - float64 - packed
-  #
-  # ############################################################
-
-  func mm512_setzero_pd*(): m512d {.importc: "_mm512_setzero_pd", x86.}
-  func mm512_set1_pd*(a: float64): m512d {.importc: "_mm512_set1_pd", x86.}
-  func mm512_load_pd*(aligned_mem_addr: ptr float64): m512d {.importc: "_mm512_load_pd", x86.}
-  func mm512_loadu_pd*(mem_addr: ptr float64): m512d {.importc: "_mm512_loadu_pd", x86.}
-  func mm512_store_pd*(mem_addr: ptr float64, a: m512d) {.importc: "_mm512_store_pd", x86.}
-  func mm512_storeu_pd*(mem_addr: ptr float64, a: m512d) {.importc: "_mm512_storeu_pd", x86.}
-  func mm512_add_pd*(a, b: m512d): m512d {.importc: "_mm512_add_pd", x86.}
-  func mm512_mul_pd*(a, b: m512d): m512d {.importc: "_mm512_mul_pd", x86.}
-  func mm512_fmadd_pd*(a, b, c: m512d): m512d {.importc: "_mm512_fmadd_pd", x86.}
-    ## Fused multiply-add a * b + c with a single rounding
-
-  # ############################################################
-  #
-  #                  AVX512 - integers - packed
-  #
-  # ############################################################
-
-  func mm512_setzero_si512*(): m512i {.importc: "_mm512_setzero_si512", x86.}
-  func mm512_set1_epi8*(a: int8 or uint8): m512i {.importc: "_mm512_set1_epi8", x86.}
-  func mm512_set1_epi16*(a: int16 or uint16): m512i {.importc: "_mm512_set1_epi16", x86.}
-  func mm512_set1_epi32*(a: int32 or uint32): m512i {.importc: "_mm512_set1_epi32", x86.}
-  func mm512_set1_epi64*(a: int64 or uint64): m512i {.importc: "_mm512_set1_epi64", x86.}
-  func mm512_load_si512*(aligned_mem_addr: ptr SomeInteger): m512i {.importc: "_mm512_load_si512", x86.}
-  func mm512_loadu_si512*(mem_addr: ptr SomeInteger): m512i {.importc: "_mm512_loadu_si512", x86.}
-  func mm512_storeu_si512*(mem_addr: ptr SomeInteger, a: m512i) {.importc: "_mm512_storeu_si512", x86.}
-
-  func mm512_add_epi8*(a, b: m512i): m512i {.importc: "_mm512_add_epi8", x86.}
-  func mm512_add_epi16*(a, b: m512i): m512i {.importc: "_mm512_add_epi16", x86.}
-  func mm512_add_epi32*(a, b: m512i): m512i {.importc: "_mm512_add_epi32", x86.}
-  func mm512_add_epi64*(a, b: m512i): m512i {.importc: "_mm512_add_epi64", x86.}
-
-  func mm512_mullo_epi32*(a, b: m512i): m512i {.importc: "_mm512_mullo_epi32", x86.}
-    ## Multiply element-wise 2 vectors of 16 32-bit ints
-    ## into intermediate 16 64-bit ints, and keep the low 32-bit parts
-
-  func mm512_mullo_epi64*(a, b: m512i): m512i {.importc: "_mm512_mullo_epi64", x86.}
-    ## Multiply element-wise 2 vectors of 8x 64-bit ints
-    ## into intermediate 8x 128-bit ints, and keep the low 64-bit parts
-
-  func mm512_and_si512*(a, b: m512i): m512i {.importc: "_mm512_and_si512", x86.}
-    ## Bitwise and
-
-  func mm512_cmpgt_epi32_mask*(a, b: m512i): mmask16 {.importc: "_mm512_cmpgt_epi32_mask", x86.}
-    ## Compare a greater than b element-wise,
-    ## returns a 16-bit mask (1 bit per 32-bit lane)
-
-  func mm512_maskz_set1_epi32*(k: mmask16, a: cint): m512i {.importc: "_mm512_maskz_set1_epi32", x86.}
-    ## Broadcast 32-bit integer a to all elements of dst using zeromask k
-    ## (elements are zeroed out when the corresponding mask bit is not set).
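  # --- Editor's sketch (not part of the original file) -----------------
  # The AVX512 mask workflow declared above: compare into a mask register,
  # then use the mask to fill or zero lanes. Here: 1 where v > 0, else 0.
  func positive_indicator(v: m512i): m512i {.inline.} =
    let k = mm512_cmpgt_epi32_mask(v, mm512_setzero_si512())
    mm512_maskz_set1_epi32(k, 1)
  # ----------------------------------------------------------------------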
-
-  func mm512_movm_epi32*(a: mmask16): m512i {.importc: "_mm512_movm_epi32", x86.}
-    ## Set each 32-bit lane to all ones if the corresponding
-    ## mask bit is set, all zeroes otherwise
-
-  func mm512_movepi8_mask*(a: m512i): mmask64 {.importc: "_mm512_movepi8_mask", x86.}
-    ## Returns the most significant bit
-    ## of each 8-bit element in `a`
-
-  func mm512_srli_epi32*(a: m512i, count: int32): m512i {.importc: "_mm512_srli_epi32", x86.}
-    ## Shift 16xint32 right
-  func mm512_slli_epi32*(a: m512i, count: int32): m512i {.importc: "_mm512_slli_epi32", x86.}
-    ## Shift 16xint32 left
-
-  func mm512_i32gather_epi32*(i: m512i, m: ptr (uint32 or int32), s: int32): m512i {.importc: "_mm512_i32gather_epi32", x86.}
-    ## Warning ⚠: Arguments are switched compared to mm256_i32gather_epi32
-
-  func mm512_castps_si512*(a: m512): m512i {.importc: "_mm512_castps_si512", x86.}
-    ## Cast a float32x16 vector into a 512-bit int vector with the same bit pattern
-  func mm512_castsi512_ps*(a: m512i): m512 {.importc: "_mm512_castsi512_ps", x86.}
-    ## Cast a 512-bit int vector into a float32x16 vector with the same bit pattern
-  func mm512_cvtps_epi32*(a: m512): m512i {.importc: "_mm512_cvtps_epi32", x86.}
-    ## Convert a float32x16 to an int32x16
-  func mm512_cvtepi32_ps*(a: m512i): m512 {.importc: "_mm512_cvtepi32_ps", x86.}
-    ## Convert an int32x16 to a float32x16
-
-  func cvtmask64_u64*(a: mmask64): uint64 {.importc: "_cvtmask64_u64", x86.}
-    ## Convert a 64-bit mask to an unsigned 64-bit integer
diff --git a/benchmarks/matmul/nim.cfg b/benchmarks/matmul/nim.cfg
deleted file mode 100644
index a2326e6..0000000
--- a/benchmarks/matmul/nim.cfg
+++ /dev/null
@@ -1,13 +0,0 @@
-# ############################################################
-#
-#                        SIMD flags
-#
-# ############################################################
-
-gemm_ukernel_sse.always = "-msse"
-gemm_ukernel_sse2.always = "-msse2"
-gemm_ukernel_sse4_1.always = "-msse4.1"
-gemm_ukernel_avx.always = "-mavx"
-gemm_ukernel_avx_fma.always = "-mavx -mfma"
-gemm_ukernel_avx2.always = "-mavx2"
-gemm_ukernel_avx512.always = "-mavx512f -mavx512dq"
diff --git a/benchmarks/matmul/weave_gemm/README.md b/benchmarks/matmul/weave_gemm/README.md
deleted file mode 100644
index 1333ed7..0000000
--- a/benchmarks/matmul/weave_gemm/README.md
+++ /dev/null
@@ -1 +0,0 @@
-TODO