diff --git a/exts/build_dbcsr/Makefile b/exts/build_dbcsr/Makefile index f218638f7b..e09ef9eac5 100644 --- a/exts/build_dbcsr/Makefile +++ b/exts/build_dbcsr/Makefile @@ -310,16 +310,19 @@ ifeq (Darwin,$(shell uname)) else # OpenCL include directory (cl.h not installed per "opencl-headers" package) NVCC := $(shell which nvcc 2>/dev/null) - NVCC_PATH := $(if $(NVCC),$(wildcard $(dir $(NVCC))/..)) + NVCC_PATH := $(if $(NVCC),$(realpath $(dir $(NVCC))/..)) CUDA_FILE := $(wildcard $(NVCC_PATH)/../cuda/include/cuda.h) - CUDA_PATH := $(if $(CUDA_FILE),$(NVCC_PATH)/../cuda,$(NVCC_PATH)) + CUDA_PATH := $(if $(CUDA_FILE),$(abspath $(NVCC_PATH)/../cuda),$(NVCC_PATH)) CUDA_LIBS := $(if $(wildcard $(CUDA_PATH)/lib64),lib64,lib) ifneq (,$(CUDA_PATH)) CFLAGS += -I$(CUDA_PATH)/include else ifneq (,$(wildcard $(OPENCL_ROOT)/include/CL/cl.h)) CFLAGS += -I$(OPENCL_ROOT)/include - else ifneq (,$(wildcard $(OPENCL_ROOT)/include/sycl/CL/cl.h)) - CFLAGS += -I$(OPENCL_ROOT)/include/sycl + else + OPENCL_ROOT := $(abspath $(dir $(shell which icx 2>/dev/null))/..) + ifneq (,$(wildcard $(OPENCL_ROOT)/include/sycl/CL/cl.h)) + CFLAGS += -I$(OPENCL_ROOT)/include/sycl + endif endif endif endif diff --git a/src/dbm/dbm_multiply_opencl.c b/src/dbm/dbm_multiply_opencl.c index 23f7140336..9b18469f13 100644 --- a/src/dbm/dbm_multiply_opencl.c +++ b/src/dbm/dbm_multiply_opencl.c @@ -11,15 +11,26 @@ #include "dbm_multiply_gpu_kernel.h" #include "dbm_multiply_opencl.cl.h" +#if !defined(OPENCL_DBM_TLS) && 1 +#define OPENCL_DBM_TLS LIBXSMM_TLS +#endif + void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream, const int mnk_range[3][2], double alpha, int ntasks, const dbm_task_t *tasks, const double *pack_a_data, const double *pack_b_data, double *shard_c_data) { - static cl_kernel kernel = NULL; - static int ndims = 1, split = 0; + /* creating/calling kernel must be consistent across threads */ + static cl_kernel kernel_global = NULL; +#if defined(OPENCL_DBM_TLS) + static OPENCL_DBM_TLS cl_kernel kernel = NULL; +#else + cl_kernel kernel = NULL; +#endif + static int ndims = 1; static size_t wgsize[] = {0, 0, 0}; + const libxsmm_timer_tickint start = libxsmm_timer_tick(); int result = EXIT_SUCCESS, verbosity = c_dbcsr_acc_opencl_config.verbosity; cl_event event, *const perf_event = ((0 <= verbosity && 2 >= verbosity) ? NULL : &event); @@ -37,96 +48,91 @@ void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream, mnk_range[2][0] <= mnk_range[2][1]); assert(NULL != str && NULL != str->queue); assert(0 < ntasks && NULL != tasks); - /* creating/calling kernel must be consistent across threads */ - ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_main); #if defined(OPENCL_DBM_SOURCE_MULTIPLY) - if (NULL == kernel) { /* first-time check if kernel is present */ - const libxsmm_timer_tickint start = libxsmm_timer_tick(); - char params[ACC_OPENCL_BUFFERSIZE] = - "-cl-fast-relaxed-math -cl-denorms-are-zero"; - const char *const gen_env = getenv("DBM_MULTIPLY_GEN"); - const char *const xf_env = getenv("DBM_MULTIPLY_XF"); - const char *const lu_env = getenv("DBM_MULTIPLY_LU"); - const char *const bn_env = getenv("DBM_MULTIPLY_BN"); - const int gpu = - (CL_DEVICE_TYPE_GPU == c_dbcsr_acc_opencl_config.device.type); - const int gen = (NULL == gen_env ? 0 /*default*/ : atoi(gen_env)); - const int xf = (NULL == xf_env ? -1 /*default*/ : atoi(xf_env)); - const int lu = LIBXSMM_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1); - int bn = (NULL == bn_env ? 
8 : atoi(bn_env)); - const char *extensions[] = {NULL, NULL}, *flags = NULL; - size_t nextensions = sizeof(extensions) / sizeof(*extensions); - const size_t wgsize0 = c_dbcsr_acc_opencl_config.device.wgsize[0]; - const size_t wgsize1 = c_dbcsr_acc_opencl_config.device.wgsize[1]; - size_t wgsize2 = c_dbcsr_acc_opencl_config.device.wgsize[2]; - size_t offset = (0 == c_dbcsr_acc_opencl_config.debug ? strlen(params) : 0); - offset += (size_t)c_dbcsr_acc_opencl_flags_atomics( - &c_dbcsr_acc_opencl_config.device, c_dbcsr_acc_opencl_atomic_fp_64, - extensions, &nextensions, params + offset, sizeof(params) - offset); - if (2 <= gen || (0 != gen && 0 != wgsize2 /*subgroups*/ && - 2 <= *c_dbcsr_acc_opencl_config.device.std_level && - NULL != extensions[1] && - NULL != strstr(extensions[1], "cl_ext_float_atomics"))) { - offset += - (size_t)LIBXSMM_SNPRINTF(params + offset, sizeof(params) - offset, - " -DDBM_MULTIPLY_OPENCL_GEN"); - wgsize[1] = wgsize[2] = 1; - wgsize[0] = 16; - ndims = 3; - } else { - const char *const split_env = getenv("DBM_MULTIPLY_SPLIT"); - const char *const wg_env = getenv("DBM_MULTIPLY_WG"); - split = (NULL == split_env ? 1 /*default*/ : atoi(split_env)); - wgsize[0] = - (NULL == wg_env ? (1 != split ? (wgsize1 * LIBXSMM_ABS(split)) : 0) - : strtoul(wg_env, NULL, 10)); - if (0 != split && 1 != split && (bn * bn) > (int)wgsize[0]) { - wgsize[0] = bn * bn; - } - if (0 != split && 0 != wgsize2 && 0 < wgsize[0]) { /* subgroups */ - if (LIBXSMM_DELTA(wgsize[0], wgsize1) <= - LIBXSMM_DELTA(wgsize[0], wgsize2)) { /* select SG-size */ - wgsize2 = wgsize1; - } - wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize2); + if (NULL == kernel_global) { /* initial check if kernel is present */ + ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_main); + if (NULL == kernel_global) { + char params[ACC_OPENCL_BUFFERSIZE] = + "-cl-fast-relaxed-math -cl-denorms-are-zero"; + const char *const gen_env = getenv("DBM_MULTIPLY_GEN"); + const char *const xf_env = getenv("DBM_MULTIPLY_XF"); + const char *const lu_env = getenv("DBM_MULTIPLY_LU"); + const char *const bn_env = getenv("DBM_MULTIPLY_BN"); + const int gpu = + (CL_DEVICE_TYPE_GPU == c_dbcsr_acc_opencl_config.device.type); + const int gen = (NULL == gen_env ? 1 /*default*/ : atoi(gen_env)); + const int xf = (NULL == xf_env ? -1 /*default*/ : atoi(xf_env)); + const int lu = LIBXSMM_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1); + const int bn = LIBXSMM_CLMP(NULL == bn_env ? 8 : atoi(bn_env), 4, 32); + const char *extensions[] = {NULL, NULL}, *flags = NULL; + size_t nextensions = sizeof(extensions) / sizeof(*extensions); + const size_t wgsize0 = c_dbcsr_acc_opencl_config.device.wgsize[0]; + const size_t wgsize1 = c_dbcsr_acc_opencl_config.device.wgsize[1]; + size_t wgsize2 = c_dbcsr_acc_opencl_config.device.wgsize[2]; + size_t offset = + (0 == c_dbcsr_acc_opencl_config.debug ? 
strlen(params) : 0); + offset += (size_t)c_dbcsr_acc_opencl_flags_atomics( + &c_dbcsr_acc_opencl_config.device, c_dbcsr_acc_opencl_atomic_fp_64, + extensions, &nextensions, params + offset, sizeof(params) - offset); + if (2 <= gen || (0 != gen && 0 != wgsize2 /*subgroups*/ && + 2 <= *c_dbcsr_acc_opencl_config.device.std_level && + NULL != extensions[1] && + NULL != strstr(extensions[1], "cl_ext_float_atomics"))) { + offset += + (size_t)LIBXSMM_SNPRINTF(params + offset, sizeof(params) - offset, + " -DDBM_MULTIPLY_OPENCL_GEN"); + wgsize[1] = wgsize[2] = 1; + wgsize[0] = 16; + ndims = 3; } else { - wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize1); - wgsize2 = 0; + const char *const wg_env = getenv("DBM_MULTIPLY_WG"); + wgsize[0] = (NULL == wg_env ? 0 : strtoul(wg_env, NULL, 10)); + if (0 != wgsize2 && 0 < wgsize[0]) { /* subgroups */ + if (LIBXSMM_DELTA(wgsize[0], wgsize1) <= + LIBXSMM_DELTA(wgsize[0], wgsize2)) { /* select SG-size */ + wgsize2 = wgsize1; + } + wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize2); + } else { + wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize1); + wgsize2 = 0; + } + wgsize[0] = LIBXSMM_CLMP(wgsize[0], 0, wgsize0); + offset += (size_t)LIBXSMM_SNPRINTF( + params + offset, sizeof(params) - offset, + " %s -DBN=%i -DWG=%i -DSG=%i -DLU=%i", 0 != gpu ? "-DGPU" : "", bn, + (int)wgsize[0], (int)wgsize2, lu); } - wgsize[0] = LIBXSMM_CLMP(wgsize[0], 0, wgsize0); - if (NULL == bn_env && 0 != split && 1 != split && - (bn * bn) < (int)wgsize[0]) { - bn = libxsmm_isqrt2_u32(wgsize[0]); + if (0 != c_dbcsr_acc_opencl_config.device.intel && 0 < xf) { + flags = "-cl-intel-256-GRF-per-thread"; } - bn = LIBXSMM_CLMP(bn, 4, 32); - offset += (size_t)LIBXSMM_SNPRINTF( - params + offset, sizeof(params) - offset, - " %s -DSPLIT=%i -DBN=%i -DWG=%i -DSG=%i -DLU=%i", - 0 != gpu ? "-DGPU" : "", split, bn, (int)wgsize[0], (int)wgsize2, lu); - } - if (0 != c_dbcsr_acc_opencl_config.device.intel && 0 < xf) { - flags = "-cl-intel-256-GRF-per-thread"; - } - result |= (sizeof(params) > offset ? EXIT_SUCCESS : EXIT_FAILURE); - result |= c_dbcsr_acc_opencl_kernel( - 0 /*source_is_file*/, OPENCL_DBM_SOURCE_MULTIPLY, "dbm_multiply", - params, flags, NULL /*try*/, NULL /*try_ok*/, extensions, nextensions, - &kernel); - if (2 <= verbosity || 0 > verbosity) { - if (EXIT_SUCCESS == result) { - const double d = libxsmm_timer_duration(start, libxsmm_timer_tick()); - fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu); - if (0 == gen) { - fprintf(stderr, " split=%i lu=%i bn=%i", split, lu, bn); - } else { /* generated kernel */ - fprintf(stderr, " gen=%i", gen); + result |= (sizeof(params) > offset ? 
EXIT_SUCCESS : EXIT_FAILURE); + result |= c_dbcsr_acc_opencl_kernel( + 0 /*source_is_file*/, OPENCL_DBM_SOURCE_MULTIPLY, "dbm_multiply", + params, flags, NULL /*try*/, NULL /*try_ok*/, extensions, nextensions, + &kernel_global); + if (2 <= verbosity || 0 > verbosity) { + if (EXIT_SUCCESS == result) { + const double ds = libxsmm_timer_duration(start, libxsmm_timer_tick()); + fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu); + if (0 == gen) { + fprintf(stderr, " lu=%i bn=%i", lu, bn); + } else { /* generated kernel */ + fprintf(stderr, " gen=%i", gen); + } + fprintf(stderr, " wg=%i sg=%i ms=%.1f\n", (int)wgsize[0], + (int)wgsize2, 1E3 * ds); + } else { + fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel failed to generate\n"); } - fprintf(stderr, " wg=%i sg=%i ms=%.1f\n", (int)wgsize[0], (int)wgsize2, - 1E3 * d); - } else { - fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel failed to generate\n"); } + kernel = kernel_global; + } else { + kernel = clCloneKernel(kernel_global, &result); } + ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_main); + } else if (NULL == kernel) { + kernel = clCloneKernel(kernel_global, &result); } #else #error "OpenCL kernel code not found!" @@ -146,7 +152,7 @@ void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream, assert(0 == iadata && 0 == ibdata && 0 == icdata); result |= clSetKernelArg(kernel, 0, sizeof(cl_double), &alpha); result |= clSetKernelArg(kernel, 1, sizeof(cl_int), &ibatch); - if (1 < ndims) { /* generated kernel */ + if (1 < ndims) { /* DBM_MULTIPLY_GEN */ const cl_uint zero = 0; assert(0 != wgsize[1] && 0 != wgsize[1] && 0 != wgsize[2]); work_size[0] = 16; @@ -162,20 +168,10 @@ void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream, result |= clSetKernelArg(kernel, 9, sizeof(cl_uint), &zero /*C_shape0*/); } else { result |= clSetKernelArg(kernel, 2, sizeof(cl_int), &ntasks); - if (0 != split) { - if (1 == split || 0 == wgsize[0]) { - work_size[0] = work_tasks * max_m; - result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size); - if (0 < wgsize[0]) { /* fixup to be a multiple of the WG-size */ - work_size[0] = LIBXSMM_UP(work_size[0], wgsize[0]); - } - } else { - work_size[0] = work_tasks * wgsize[0]; - result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size); - } - } else { - work_size[0] = work_tasks; - result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size); + work_size[0] = work_tasks * max_m; + result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size); + if (0 < wgsize[0]) { /* fixup to be a multiple of the WG-size */ + work_size[0] = LIBXSMM_UP(work_size[0], wgsize[0]); } result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, batch.memory); result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 5, adata.memory); @@ -185,25 +181,26 @@ void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream, result |= clEnqueueNDRangeKernel( str->queue, kernel, ndims, NULL, work_size, 0 < wgsize[0] ? 
wgsize : NULL, 0 /*num_wait*/, NULL /*wait_list*/, perf_event); - if (NULL != perf_event && EXIT_SUCCESS == result) { + if (NULL != perf_event && EXIT_SUCCESS == result && + EXIT_SUCCESS == clWaitForEvents(1, perf_event)) { + const double dhost = libxsmm_timer_duration(start, libxsmm_timer_tick()); cl_ulong begin = 0, end = 0; - clWaitForEvents(1, perf_event); - result |= clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START, - sizeof(cl_ulong), &begin, NULL); - result |= clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), &end, NULL); - if (EXIT_SUCCESS == result) { - const double duration_ns = LIBXSMM_DELTA(begin, end); - const double gflops = - (max_m * mnk_range[1][1] * mnk_range[2][1] * ntasks) / duration_ns; + if (EXIT_SUCCESS == + clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START, + sizeof(cl_ulong), &begin, NULL) && + EXIT_SUCCESS == clGetEventProfilingInfo(*perf_event, + CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &end, NULL)) { + const size_t flops = max_m * mnk_range[1][1] * mnk_range[2][1] * ntasks; + const double dkrnl = 1E-9 * LIBXSMM_DELTA(begin, end); + const double dtotl = 1E+3 * LIBXSMM_MAX(dkrnl, dhost); fprintf(stderr, - "INFO ACC/LIBDBM: DBM-kernel mnk=%ix%ix%i " - "ntasks=%i gflops=%.1f ms=%.2g\n", - mnk_range[0][1], mnk_range[1][1], mnk_range[2][1], ntasks, gflops, - 1E-6 * duration_ns); + "INFO ACC/LIBDBM: DBM-kernel mnk=%ix%ix%i ntasks=%i " + "kernel_ms=%.2g total_ms=%.2g gflops=%.1f\n", + mnk_range[0][1], mnk_range[1][1], mnk_range[2][1], ntasks, dkrnl, + dtotl, 1E-6 * flops / dtotl); } } - ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_main); OFFLOAD_CHECK(result); } diff --git a/src/dbm/dbm_multiply_opencl.cl b/src/dbm/dbm_multiply_opencl.cl index 86037a544b..4186badde3 100644 --- a/src/dbm/dbm_multiply_opencl.cl +++ b/src/dbm/dbm_multiply_opencl.cl @@ -5,7 +5,7 @@ /* SPDX-License-Identifier: BSD-3-Clause */ /*----------------------------------------------------------------------------*/ #if defined(DBM_MULTIPLY_OPENCL_GEN) -#include "dbm_multiply_opencl.ir.h" +#include "dbm_multiply_opencl.irh" #else #include "../../exts/dbcsr/src/acc/opencl/common/opencl_atomics.h" #include "dbm_multiply_internal.h" @@ -29,91 +29,38 @@ #define XN(T) (SINT) X(T, n) #define XK(T) (SINT) X(T, k) -#define DBM_MULTIPLY_SHM(ALPHA, TASK, AMAT, BMAT, CMAT, SHM, WG, BM, BN) \ - do { /* matrix multiplication per work-group using shared memory */ \ - local double *restrict const ashm = (SHM); \ - local double *restrict const bshm = (SHM) + (WG); \ - const int mk = XM(TASK) * XK(TASK), kn = XK(TASK) * XN(TASK); \ - const SINT tid = (SINT)get_local_id(0); \ - /* y/s can exceed BN/BM (up to BK), and x/t is fast index (up to BM/BN) */ \ - const SINT y = tid / (BM), x = tid - y * (BM), bk = (WG) / MAX(BM, BN); \ - const SINT s = tid / (BN), t = tid - s * (BN); \ - for (SINT m0 = 0; m0 < XM(TASK); m0 += (BM)) { \ - for (SINT n0 = 0; n0 < XN(TASK); n0 += (BN)) { \ - double r = ZERO; \ - UNROLL_AUTO for (SINT k0 = 0; k0 < XK(TASK); k0 += bk) { \ - if (x < (BM) && y < bk) { /* load A-tile */ \ - const int idx = IDT(m0 + x, k0 + y, XM(TASK), XK(TASK)); \ - ashm[y * (BM) + x] = (idx < mk ? (AMAT)[XA(TASK) + idx] : ZERO); \ - } \ - if (s < bk && t < (BN)) { /* load B-tile */ \ - const int idx = IDX(k0 + s, n0 + t, XK(TASK), XN(TASK)); \ - bshm[s * (BN) + t] = (idx < kn ? 
(BMAT)[XB(TASK) + idx] : ZERO); \ - } \ - BARRIER(CLK_LOCAL_MEM_FENCE); \ - if (x < (BM) && y < (BN)) { /* multiply tiles */ \ - UNROLL_AUTO for (SINT z = 0; z < bk; ++z) { \ - r = MAD(ashm[z * (BM) + x], bshm[z * (BN) + y], r); \ - } \ - } \ - BARRIER(CLK_LOCAL_MEM_FENCE); \ - } \ - if (x < (BM) && y < (BN)) { /* flush to global */ \ - const SINT m = m0 + x, n = n0 + y; \ - if (m < XM(TASK) && n < XN(TASK)) { \ - const int idx = IDT(m, n, XM(TASK), XN(TASK)); \ - ACCUMULATE((CMAT) + XC(TASK) + idx, (ALPHA) * r); \ - } \ - } \ - } \ +#define DBM_MULTIPLY_STORE(ALPHA, TASK, CMAT, CVEC, M, N0, N1) \ + do { /* CMAT atomically accumulates CVEC */ \ + UNROLL_AUTO for (SINT n = 0; n < (N1); ++n) { /* flush to global */ \ + const int idx = IDT(M, n + (N0), XM(TASK), XN(TASK)); \ + ACCUMULATE((CMAT) + XC(TASK) + idx, (ALPHA) * (CVEC)[n]); \ + (CVEC)[n] = ZERO; /* reset */ \ } \ } while (0) -#define DBM_MULTIPLY_KERNEL(ALPHA, TASK, AMAT, BMAT, CMAT, CVEC, M, N0, N1, K, \ - BCST) \ - UNROLL_AUTO for (SINT k = 0; k < (K); ++k) { \ - const double a = (AMAT)[XA(TASK) + IDT(M, k, XM(TASK), K)]; \ - UNROLL_AUTO for (SINT n = 0; n < (N1); ++n) { \ - const double b = (BMAT)[XB(TASK) + IDX(k, n + (N0), K, XN(TASK))]; \ - (CVEC)[n] = MAD(a, BCST(b), (CVEC)[n]); \ +#define DBM_MULTIPLY_KERNEL(TASK, AMAT, BMAT, CVEC, M, N0, N1, BCST) \ + do { /* CVEC accumulates result */ \ + UNROLL_AUTO for (SINT k = 0; k < XK(TASK); ++k) { \ + const double a = (AMAT)[XA(TASK) + IDT(M, k, XM(TASK), XK(TASK))]; \ + UNROLL_AUTO for (SINT n = 0; n < (N1); ++n) { \ + const int idx = IDX(k, n + (N0), XK(TASK), XN(TASK)); \ + (CVEC)[n] = MAD(a, BCST((BMAT)[idx]), (CVEC)[n]); \ + } \ } \ - } \ - UNROLL_AUTO for (SINT n = 0; n < (N1); ++n) { /* flush to global */ \ - const int idx = IDT(M, n + (N0), XM(TASK), XN(TASK)); \ - ACCUMULATE((CMAT) + XC(TASK) + idx, (ALPHA) * (CVEC)[n]); \ - (CVEC)[n] = ZERO; /* reset */ \ - } + } while (0) -#define DBM_MULTIPLY(ALPHA, TASK, AMAT, BMAT, CMAT, M, BN, BCST) \ - do { /* DBM_MULTIPLY_KERNEL unrolled/specialized over N and K */ \ - double cvec[BN]; \ +#define DBM_MULTIPLY(ALPHA, TASK, AMAT, BMAT, CMAT, CVEC, M, BN, BCST) \ + do { /* DBM_MULTIPLY_KERNEL specialized over N */ \ SINT n0 = 0; \ - UNROLL_AUTO for (SINT n = 0; n < (BN); ++n) { cvec[n] = ZERO; } \ + UNROLL_AUTO for (SINT n = 0; n < (BN); ++n) { (CVEC)[n] = ZERO; } \ if ((BN) <= XN(TASK)) { \ - if (1 < XK(TASK)) { \ - UNROLL_OUTER(1) for (; (n0 + (BN)) <= XN(TASK); n0 += (BN)) { \ - DBM_MULTIPLY_KERNEL(ALPHA, TASK, AMAT, BMAT, CMAT, cvec, M, n0, BN, \ - XK(TASK), BCST); \ - } \ - } else { /* K = 1 */ \ - UNROLL_OUTER(1) for (; (n0 + (BN)) <= XN(TASK); n0 += (BN)) { \ - DBM_MULTIPLY_KERNEL(ALPHA, TASK, AMAT, BMAT, CMAT, cvec, M, n0, BN, \ - 1, BCST); \ - } \ + UNROLL_OUTER(1) for (; (n0 + (BN)) <= XN(TASK); n0 += (BN)) { \ + DBM_MULTIPLY_KERNEL(TASK, AMAT, BMAT, CVEC, M, n0, BN, BCST); \ + DBM_MULTIPLY_STORE(ALPHA, TASK, CMAT, CVEC, M, n0, BN); \ } \ - } else if (1 != XK(TASK)) { /* N < BN */ \ - DBM_MULTIPLY_KERNEL(ALPHA, TASK, AMAT, BMAT, CMAT, cvec, M, 0, 1, \ - XK(TASK), BCST); \ - n0 = 1; \ - } else { /* N < BN, K = 1 */ \ - DBM_MULTIPLY_KERNEL(ALPHA, TASK, AMAT, BMAT, CMAT, cvec, M, 0, 1, 1, \ - BCST); \ - n0 = 1; \ - } \ - /*if (n0 < XN(TASK))*/ { /* handle remainder */ \ - DBM_MULTIPLY_KERNEL(ALPHA, TASK, AMAT, BMAT, CMAT, cvec, M, n0, \ - XN(TASK) - n0, XK(TASK), BCST); \ } \ + DBM_MULTIPLY_KERNEL(TASK, AMAT, BMAT, CVEC, M, n0, XN(TASK) - n0, BCST); \ + DBM_MULTIPLY_STORE(ALPHA, TASK, CMAT, CVEC, M, n0, XN(TASK) - n0); \ } while 
(0) #if defined(WG) && (0 < WG) @@ -126,81 +73,26 @@ kernel void dbm_multiply(double alpha, int itask, int ntasks, int size, global const dbm_task_t *tasks, global const double *restrict amat, global const double *restrict bmat, global double *restrict cmat) { -#if defined(SPLIT) && (1 < SPLIT) && defined(WG) && (0 < WG) - local double shm[WG * 2]; - global const dbm_task_t *const task = &tasks[itask + get_group_id(0)]; - const SINT rmin = MIN(XM(task), XN(task)), rmax = MAX(XM(task), XN(task)); - if ((rmax - rmin) <= BN) { - if ((rmin * 4) < BN) { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN / 4, BN / 4); - } else if ((rmin * 2) < BN) { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN / 2, BN / 2); - } else { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN, BN); - } - } else if (XM(task) <= XN(task)) { - const SINT r1 = BLR(XM(task), BN); - const SINT r2 = BLR(XM(task), BN / 2) * 2; - const SINT r3 = BLR(XM(task), BN / 4) * 4; - if (r1 <= r2) { - if (r1 <= r3) { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN, BN); - } else { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN / 4, - BN * 4); - } - } else if (r2 <= r3) { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN / 2, BN * 2); - } else { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN / 4, BN * 4); - } - } else { - const SINT r1 = BLR(XN(task), BN); - const SINT r2 = BLR(XN(task), BN / 2) * 2; - const SINT r3 = BLR(XN(task), BN / 4) * 4; - if (r1 <= r2) { - if (r1 <= r3) { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN, BN); - } else { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN * 4, - BN / 4); - } - } else if (r2 <= r3) { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN * 2, BN / 2); - } else { - DBM_MULTIPLY_SHM(alpha, task, amat, bmat, cmat, shm, WG, BN * 4, BN / 4); - } - } -#elif defined(SPLIT) && (0 != SPLIT) const int i = (int)get_global_id(0); #if defined(BCST_WG) if (i < size) #endif - { /* DBM_MULTIPLY_SPLIT */ + { const int max_m = size / ntasks, tid = i / max_m; const SINT m = i - tid * max_m; global const dbm_task_t *const task = &tasks[itask + tid]; if (m < XM(task)) { /* valid task */ + double cvec[BN]; + bmat += XB(task); #if defined(BCST_WG) if (XM(task) <= XN(task)) { /* BCST_WG to broadcast B-values */ - DBM_MULTIPLY(alpha, task, amat, bmat, cmat, m, BN, BCST_WG); + DBM_MULTIPLY(alpha, task, amat, bmat, cmat, cvec, m, BN, BCST_WG); } else #endif { - DBM_MULTIPLY(alpha, task, amat, bmat, cmat, m, BN, BCST_NO); + DBM_MULTIPLY(alpha, task, amat, bmat, cmat, cvec, m, BN, BCST_NO); } } } -#else -#if defined(BCST_WG) - if (get_global_id(0) < size) -#endif - { /* full matrix multiplication per work-item (thread) */ - global const dbm_task_t *const task = &tasks[itask + get_global_id(0)]; - UNROLL_OUTER(1) for (SINT m = 0; m < XM(task); ++m) { - DBM_MULTIPLY(alpha, task, amat, bmat, cmat, m, BN, BCST_NO); - } - } -#endif } #endif diff --git a/src/dbm/dbm_multiply_opencl.ir b/src/dbm/dbm_multiply_opencl.ir new file mode 100644 index 0000000000..84521a0928 --- /dev/null +++ b/src/dbm/dbm_multiply_opencl.ir @@ -0,0 +1,42 @@ +func @dbm_multiply(%alpha: f64, %itask: i32, + %tasks: memref, + %A: memref, + %B: memref, + %C: memref) + work_group_size(16, 1) { + %gid = group_id + %itask_idx = cast %itask : i32 -> index + ; Compute task id + %tid = arith.add %itask_idx, %gid : index + + ; Load task struct + %iM = load %tasks[0, %tid] : memref + %iN = load %tasks[1, %tid] : memref + %iK = load 
%tasks[2, %tid] : memref + %ioffset_a = load %tasks[3, %tid] : memref + %ioffset_b = load %tasks[4, %tid] : memref + %ioffset_c = load %tasks[5, %tid] : memref + + ; Cast task struct to index type + %M = cast %iM : i32 -> index + %N = cast %iN : i32 -> index + %K = cast %iK : i32 -> index + %offset_a = cast %ioffset_a : i32 -> index + %offset_b = cast %ioffset_b : i32 -> index + %offset_c = cast %ioffset_c : i32 -> index + + ; Get view on small matrices + %MK = arith.mul %M, %K : index + %KN = arith.mul %K, %N : index + %MN = arith.mul %M, %N : index + %av = subview %A[%offset_a:%MK] : memref + %bv = subview %B[%offset_b:%KN] : memref + %cv = subview %C[%offset_c:%MN] : memref + %a = expand %av[0->%M x %K] : memref + %b = expand %bv[0->%N x %K] : memref + %c = expand %cv[0->%M x %N] : memref + + ; GEMM + gemm.n.t.atomic %alpha, %a, %b, 1.0, %c + : f64, memref, memref, f64, memref +} diff --git a/src/dbm/dbm_multiply_opencl.irh b/src/dbm/dbm_multiply_opencl.irh new file mode 100644 index 0000000000..b55482cd0e --- /dev/null +++ b/src/dbm/dbm_multiply_opencl.irh @@ -0,0 +1,1033 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +void gemm_atomic_f64f64f64f64f64_An_Bt_Md_Nd_Kd_Astride1_d_Bstride1_d_Cstride1_d_alphad_beta3ff0000000000000( + long M, long N, long K, double alpha, global double *A, long A_stride, + long A_stride1, global double *B, long B_stride, long B_stride1, + double beta, global double *C, long C_stride, long C_stride1) { + uint m = get_sub_group_local_id(); + double c[16]; + uint sg_n = get_sub_group_id(); + uint blocks = 1 + (N - 1) / 16u; + blocks = 1 + (blocks - 1); + uint bs = N / blocks; + uint bs_1 = bs + 1; + uint rem = N % blocks; + uint blck; + __attribute__((opencl_unroll_hint(1))) for (blck = bs_1 * sg_n; + blck < bs_1 * rem; blck += bs_1) { + global double *Bb = B + blck; + uint sg_m = 0; + uint blocks1 = M / 16u; + uint rem1 = M % 16u; + uint blck1; + __attribute__((opencl_unroll_hint(1))) for (blck1 = 16u * sg_m; + blck1 < 16u * blocks1; + blck1 += 16u) { + global double *Ab = A + blck1; + global double *Ab1 = Ab; + global double *Bb1 = Bb; + c[0] = 0x0p+0; + c[1] = 0x0p+0; + c[2] = 0x0p+0; + c[3] = 0x0p+0; + c[4] = 0x0p+0; + c[5] = 0x0p+0; + c[6] = 0x0p+0; + c[7] = 0x0p+0; + c[8] = 0x0p+0; + c[9] = 0x0p+0; + c[10] = 0x0p+0; + c[11] = 0x0p+0; + c[12] = 0x0p+0; + c[13] = 0x0p+0; + c[14] = 0x0p+0; + c[15] = 0x0p+0; + uint KmultipleKb = K / 8 * 8; + __attribute__((opencl_unroll_hint(1))) for (short kb = 0; + kb < KmultipleKb; kb += 8) { + double a[8]; + a[0] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[1] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[2] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[3] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[4] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[5] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[6] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[7] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + double b[8]; + b[0] = + get_sub_group_local_id() < bs_1 ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[1] = + get_sub_group_local_id() < bs_1 ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[2] = + get_sub_group_local_id() < bs_1 ? 
Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[3] = + get_sub_group_local_id() < bs_1 ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[4] = + get_sub_group_local_id() < bs_1 ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[5] = + get_sub_group_local_id() < bs_1 ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[6] = + get_sub_group_local_id() < bs_1 ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[7] = + get_sub_group_local_id() < bs_1 ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + c[0] = fma(a[0], sub_group_broadcast(b[0], 0), c[0]); + c[1] = fma(a[0], sub_group_broadcast(b[0], 1), c[1]); + c[2] = fma(a[0], sub_group_broadcast(b[0], 2), c[2]); + c[3] = fma(a[0], sub_group_broadcast(b[0], 3), c[3]); + c[0] = fma(a[1], sub_group_broadcast(b[1], 0), c[0]); + c[1] = fma(a[1], sub_group_broadcast(b[1], 1), c[1]); + c[2] = fma(a[1], sub_group_broadcast(b[1], 2), c[2]); + c[3] = fma(a[1], sub_group_broadcast(b[1], 3), c[3]); + c[0] = fma(a[2], sub_group_broadcast(b[2], 0), c[0]); + c[1] = fma(a[2], sub_group_broadcast(b[2], 1), c[1]); + c[2] = fma(a[2], sub_group_broadcast(b[2], 2), c[2]); + c[3] = fma(a[2], sub_group_broadcast(b[2], 3), c[3]); + c[0] = fma(a[3], sub_group_broadcast(b[3], 0), c[0]); + c[1] = fma(a[3], sub_group_broadcast(b[3], 1), c[1]); + c[2] = fma(a[3], sub_group_broadcast(b[3], 2), c[2]); + c[3] = fma(a[3], sub_group_broadcast(b[3], 3), c[3]); + c[0] = fma(a[4], sub_group_broadcast(b[4], 0), c[0]); + c[1] = fma(a[4], sub_group_broadcast(b[4], 1), c[1]); + c[2] = fma(a[4], sub_group_broadcast(b[4], 2), c[2]); + c[3] = fma(a[4], sub_group_broadcast(b[4], 3), c[3]); + c[0] = fma(a[5], sub_group_broadcast(b[5], 0), c[0]); + c[1] = fma(a[5], sub_group_broadcast(b[5], 1), c[1]); + c[2] = fma(a[5], sub_group_broadcast(b[5], 2), c[2]); + c[3] = fma(a[5], sub_group_broadcast(b[5], 3), c[3]); + c[0] = fma(a[6], sub_group_broadcast(b[6], 0), c[0]); + c[1] = fma(a[6], sub_group_broadcast(b[6], 1), c[1]); + c[2] = fma(a[6], sub_group_broadcast(b[6], 2), c[2]); + c[3] = fma(a[6], sub_group_broadcast(b[6], 3), c[3]); + c[0] = fma(a[7], sub_group_broadcast(b[7], 0), c[0]); + c[1] = fma(a[7], sub_group_broadcast(b[7], 1), c[1]); + c[2] = fma(a[7], sub_group_broadcast(b[7], 2), c[2]); + c[3] = fma(a[7], sub_group_broadcast(b[7], 3), c[3]); + c[4] = fma(a[0], sub_group_broadcast(b[0], 4), c[4]); + c[5] = fma(a[0], sub_group_broadcast(b[0], 5), c[5]); + c[6] = fma(a[0], sub_group_broadcast(b[0], 6), c[6]); + c[7] = fma(a[0], sub_group_broadcast(b[0], 7), c[7]); + c[4] = fma(a[1], sub_group_broadcast(b[1], 4), c[4]); + c[5] = fma(a[1], sub_group_broadcast(b[1], 5), c[5]); + c[6] = fma(a[1], sub_group_broadcast(b[1], 6), c[6]); + c[7] = fma(a[1], sub_group_broadcast(b[1], 7), c[7]); + c[4] = fma(a[2], sub_group_broadcast(b[2], 4), c[4]); + c[5] = fma(a[2], sub_group_broadcast(b[2], 5), c[5]); + c[6] = fma(a[2], sub_group_broadcast(b[2], 6), c[6]); + c[7] = fma(a[2], sub_group_broadcast(b[2], 7), c[7]); + c[4] = fma(a[3], sub_group_broadcast(b[3], 4), c[4]); + c[5] = fma(a[3], sub_group_broadcast(b[3], 5), c[5]); + c[6] = fma(a[3], sub_group_broadcast(b[3], 6), c[6]); + c[7] = fma(a[3], sub_group_broadcast(b[3], 7), c[7]); + c[4] = fma(a[4], sub_group_broadcast(b[4], 4), c[4]); + c[5] = fma(a[4], sub_group_broadcast(b[4], 5), c[5]); + c[6] = fma(a[4], sub_group_broadcast(b[4], 6), c[6]); + c[7] = fma(a[4], sub_group_broadcast(b[4], 7), c[7]); + c[4] = fma(a[5], sub_group_broadcast(b[5], 4), c[4]); + c[5] = fma(a[5], 
sub_group_broadcast(b[5], 5), c[5]); + c[6] = fma(a[5], sub_group_broadcast(b[5], 6), c[6]); + c[7] = fma(a[5], sub_group_broadcast(b[5], 7), c[7]); + c[4] = fma(a[6], sub_group_broadcast(b[6], 4), c[4]); + c[5] = fma(a[6], sub_group_broadcast(b[6], 5), c[5]); + c[6] = fma(a[6], sub_group_broadcast(b[6], 6), c[6]); + c[7] = fma(a[6], sub_group_broadcast(b[6], 7), c[7]); + c[4] = fma(a[7], sub_group_broadcast(b[7], 4), c[4]); + c[5] = fma(a[7], sub_group_broadcast(b[7], 5), c[5]); + c[6] = fma(a[7], sub_group_broadcast(b[7], 6), c[6]); + c[7] = fma(a[7], sub_group_broadcast(b[7], 7), c[7]); + c[8] = fma(a[0], sub_group_broadcast(b[0], 8), c[8]); + c[9] = fma(a[0], sub_group_broadcast(b[0], 9), c[9]); + c[10] = fma(a[0], sub_group_broadcast(b[0], 10), c[10]); + c[11] = fma(a[0], sub_group_broadcast(b[0], 11), c[11]); + c[8] = fma(a[1], sub_group_broadcast(b[1], 8), c[8]); + c[9] = fma(a[1], sub_group_broadcast(b[1], 9), c[9]); + c[10] = fma(a[1], sub_group_broadcast(b[1], 10), c[10]); + c[11] = fma(a[1], sub_group_broadcast(b[1], 11), c[11]); + c[8] = fma(a[2], sub_group_broadcast(b[2], 8), c[8]); + c[9] = fma(a[2], sub_group_broadcast(b[2], 9), c[9]); + c[10] = fma(a[2], sub_group_broadcast(b[2], 10), c[10]); + c[11] = fma(a[2], sub_group_broadcast(b[2], 11), c[11]); + c[8] = fma(a[3], sub_group_broadcast(b[3], 8), c[8]); + c[9] = fma(a[3], sub_group_broadcast(b[3], 9), c[9]); + c[10] = fma(a[3], sub_group_broadcast(b[3], 10), c[10]); + c[11] = fma(a[3], sub_group_broadcast(b[3], 11), c[11]); + c[8] = fma(a[4], sub_group_broadcast(b[4], 8), c[8]); + c[9] = fma(a[4], sub_group_broadcast(b[4], 9), c[9]); + c[10] = fma(a[4], sub_group_broadcast(b[4], 10), c[10]); + c[11] = fma(a[4], sub_group_broadcast(b[4], 11), c[11]); + c[8] = fma(a[5], sub_group_broadcast(b[5], 8), c[8]); + c[9] = fma(a[5], sub_group_broadcast(b[5], 9), c[9]); + c[10] = fma(a[5], sub_group_broadcast(b[5], 10), c[10]); + c[11] = fma(a[5], sub_group_broadcast(b[5], 11), c[11]); + c[8] = fma(a[6], sub_group_broadcast(b[6], 8), c[8]); + c[9] = fma(a[6], sub_group_broadcast(b[6], 9), c[9]); + c[10] = fma(a[6], sub_group_broadcast(b[6], 10), c[10]); + c[11] = fma(a[6], sub_group_broadcast(b[6], 11), c[11]); + c[8] = fma(a[7], sub_group_broadcast(b[7], 8), c[8]); + c[9] = fma(a[7], sub_group_broadcast(b[7], 9), c[9]); + c[10] = fma(a[7], sub_group_broadcast(b[7], 10), c[10]); + c[11] = fma(a[7], sub_group_broadcast(b[7], 11), c[11]); + c[12] = fma(a[0], sub_group_broadcast(b[0], 12), c[12]); + c[13] = fma(a[0], sub_group_broadcast(b[0], 13), c[13]); + c[14] = fma(a[0], sub_group_broadcast(b[0], 14), c[14]); + c[15] = fma(a[0], sub_group_broadcast(b[0], 15), c[15]); + c[12] = fma(a[1], sub_group_broadcast(b[1], 12), c[12]); + c[13] = fma(a[1], sub_group_broadcast(b[1], 13), c[13]); + c[14] = fma(a[1], sub_group_broadcast(b[1], 14), c[14]); + c[15] = fma(a[1], sub_group_broadcast(b[1], 15), c[15]); + c[12] = fma(a[2], sub_group_broadcast(b[2], 12), c[12]); + c[13] = fma(a[2], sub_group_broadcast(b[2], 13), c[13]); + c[14] = fma(a[2], sub_group_broadcast(b[2], 14), c[14]); + c[15] = fma(a[2], sub_group_broadcast(b[2], 15), c[15]); + c[12] = fma(a[3], sub_group_broadcast(b[3], 12), c[12]); + c[13] = fma(a[3], sub_group_broadcast(b[3], 13), c[13]); + c[14] = fma(a[3], sub_group_broadcast(b[3], 14), c[14]); + c[15] = fma(a[3], sub_group_broadcast(b[3], 15), c[15]); + c[12] = fma(a[4], sub_group_broadcast(b[4], 12), c[12]); + c[13] = fma(a[4], sub_group_broadcast(b[4], 13), c[13]); + c[14] = fma(a[4], sub_group_broadcast(b[4], 14), 
c[14]); + c[15] = fma(a[4], sub_group_broadcast(b[4], 15), c[15]); + c[12] = fma(a[5], sub_group_broadcast(b[5], 12), c[12]); + c[13] = fma(a[5], sub_group_broadcast(b[5], 13), c[13]); + c[14] = fma(a[5], sub_group_broadcast(b[5], 14), c[14]); + c[15] = fma(a[5], sub_group_broadcast(b[5], 15), c[15]); + c[12] = fma(a[6], sub_group_broadcast(b[6], 12), c[12]); + c[13] = fma(a[6], sub_group_broadcast(b[6], 13), c[13]); + c[14] = fma(a[6], sub_group_broadcast(b[6], 14), c[14]); + c[15] = fma(a[6], sub_group_broadcast(b[6], 15), c[15]); + c[12] = fma(a[7], sub_group_broadcast(b[7], 12), c[12]); + c[13] = fma(a[7], sub_group_broadcast(b[7], 13), c[13]); + c[14] = fma(a[7], sub_group_broadcast(b[7], 14), c[14]); + c[15] = fma(a[7], sub_group_broadcast(b[7], 15), c[15]); + } + if (K - KmultipleKb > 0) { + __attribute__((opencl_unroll_hint(1))) for (short kb = KmultipleKb; + kb < K; kb += 1) { + double a[1]; + a[0] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + double b[1]; + b[0] = get_sub_group_local_id() < bs_1 ? Bb1[get_sub_group_local_id()] + : 0; + Bb1 += B_stride1; + c[0] = fma(a[0], sub_group_broadcast(b[0], 0), c[0]); + c[1] = fma(a[0], sub_group_broadcast(b[0], 1), c[1]); + c[2] = fma(a[0], sub_group_broadcast(b[0], 2), c[2]); + c[3] = fma(a[0], sub_group_broadcast(b[0], 3), c[3]); + c[4] = fma(a[0], sub_group_broadcast(b[0], 4), c[4]); + c[5] = fma(a[0], sub_group_broadcast(b[0], 5), c[5]); + c[6] = fma(a[0], sub_group_broadcast(b[0], 6), c[6]); + c[7] = fma(a[0], sub_group_broadcast(b[0], 7), c[7]); + c[8] = fma(a[0], sub_group_broadcast(b[0], 8), c[8]); + c[9] = fma(a[0], sub_group_broadcast(b[0], 9), c[9]); + c[10] = fma(a[0], sub_group_broadcast(b[0], 10), c[10]); + c[11] = fma(a[0], sub_group_broadcast(b[0], 11), c[11]); + c[12] = fma(a[0], sub_group_broadcast(b[0], 12), c[12]); + c[13] = fma(a[0], sub_group_broadcast(b[0], 13), c[13]); + c[14] = fma(a[0], sub_group_broadcast(b[0], 14), c[14]); + c[15] = fma(a[0], sub_group_broadcast(b[0], 15), c[15]); + } + } + global double *Cb = C + (blck1 + C_stride1 * blck); + for (short n = 0; n < bs_1; ++n) { + atomic_fetch_add_explicit( + (global volatile atomic_double *)(Cb + get_sub_group_local_id()), + alpha * c[n], memory_order_relaxed, memory_scope_work_group); + Cb += C_stride1; + } + } + if (rem1 > 0) { + blck1 = blocks1 * 16u; + if (sg_m == 0u) { + global double *Ab = A + blck1; + global double *Ab1 = Ab; + global double *Bb2 = Bb; + c[0] = 0x0p+0; + c[1] = 0x0p+0; + c[2] = 0x0p+0; + c[3] = 0x0p+0; + c[4] = 0x0p+0; + c[5] = 0x0p+0; + c[6] = 0x0p+0; + c[7] = 0x0p+0; + c[8] = 0x0p+0; + c[9] = 0x0p+0; + c[10] = 0x0p+0; + c[11] = 0x0p+0; + c[12] = 0x0p+0; + c[13] = 0x0p+0; + c[14] = 0x0p+0; + c[15] = 0x0p+0; + uint KmultipleKb = K / 8 * 8; + __attribute__((opencl_unroll_hint(1))) for (short kb = 0; + kb < KmultipleKb; kb += 8) { + double a[8]; + a[0] = get_sub_group_local_id() < rem1 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[1] = get_sub_group_local_id() < rem1 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[2] = get_sub_group_local_id() < rem1 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[3] = get_sub_group_local_id() < rem1 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[4] = get_sub_group_local_id() < rem1 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[5] = get_sub_group_local_id() < rem1 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[6] = get_sub_group_local_id() < rem1 ? 
Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[7] = get_sub_group_local_id() < rem1 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + double b[8]; + b[0] = get_sub_group_local_id() < bs_1 ? Bb2[get_sub_group_local_id()] + : 0; + Bb2 += B_stride1; + b[1] = get_sub_group_local_id() < bs_1 ? Bb2[get_sub_group_local_id()] + : 0; + Bb2 += B_stride1; + b[2] = get_sub_group_local_id() < bs_1 ? Bb2[get_sub_group_local_id()] + : 0; + Bb2 += B_stride1; + b[3] = get_sub_group_local_id() < bs_1 ? Bb2[get_sub_group_local_id()] + : 0; + Bb2 += B_stride1; + b[4] = get_sub_group_local_id() < bs_1 ? Bb2[get_sub_group_local_id()] + : 0; + Bb2 += B_stride1; + b[5] = get_sub_group_local_id() < bs_1 ? Bb2[get_sub_group_local_id()] + : 0; + Bb2 += B_stride1; + b[6] = get_sub_group_local_id() < bs_1 ? Bb2[get_sub_group_local_id()] + : 0; + Bb2 += B_stride1; + b[7] = get_sub_group_local_id() < bs_1 ? Bb2[get_sub_group_local_id()] + : 0; + Bb2 += B_stride1; + c[0] = fma(a[0], sub_group_broadcast(b[0], 0), c[0]); + c[1] = fma(a[0], sub_group_broadcast(b[0], 1), c[1]); + c[2] = fma(a[0], sub_group_broadcast(b[0], 2), c[2]); + c[3] = fma(a[0], sub_group_broadcast(b[0], 3), c[3]); + c[0] = fma(a[1], sub_group_broadcast(b[1], 0), c[0]); + c[1] = fma(a[1], sub_group_broadcast(b[1], 1), c[1]); + c[2] = fma(a[1], sub_group_broadcast(b[1], 2), c[2]); + c[3] = fma(a[1], sub_group_broadcast(b[1], 3), c[3]); + c[0] = fma(a[2], sub_group_broadcast(b[2], 0), c[0]); + c[1] = fma(a[2], sub_group_broadcast(b[2], 1), c[1]); + c[2] = fma(a[2], sub_group_broadcast(b[2], 2), c[2]); + c[3] = fma(a[2], sub_group_broadcast(b[2], 3), c[3]); + c[0] = fma(a[3], sub_group_broadcast(b[3], 0), c[0]); + c[1] = fma(a[3], sub_group_broadcast(b[3], 1), c[1]); + c[2] = fma(a[3], sub_group_broadcast(b[3], 2), c[2]); + c[3] = fma(a[3], sub_group_broadcast(b[3], 3), c[3]); + c[0] = fma(a[4], sub_group_broadcast(b[4], 0), c[0]); + c[1] = fma(a[4], sub_group_broadcast(b[4], 1), c[1]); + c[2] = fma(a[4], sub_group_broadcast(b[4], 2), c[2]); + c[3] = fma(a[4], sub_group_broadcast(b[4], 3), c[3]); + c[0] = fma(a[5], sub_group_broadcast(b[5], 0), c[0]); + c[1] = fma(a[5], sub_group_broadcast(b[5], 1), c[1]); + c[2] = fma(a[5], sub_group_broadcast(b[5], 2), c[2]); + c[3] = fma(a[5], sub_group_broadcast(b[5], 3), c[3]); + c[0] = fma(a[6], sub_group_broadcast(b[6], 0), c[0]); + c[1] = fma(a[6], sub_group_broadcast(b[6], 1), c[1]); + c[2] = fma(a[6], sub_group_broadcast(b[6], 2), c[2]); + c[3] = fma(a[6], sub_group_broadcast(b[6], 3), c[3]); + c[0] = fma(a[7], sub_group_broadcast(b[7], 0), c[0]); + c[1] = fma(a[7], sub_group_broadcast(b[7], 1), c[1]); + c[2] = fma(a[7], sub_group_broadcast(b[7], 2), c[2]); + c[3] = fma(a[7], sub_group_broadcast(b[7], 3), c[3]); + c[4] = fma(a[0], sub_group_broadcast(b[0], 4), c[4]); + c[5] = fma(a[0], sub_group_broadcast(b[0], 5), c[5]); + c[6] = fma(a[0], sub_group_broadcast(b[0], 6), c[6]); + c[7] = fma(a[0], sub_group_broadcast(b[0], 7), c[7]); + c[4] = fma(a[1], sub_group_broadcast(b[1], 4), c[4]); + c[5] = fma(a[1], sub_group_broadcast(b[1], 5), c[5]); + c[6] = fma(a[1], sub_group_broadcast(b[1], 6), c[6]); + c[7] = fma(a[1], sub_group_broadcast(b[1], 7), c[7]); + c[4] = fma(a[2], sub_group_broadcast(b[2], 4), c[4]); + c[5] = fma(a[2], sub_group_broadcast(b[2], 5), c[5]); + c[6] = fma(a[2], sub_group_broadcast(b[2], 6), c[6]); + c[7] = fma(a[2], sub_group_broadcast(b[2], 7), c[7]); + c[4] = fma(a[3], sub_group_broadcast(b[3], 4), c[4]); + c[5] = fma(a[3], sub_group_broadcast(b[3], 5), c[5]); + c[6] 
= fma(a[3], sub_group_broadcast(b[3], 6), c[6]); + c[7] = fma(a[3], sub_group_broadcast(b[3], 7), c[7]); + c[4] = fma(a[4], sub_group_broadcast(b[4], 4), c[4]); + c[5] = fma(a[4], sub_group_broadcast(b[4], 5), c[5]); + c[6] = fma(a[4], sub_group_broadcast(b[4], 6), c[6]); + c[7] = fma(a[4], sub_group_broadcast(b[4], 7), c[7]); + c[4] = fma(a[5], sub_group_broadcast(b[5], 4), c[4]); + c[5] = fma(a[5], sub_group_broadcast(b[5], 5), c[5]); + c[6] = fma(a[5], sub_group_broadcast(b[5], 6), c[6]); + c[7] = fma(a[5], sub_group_broadcast(b[5], 7), c[7]); + c[4] = fma(a[6], sub_group_broadcast(b[6], 4), c[4]); + c[5] = fma(a[6], sub_group_broadcast(b[6], 5), c[5]); + c[6] = fma(a[6], sub_group_broadcast(b[6], 6), c[6]); + c[7] = fma(a[6], sub_group_broadcast(b[6], 7), c[7]); + c[4] = fma(a[7], sub_group_broadcast(b[7], 4), c[4]); + c[5] = fma(a[7], sub_group_broadcast(b[7], 5), c[5]); + c[6] = fma(a[7], sub_group_broadcast(b[7], 6), c[6]); + c[7] = fma(a[7], sub_group_broadcast(b[7], 7), c[7]); + c[8] = fma(a[0], sub_group_broadcast(b[0], 8), c[8]); + c[9] = fma(a[0], sub_group_broadcast(b[0], 9), c[9]); + c[10] = fma(a[0], sub_group_broadcast(b[0], 10), c[10]); + c[11] = fma(a[0], sub_group_broadcast(b[0], 11), c[11]); + c[8] = fma(a[1], sub_group_broadcast(b[1], 8), c[8]); + c[9] = fma(a[1], sub_group_broadcast(b[1], 9), c[9]); + c[10] = fma(a[1], sub_group_broadcast(b[1], 10), c[10]); + c[11] = fma(a[1], sub_group_broadcast(b[1], 11), c[11]); + c[8] = fma(a[2], sub_group_broadcast(b[2], 8), c[8]); + c[9] = fma(a[2], sub_group_broadcast(b[2], 9), c[9]); + c[10] = fma(a[2], sub_group_broadcast(b[2], 10), c[10]); + c[11] = fma(a[2], sub_group_broadcast(b[2], 11), c[11]); + c[8] = fma(a[3], sub_group_broadcast(b[3], 8), c[8]); + c[9] = fma(a[3], sub_group_broadcast(b[3], 9), c[9]); + c[10] = fma(a[3], sub_group_broadcast(b[3], 10), c[10]); + c[11] = fma(a[3], sub_group_broadcast(b[3], 11), c[11]); + c[8] = fma(a[4], sub_group_broadcast(b[4], 8), c[8]); + c[9] = fma(a[4], sub_group_broadcast(b[4], 9), c[9]); + c[10] = fma(a[4], sub_group_broadcast(b[4], 10), c[10]); + c[11] = fma(a[4], sub_group_broadcast(b[4], 11), c[11]); + c[8] = fma(a[5], sub_group_broadcast(b[5], 8), c[8]); + c[9] = fma(a[5], sub_group_broadcast(b[5], 9), c[9]); + c[10] = fma(a[5], sub_group_broadcast(b[5], 10), c[10]); + c[11] = fma(a[5], sub_group_broadcast(b[5], 11), c[11]); + c[8] = fma(a[6], sub_group_broadcast(b[6], 8), c[8]); + c[9] = fma(a[6], sub_group_broadcast(b[6], 9), c[9]); + c[10] = fma(a[6], sub_group_broadcast(b[6], 10), c[10]); + c[11] = fma(a[6], sub_group_broadcast(b[6], 11), c[11]); + c[8] = fma(a[7], sub_group_broadcast(b[7], 8), c[8]); + c[9] = fma(a[7], sub_group_broadcast(b[7], 9), c[9]); + c[10] = fma(a[7], sub_group_broadcast(b[7], 10), c[10]); + c[11] = fma(a[7], sub_group_broadcast(b[7], 11), c[11]); + c[12] = fma(a[0], sub_group_broadcast(b[0], 12), c[12]); + c[13] = fma(a[0], sub_group_broadcast(b[0], 13), c[13]); + c[14] = fma(a[0], sub_group_broadcast(b[0], 14), c[14]); + c[15] = fma(a[0], sub_group_broadcast(b[0], 15), c[15]); + c[12] = fma(a[1], sub_group_broadcast(b[1], 12), c[12]); + c[13] = fma(a[1], sub_group_broadcast(b[1], 13), c[13]); + c[14] = fma(a[1], sub_group_broadcast(b[1], 14), c[14]); + c[15] = fma(a[1], sub_group_broadcast(b[1], 15), c[15]); + c[12] = fma(a[2], sub_group_broadcast(b[2], 12), c[12]); + c[13] = fma(a[2], sub_group_broadcast(b[2], 13), c[13]); + c[14] = fma(a[2], sub_group_broadcast(b[2], 14), c[14]); + c[15] = fma(a[2], sub_group_broadcast(b[2], 15), c[15]); + 
c[12] = fma(a[3], sub_group_broadcast(b[3], 12), c[12]); + c[13] = fma(a[3], sub_group_broadcast(b[3], 13), c[13]); + c[14] = fma(a[3], sub_group_broadcast(b[3], 14), c[14]); + c[15] = fma(a[3], sub_group_broadcast(b[3], 15), c[15]); + c[12] = fma(a[4], sub_group_broadcast(b[4], 12), c[12]); + c[13] = fma(a[4], sub_group_broadcast(b[4], 13), c[13]); + c[14] = fma(a[4], sub_group_broadcast(b[4], 14), c[14]); + c[15] = fma(a[4], sub_group_broadcast(b[4], 15), c[15]); + c[12] = fma(a[5], sub_group_broadcast(b[5], 12), c[12]); + c[13] = fma(a[5], sub_group_broadcast(b[5], 13), c[13]); + c[14] = fma(a[5], sub_group_broadcast(b[5], 14), c[14]); + c[15] = fma(a[5], sub_group_broadcast(b[5], 15), c[15]); + c[12] = fma(a[6], sub_group_broadcast(b[6], 12), c[12]); + c[13] = fma(a[6], sub_group_broadcast(b[6], 13), c[13]); + c[14] = fma(a[6], sub_group_broadcast(b[6], 14), c[14]); + c[15] = fma(a[6], sub_group_broadcast(b[6], 15), c[15]); + c[12] = fma(a[7], sub_group_broadcast(b[7], 12), c[12]); + c[13] = fma(a[7], sub_group_broadcast(b[7], 13), c[13]); + c[14] = fma(a[7], sub_group_broadcast(b[7], 14), c[14]); + c[15] = fma(a[7], sub_group_broadcast(b[7], 15), c[15]); + } + if (K - KmultipleKb > 0) { + __attribute__((opencl_unroll_hint(1))) for (short kb = KmultipleKb; + kb < K; kb += 1) { + double a[1]; + a[0] = get_sub_group_local_id() < rem1 + ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + double b[1]; + b[0] = get_sub_group_local_id() < bs_1 + ? Bb2[get_sub_group_local_id()] + : 0; + Bb2 += B_stride1; + c[0] = fma(a[0], sub_group_broadcast(b[0], 0), c[0]); + c[1] = fma(a[0], sub_group_broadcast(b[0], 1), c[1]); + c[2] = fma(a[0], sub_group_broadcast(b[0], 2), c[2]); + c[3] = fma(a[0], sub_group_broadcast(b[0], 3), c[3]); + c[4] = fma(a[0], sub_group_broadcast(b[0], 4), c[4]); + c[5] = fma(a[0], sub_group_broadcast(b[0], 5), c[5]); + c[6] = fma(a[0], sub_group_broadcast(b[0], 6), c[6]); + c[7] = fma(a[0], sub_group_broadcast(b[0], 7), c[7]); + c[8] = fma(a[0], sub_group_broadcast(b[0], 8), c[8]); + c[9] = fma(a[0], sub_group_broadcast(b[0], 9), c[9]); + c[10] = fma(a[0], sub_group_broadcast(b[0], 10), c[10]); + c[11] = fma(a[0], sub_group_broadcast(b[0], 11), c[11]); + c[12] = fma(a[0], sub_group_broadcast(b[0], 12), c[12]); + c[13] = fma(a[0], sub_group_broadcast(b[0], 13), c[13]); + c[14] = fma(a[0], sub_group_broadcast(b[0], 14), c[14]); + c[15] = fma(a[0], sub_group_broadcast(b[0], 15), c[15]); + } + } + global double *Cb = C + (blck1 + C_stride1 * blck); + for (short n = 0; n < bs_1; ++n) { + if (get_sub_group_local_id() < rem1) { + atomic_fetch_add_explicit( + (global volatile atomic_double *)(Cb + + get_sub_group_local_id()), + alpha * c[n], memory_order_relaxed, memory_scope_work_group); + } + Cb += C_stride1; + } + } + } + } + __attribute__((opencl_unroll_hint(1))) for (blck = bs_1 * rem; blck < N; + blck += bs) { + global double *Bb = B + blck; + uint sg_m = 0; + uint blocks2 = M / 16u; + uint rem2 = M % 16u; + uint blck2; + __attribute__((opencl_unroll_hint(1))) for (blck2 = 16u * sg_m; + blck2 < 16u * blocks2; + blck2 += 16u) { + global double *Ab = A + blck2; + global double *Ab1 = Ab; + global double *Bb1 = Bb; + c[0] = 0x0p+0; + c[1] = 0x0p+0; + c[2] = 0x0p+0; + c[3] = 0x0p+0; + c[4] = 0x0p+0; + c[5] = 0x0p+0; + c[6] = 0x0p+0; + c[7] = 0x0p+0; + c[8] = 0x0p+0; + c[9] = 0x0p+0; + c[10] = 0x0p+0; + c[11] = 0x0p+0; + c[12] = 0x0p+0; + c[13] = 0x0p+0; + c[14] = 0x0p+0; + c[15] = 0x0p+0; + uint KmultipleKb = K / 8 * 8; + __attribute__((opencl_unroll_hint(1))) for (short 
kb = 0; + kb < KmultipleKb; kb += 8) { + double a[8]; + a[0] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[1] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[2] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[3] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[4] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[5] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[6] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + a[7] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + double b[8]; + b[0] = + get_sub_group_local_id() < bs ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[1] = + get_sub_group_local_id() < bs ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[2] = + get_sub_group_local_id() < bs ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[3] = + get_sub_group_local_id() < bs ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[4] = + get_sub_group_local_id() < bs ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[5] = + get_sub_group_local_id() < bs ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[6] = + get_sub_group_local_id() < bs ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + b[7] = + get_sub_group_local_id() < bs ? Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + c[0] = fma(a[0], sub_group_broadcast(b[0], 0), c[0]); + c[1] = fma(a[0], sub_group_broadcast(b[0], 1), c[1]); + c[2] = fma(a[0], sub_group_broadcast(b[0], 2), c[2]); + c[3] = fma(a[0], sub_group_broadcast(b[0], 3), c[3]); + c[0] = fma(a[1], sub_group_broadcast(b[1], 0), c[0]); + c[1] = fma(a[1], sub_group_broadcast(b[1], 1), c[1]); + c[2] = fma(a[1], sub_group_broadcast(b[1], 2), c[2]); + c[3] = fma(a[1], sub_group_broadcast(b[1], 3), c[3]); + c[0] = fma(a[2], sub_group_broadcast(b[2], 0), c[0]); + c[1] = fma(a[2], sub_group_broadcast(b[2], 1), c[1]); + c[2] = fma(a[2], sub_group_broadcast(b[2], 2), c[2]); + c[3] = fma(a[2], sub_group_broadcast(b[2], 3), c[3]); + c[0] = fma(a[3], sub_group_broadcast(b[3], 0), c[0]); + c[1] = fma(a[3], sub_group_broadcast(b[3], 1), c[1]); + c[2] = fma(a[3], sub_group_broadcast(b[3], 2), c[2]); + c[3] = fma(a[3], sub_group_broadcast(b[3], 3), c[3]); + c[0] = fma(a[4], sub_group_broadcast(b[4], 0), c[0]); + c[1] = fma(a[4], sub_group_broadcast(b[4], 1), c[1]); + c[2] = fma(a[4], sub_group_broadcast(b[4], 2), c[2]); + c[3] = fma(a[4], sub_group_broadcast(b[4], 3), c[3]); + c[0] = fma(a[5], sub_group_broadcast(b[5], 0), c[0]); + c[1] = fma(a[5], sub_group_broadcast(b[5], 1), c[1]); + c[2] = fma(a[5], sub_group_broadcast(b[5], 2), c[2]); + c[3] = fma(a[5], sub_group_broadcast(b[5], 3), c[3]); + c[0] = fma(a[6], sub_group_broadcast(b[6], 0), c[0]); + c[1] = fma(a[6], sub_group_broadcast(b[6], 1), c[1]); + c[2] = fma(a[6], sub_group_broadcast(b[6], 2), c[2]); + c[3] = fma(a[6], sub_group_broadcast(b[6], 3), c[3]); + c[0] = fma(a[7], sub_group_broadcast(b[7], 0), c[0]); + c[1] = fma(a[7], sub_group_broadcast(b[7], 1), c[1]); + c[2] = fma(a[7], sub_group_broadcast(b[7], 2), c[2]); + c[3] = fma(a[7], sub_group_broadcast(b[7], 3), c[3]); + c[4] = fma(a[0], sub_group_broadcast(b[0], 4), c[4]); + c[5] = fma(a[0], sub_group_broadcast(b[0], 5), c[5]); + c[6] = fma(a[0], sub_group_broadcast(b[0], 6), c[6]); + 
c[7] = fma(a[0], sub_group_broadcast(b[0], 7), c[7]); + c[4] = fma(a[1], sub_group_broadcast(b[1], 4), c[4]); + c[5] = fma(a[1], sub_group_broadcast(b[1], 5), c[5]); + c[6] = fma(a[1], sub_group_broadcast(b[1], 6), c[6]); + c[7] = fma(a[1], sub_group_broadcast(b[1], 7), c[7]); + c[4] = fma(a[2], sub_group_broadcast(b[2], 4), c[4]); + c[5] = fma(a[2], sub_group_broadcast(b[2], 5), c[5]); + c[6] = fma(a[2], sub_group_broadcast(b[2], 6), c[6]); + c[7] = fma(a[2], sub_group_broadcast(b[2], 7), c[7]); + c[4] = fma(a[3], sub_group_broadcast(b[3], 4), c[4]); + c[5] = fma(a[3], sub_group_broadcast(b[3], 5), c[5]); + c[6] = fma(a[3], sub_group_broadcast(b[3], 6), c[6]); + c[7] = fma(a[3], sub_group_broadcast(b[3], 7), c[7]); + c[4] = fma(a[4], sub_group_broadcast(b[4], 4), c[4]); + c[5] = fma(a[4], sub_group_broadcast(b[4], 5), c[5]); + c[6] = fma(a[4], sub_group_broadcast(b[4], 6), c[6]); + c[7] = fma(a[4], sub_group_broadcast(b[4], 7), c[7]); + c[4] = fma(a[5], sub_group_broadcast(b[5], 4), c[4]); + c[5] = fma(a[5], sub_group_broadcast(b[5], 5), c[5]); + c[6] = fma(a[5], sub_group_broadcast(b[5], 6), c[6]); + c[7] = fma(a[5], sub_group_broadcast(b[5], 7), c[7]); + c[4] = fma(a[6], sub_group_broadcast(b[6], 4), c[4]); + c[5] = fma(a[6], sub_group_broadcast(b[6], 5), c[5]); + c[6] = fma(a[6], sub_group_broadcast(b[6], 6), c[6]); + c[7] = fma(a[6], sub_group_broadcast(b[6], 7), c[7]); + c[4] = fma(a[7], sub_group_broadcast(b[7], 4), c[4]); + c[5] = fma(a[7], sub_group_broadcast(b[7], 5), c[5]); + c[6] = fma(a[7], sub_group_broadcast(b[7], 6), c[6]); + c[7] = fma(a[7], sub_group_broadcast(b[7], 7), c[7]); + c[8] = fma(a[0], sub_group_broadcast(b[0], 8), c[8]); + c[9] = fma(a[0], sub_group_broadcast(b[0], 9), c[9]); + c[10] = fma(a[0], sub_group_broadcast(b[0], 10), c[10]); + c[11] = fma(a[0], sub_group_broadcast(b[0], 11), c[11]); + c[8] = fma(a[1], sub_group_broadcast(b[1], 8), c[8]); + c[9] = fma(a[1], sub_group_broadcast(b[1], 9), c[9]); + c[10] = fma(a[1], sub_group_broadcast(b[1], 10), c[10]); + c[11] = fma(a[1], sub_group_broadcast(b[1], 11), c[11]); + c[8] = fma(a[2], sub_group_broadcast(b[2], 8), c[8]); + c[9] = fma(a[2], sub_group_broadcast(b[2], 9), c[9]); + c[10] = fma(a[2], sub_group_broadcast(b[2], 10), c[10]); + c[11] = fma(a[2], sub_group_broadcast(b[2], 11), c[11]); + c[8] = fma(a[3], sub_group_broadcast(b[3], 8), c[8]); + c[9] = fma(a[3], sub_group_broadcast(b[3], 9), c[9]); + c[10] = fma(a[3], sub_group_broadcast(b[3], 10), c[10]); + c[11] = fma(a[3], sub_group_broadcast(b[3], 11), c[11]); + c[8] = fma(a[4], sub_group_broadcast(b[4], 8), c[8]); + c[9] = fma(a[4], sub_group_broadcast(b[4], 9), c[9]); + c[10] = fma(a[4], sub_group_broadcast(b[4], 10), c[10]); + c[11] = fma(a[4], sub_group_broadcast(b[4], 11), c[11]); + c[8] = fma(a[5], sub_group_broadcast(b[5], 8), c[8]); + c[9] = fma(a[5], sub_group_broadcast(b[5], 9), c[9]); + c[10] = fma(a[5], sub_group_broadcast(b[5], 10), c[10]); + c[11] = fma(a[5], sub_group_broadcast(b[5], 11), c[11]); + c[8] = fma(a[6], sub_group_broadcast(b[6], 8), c[8]); + c[9] = fma(a[6], sub_group_broadcast(b[6], 9), c[9]); + c[10] = fma(a[6], sub_group_broadcast(b[6], 10), c[10]); + c[11] = fma(a[6], sub_group_broadcast(b[6], 11), c[11]); + c[8] = fma(a[7], sub_group_broadcast(b[7], 8), c[8]); + c[9] = fma(a[7], sub_group_broadcast(b[7], 9), c[9]); + c[10] = fma(a[7], sub_group_broadcast(b[7], 10), c[10]); + c[11] = fma(a[7], sub_group_broadcast(b[7], 11), c[11]); + c[12] = fma(a[0], sub_group_broadcast(b[0], 12), c[12]); + c[13] = fma(a[0], 
sub_group_broadcast(b[0], 13), c[13]); + c[14] = fma(a[0], sub_group_broadcast(b[0], 14), c[14]); + c[15] = fma(a[0], sub_group_broadcast(b[0], 15), c[15]); + c[12] = fma(a[1], sub_group_broadcast(b[1], 12), c[12]); + c[13] = fma(a[1], sub_group_broadcast(b[1], 13), c[13]); + c[14] = fma(a[1], sub_group_broadcast(b[1], 14), c[14]); + c[15] = fma(a[1], sub_group_broadcast(b[1], 15), c[15]); + c[12] = fma(a[2], sub_group_broadcast(b[2], 12), c[12]); + c[13] = fma(a[2], sub_group_broadcast(b[2], 13), c[13]); + c[14] = fma(a[2], sub_group_broadcast(b[2], 14), c[14]); + c[15] = fma(a[2], sub_group_broadcast(b[2], 15), c[15]); + c[12] = fma(a[3], sub_group_broadcast(b[3], 12), c[12]); + c[13] = fma(a[3], sub_group_broadcast(b[3], 13), c[13]); + c[14] = fma(a[3], sub_group_broadcast(b[3], 14), c[14]); + c[15] = fma(a[3], sub_group_broadcast(b[3], 15), c[15]); + c[12] = fma(a[4], sub_group_broadcast(b[4], 12), c[12]); + c[13] = fma(a[4], sub_group_broadcast(b[4], 13), c[13]); + c[14] = fma(a[4], sub_group_broadcast(b[4], 14), c[14]); + c[15] = fma(a[4], sub_group_broadcast(b[4], 15), c[15]); + c[12] = fma(a[5], sub_group_broadcast(b[5], 12), c[12]); + c[13] = fma(a[5], sub_group_broadcast(b[5], 13), c[13]); + c[14] = fma(a[5], sub_group_broadcast(b[5], 14), c[14]); + c[15] = fma(a[5], sub_group_broadcast(b[5], 15), c[15]); + c[12] = fma(a[6], sub_group_broadcast(b[6], 12), c[12]); + c[13] = fma(a[6], sub_group_broadcast(b[6], 13), c[13]); + c[14] = fma(a[6], sub_group_broadcast(b[6], 14), c[14]); + c[15] = fma(a[6], sub_group_broadcast(b[6], 15), c[15]); + c[12] = fma(a[7], sub_group_broadcast(b[7], 12), c[12]); + c[13] = fma(a[7], sub_group_broadcast(b[7], 13), c[13]); + c[14] = fma(a[7], sub_group_broadcast(b[7], 14), c[14]); + c[15] = fma(a[7], sub_group_broadcast(b[7], 15), c[15]); + } + if (K - KmultipleKb > 0) { + __attribute__((opencl_unroll_hint(1))) for (short kb = KmultipleKb; + kb < K; kb += 1) { + double a[1]; + a[0] = as_double(intel_sub_group_block_read_ul((global ulong *)Ab1)); + Ab1 += A_stride1; + double b[1]; + b[0] = + get_sub_group_local_id() < bs ? 
Bb1[get_sub_group_local_id()] : 0; + Bb1 += B_stride1; + c[0] = fma(a[0], sub_group_broadcast(b[0], 0), c[0]); + c[1] = fma(a[0], sub_group_broadcast(b[0], 1), c[1]); + c[2] = fma(a[0], sub_group_broadcast(b[0], 2), c[2]); + c[3] = fma(a[0], sub_group_broadcast(b[0], 3), c[3]); + c[4] = fma(a[0], sub_group_broadcast(b[0], 4), c[4]); + c[5] = fma(a[0], sub_group_broadcast(b[0], 5), c[5]); + c[6] = fma(a[0], sub_group_broadcast(b[0], 6), c[6]); + c[7] = fma(a[0], sub_group_broadcast(b[0], 7), c[7]); + c[8] = fma(a[0], sub_group_broadcast(b[0], 8), c[8]); + c[9] = fma(a[0], sub_group_broadcast(b[0], 9), c[9]); + c[10] = fma(a[0], sub_group_broadcast(b[0], 10), c[10]); + c[11] = fma(a[0], sub_group_broadcast(b[0], 11), c[11]); + c[12] = fma(a[0], sub_group_broadcast(b[0], 12), c[12]); + c[13] = fma(a[0], sub_group_broadcast(b[0], 13), c[13]); + c[14] = fma(a[0], sub_group_broadcast(b[0], 14), c[14]); + c[15] = fma(a[0], sub_group_broadcast(b[0], 15), c[15]); + } + } + global double *Cb = C + (blck2 + C_stride1 * blck); + for (short n = 0; n < bs; ++n) { + atomic_fetch_add_explicit( + (global volatile atomic_double *)(Cb + get_sub_group_local_id()), + alpha * c[n], memory_order_relaxed, memory_scope_work_group); + Cb += C_stride1; + } + } + if (rem2 > 0) { + blck2 = blocks2 * 16u; + if (sg_m == 0u) { + global double *Ab = A + blck2; + global double *Ab1 = Ab; + global double *Bb2 = Bb; + c[0] = 0x0p+0; + c[1] = 0x0p+0; + c[2] = 0x0p+0; + c[3] = 0x0p+0; + c[4] = 0x0p+0; + c[5] = 0x0p+0; + c[6] = 0x0p+0; + c[7] = 0x0p+0; + c[8] = 0x0p+0; + c[9] = 0x0p+0; + c[10] = 0x0p+0; + c[11] = 0x0p+0; + c[12] = 0x0p+0; + c[13] = 0x0p+0; + c[14] = 0x0p+0; + c[15] = 0x0p+0; + uint KmultipleKb = K / 8 * 8; + __attribute__((opencl_unroll_hint(1))) for (short kb = 0; + kb < KmultipleKb; kb += 8) { + double a[8]; + a[0] = get_sub_group_local_id() < rem2 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[1] = get_sub_group_local_id() < rem2 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[2] = get_sub_group_local_id() < rem2 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[3] = get_sub_group_local_id() < rem2 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[4] = get_sub_group_local_id() < rem2 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[5] = get_sub_group_local_id() < rem2 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[6] = get_sub_group_local_id() < rem2 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + a[7] = get_sub_group_local_id() < rem2 ? Ab1[get_sub_group_local_id()] + : 0; + Ab1 += A_stride1; + double b[8]; + b[0] = + get_sub_group_local_id() < bs ? Bb2[get_sub_group_local_id()] : 0; + Bb2 += B_stride1; + b[1] = + get_sub_group_local_id() < bs ? Bb2[get_sub_group_local_id()] : 0; + Bb2 += B_stride1; + b[2] = + get_sub_group_local_id() < bs ? Bb2[get_sub_group_local_id()] : 0; + Bb2 += B_stride1; + b[3] = + get_sub_group_local_id() < bs ? Bb2[get_sub_group_local_id()] : 0; + Bb2 += B_stride1; + b[4] = + get_sub_group_local_id() < bs ? Bb2[get_sub_group_local_id()] : 0; + Bb2 += B_stride1; + b[5] = + get_sub_group_local_id() < bs ? Bb2[get_sub_group_local_id()] : 0; + Bb2 += B_stride1; + b[6] = + get_sub_group_local_id() < bs ? Bb2[get_sub_group_local_id()] : 0; + Bb2 += B_stride1; + b[7] = + get_sub_group_local_id() < bs ? 
+              get_sub_group_local_id() < bs ? Bb2[get_sub_group_local_id()] : 0;
+          Bb2 += B_stride1;
+          c[0] = fma(a[0], sub_group_broadcast(b[0], 0), c[0]);
+          c[1] = fma(a[0], sub_group_broadcast(b[0], 1), c[1]);
+          c[2] = fma(a[0], sub_group_broadcast(b[0], 2), c[2]);
+          c[3] = fma(a[0], sub_group_broadcast(b[0], 3), c[3]);
+          c[0] = fma(a[1], sub_group_broadcast(b[1], 0), c[0]);
+          c[1] = fma(a[1], sub_group_broadcast(b[1], 1), c[1]);
+          c[2] = fma(a[1], sub_group_broadcast(b[1], 2), c[2]);
+          c[3] = fma(a[1], sub_group_broadcast(b[1], 3), c[3]);
+          c[0] = fma(a[2], sub_group_broadcast(b[2], 0), c[0]);
+          c[1] = fma(a[2], sub_group_broadcast(b[2], 1), c[1]);
+          c[2] = fma(a[2], sub_group_broadcast(b[2], 2), c[2]);
+          c[3] = fma(a[2], sub_group_broadcast(b[2], 3), c[3]);
+          c[0] = fma(a[3], sub_group_broadcast(b[3], 0), c[0]);
+          c[1] = fma(a[3], sub_group_broadcast(b[3], 1), c[1]);
+          c[2] = fma(a[3], sub_group_broadcast(b[3], 2), c[2]);
+          c[3] = fma(a[3], sub_group_broadcast(b[3], 3), c[3]);
+          c[0] = fma(a[4], sub_group_broadcast(b[4], 0), c[0]);
+          c[1] = fma(a[4], sub_group_broadcast(b[4], 1), c[1]);
+          c[2] = fma(a[4], sub_group_broadcast(b[4], 2), c[2]);
+          c[3] = fma(a[4], sub_group_broadcast(b[4], 3), c[3]);
+          c[0] = fma(a[5], sub_group_broadcast(b[5], 0), c[0]);
+          c[1] = fma(a[5], sub_group_broadcast(b[5], 1), c[1]);
+          c[2] = fma(a[5], sub_group_broadcast(b[5], 2), c[2]);
+          c[3] = fma(a[5], sub_group_broadcast(b[5], 3), c[3]);
+          c[0] = fma(a[6], sub_group_broadcast(b[6], 0), c[0]);
+          c[1] = fma(a[6], sub_group_broadcast(b[6], 1), c[1]);
+          c[2] = fma(a[6], sub_group_broadcast(b[6], 2), c[2]);
+          c[3] = fma(a[6], sub_group_broadcast(b[6], 3), c[3]);
+          c[0] = fma(a[7], sub_group_broadcast(b[7], 0), c[0]);
+          c[1] = fma(a[7], sub_group_broadcast(b[7], 1), c[1]);
+          c[2] = fma(a[7], sub_group_broadcast(b[7], 2), c[2]);
+          c[3] = fma(a[7], sub_group_broadcast(b[7], 3), c[3]);
+          c[4] = fma(a[0], sub_group_broadcast(b[0], 4), c[4]);
+          c[5] = fma(a[0], sub_group_broadcast(b[0], 5), c[5]);
+          c[6] = fma(a[0], sub_group_broadcast(b[0], 6), c[6]);
+          c[7] = fma(a[0], sub_group_broadcast(b[0], 7), c[7]);
+          c[4] = fma(a[1], sub_group_broadcast(b[1], 4), c[4]);
+          c[5] = fma(a[1], sub_group_broadcast(b[1], 5), c[5]);
+          c[6] = fma(a[1], sub_group_broadcast(b[1], 6), c[6]);
+          c[7] = fma(a[1], sub_group_broadcast(b[1], 7), c[7]);
+          c[4] = fma(a[2], sub_group_broadcast(b[2], 4), c[4]);
+          c[5] = fma(a[2], sub_group_broadcast(b[2], 5), c[5]);
+          c[6] = fma(a[2], sub_group_broadcast(b[2], 6), c[6]);
+          c[7] = fma(a[2], sub_group_broadcast(b[2], 7), c[7]);
+          c[4] = fma(a[3], sub_group_broadcast(b[3], 4), c[4]);
+          c[5] = fma(a[3], sub_group_broadcast(b[3], 5), c[5]);
+          c[6] = fma(a[3], sub_group_broadcast(b[3], 6), c[6]);
+          c[7] = fma(a[3], sub_group_broadcast(b[3], 7), c[7]);
+          c[4] = fma(a[4], sub_group_broadcast(b[4], 4), c[4]);
+          c[5] = fma(a[4], sub_group_broadcast(b[4], 5), c[5]);
+          c[6] = fma(a[4], sub_group_broadcast(b[4], 6), c[6]);
+          c[7] = fma(a[4], sub_group_broadcast(b[4], 7), c[7]);
+          c[4] = fma(a[5], sub_group_broadcast(b[5], 4), c[4]);
+          c[5] = fma(a[5], sub_group_broadcast(b[5], 5), c[5]);
+          c[6] = fma(a[5], sub_group_broadcast(b[5], 6), c[6]);
+          c[7] = fma(a[5], sub_group_broadcast(b[5], 7), c[7]);
+          c[4] = fma(a[6], sub_group_broadcast(b[6], 4), c[4]);
+          c[5] = fma(a[6], sub_group_broadcast(b[6], 5), c[5]);
+          c[6] = fma(a[6], sub_group_broadcast(b[6], 6), c[6]);
+          c[7] = fma(a[6], sub_group_broadcast(b[6], 7), c[7]);
+          c[4] = fma(a[7], sub_group_broadcast(b[7], 4), c[4]);
+          c[5] = fma(a[7], sub_group_broadcast(b[7], 5), c[5]);
+          c[6] = fma(a[7], sub_group_broadcast(b[7], 6), c[6]);
+          c[7] = fma(a[7], sub_group_broadcast(b[7], 7), c[7]);
+          c[8] = fma(a[0], sub_group_broadcast(b[0], 8), c[8]);
+          c[9] = fma(a[0], sub_group_broadcast(b[0], 9), c[9]);
+          c[10] = fma(a[0], sub_group_broadcast(b[0], 10), c[10]);
+          c[11] = fma(a[0], sub_group_broadcast(b[0], 11), c[11]);
+          c[8] = fma(a[1], sub_group_broadcast(b[1], 8), c[8]);
+          c[9] = fma(a[1], sub_group_broadcast(b[1], 9), c[9]);
+          c[10] = fma(a[1], sub_group_broadcast(b[1], 10), c[10]);
+          c[11] = fma(a[1], sub_group_broadcast(b[1], 11), c[11]);
+          c[8] = fma(a[2], sub_group_broadcast(b[2], 8), c[8]);
+          c[9] = fma(a[2], sub_group_broadcast(b[2], 9), c[9]);
+          c[10] = fma(a[2], sub_group_broadcast(b[2], 10), c[10]);
+          c[11] = fma(a[2], sub_group_broadcast(b[2], 11), c[11]);
+          c[8] = fma(a[3], sub_group_broadcast(b[3], 8), c[8]);
+          c[9] = fma(a[3], sub_group_broadcast(b[3], 9), c[9]);
+          c[10] = fma(a[3], sub_group_broadcast(b[3], 10), c[10]);
+          c[11] = fma(a[3], sub_group_broadcast(b[3], 11), c[11]);
+          c[8] = fma(a[4], sub_group_broadcast(b[4], 8), c[8]);
+          c[9] = fma(a[4], sub_group_broadcast(b[4], 9), c[9]);
+          c[10] = fma(a[4], sub_group_broadcast(b[4], 10), c[10]);
+          c[11] = fma(a[4], sub_group_broadcast(b[4], 11), c[11]);
+          c[8] = fma(a[5], sub_group_broadcast(b[5], 8), c[8]);
+          c[9] = fma(a[5], sub_group_broadcast(b[5], 9), c[9]);
+          c[10] = fma(a[5], sub_group_broadcast(b[5], 10), c[10]);
+          c[11] = fma(a[5], sub_group_broadcast(b[5], 11), c[11]);
+          c[8] = fma(a[6], sub_group_broadcast(b[6], 8), c[8]);
+          c[9] = fma(a[6], sub_group_broadcast(b[6], 9), c[9]);
+          c[10] = fma(a[6], sub_group_broadcast(b[6], 10), c[10]);
+          c[11] = fma(a[6], sub_group_broadcast(b[6], 11), c[11]);
+          c[8] = fma(a[7], sub_group_broadcast(b[7], 8), c[8]);
+          c[9] = fma(a[7], sub_group_broadcast(b[7], 9), c[9]);
+          c[10] = fma(a[7], sub_group_broadcast(b[7], 10), c[10]);
+          c[11] = fma(a[7], sub_group_broadcast(b[7], 11), c[11]);
+          c[12] = fma(a[0], sub_group_broadcast(b[0], 12), c[12]);
+          c[13] = fma(a[0], sub_group_broadcast(b[0], 13), c[13]);
+          c[14] = fma(a[0], sub_group_broadcast(b[0], 14), c[14]);
+          c[15] = fma(a[0], sub_group_broadcast(b[0], 15), c[15]);
+          c[12] = fma(a[1], sub_group_broadcast(b[1], 12), c[12]);
+          c[13] = fma(a[1], sub_group_broadcast(b[1], 13), c[13]);
+          c[14] = fma(a[1], sub_group_broadcast(b[1], 14), c[14]);
+          c[15] = fma(a[1], sub_group_broadcast(b[1], 15), c[15]);
+          c[12] = fma(a[2], sub_group_broadcast(b[2], 12), c[12]);
+          c[13] = fma(a[2], sub_group_broadcast(b[2], 13), c[13]);
+          c[14] = fma(a[2], sub_group_broadcast(b[2], 14), c[14]);
+          c[15] = fma(a[2], sub_group_broadcast(b[2], 15), c[15]);
+          c[12] = fma(a[3], sub_group_broadcast(b[3], 12), c[12]);
+          c[13] = fma(a[3], sub_group_broadcast(b[3], 13), c[13]);
+          c[14] = fma(a[3], sub_group_broadcast(b[3], 14), c[14]);
+          c[15] = fma(a[3], sub_group_broadcast(b[3], 15), c[15]);
+          c[12] = fma(a[4], sub_group_broadcast(b[4], 12), c[12]);
+          c[13] = fma(a[4], sub_group_broadcast(b[4], 13), c[13]);
+          c[14] = fma(a[4], sub_group_broadcast(b[4], 14), c[14]);
+          c[15] = fma(a[4], sub_group_broadcast(b[4], 15), c[15]);
+          c[12] = fma(a[5], sub_group_broadcast(b[5], 12), c[12]);
+          c[13] = fma(a[5], sub_group_broadcast(b[5], 13), c[13]);
+          c[14] = fma(a[5], sub_group_broadcast(b[5], 14), c[14]);
+          c[15] = fma(a[5], sub_group_broadcast(b[5], 15), c[15]);
+          c[12] = fma(a[6], sub_group_broadcast(b[6], 12), c[12]);
+          c[13] = fma(a[6], sub_group_broadcast(b[6], 13), c[13]);
+          c[14] = fma(a[6], sub_group_broadcast(b[6], 14), c[14]);
+          c[15] = fma(a[6], sub_group_broadcast(b[6], 15), c[15]);
+          c[12] = fma(a[7], sub_group_broadcast(b[7], 12), c[12]);
+          c[13] = fma(a[7], sub_group_broadcast(b[7], 13), c[13]);
+          c[14] = fma(a[7], sub_group_broadcast(b[7], 14), c[14]);
+          c[15] = fma(a[7], sub_group_broadcast(b[7], 15), c[15]);
+        }
+        if (K - KmultipleKb > 0) {
+          __attribute__((opencl_unroll_hint(1))) for (short kb = KmultipleKb;
+                                                      kb < K; kb += 1) {
+            double a[1];
+            a[0] = get_sub_group_local_id() < rem2
+                       ? Ab1[get_sub_group_local_id()]
+                       : 0;
+            Ab1 += A_stride1;
+            double b[1];
+            b[0] = get_sub_group_local_id() < bs ? Bb2[get_sub_group_local_id()]
+                                                 : 0;
+            Bb2 += B_stride1;
+            c[0] = fma(a[0], sub_group_broadcast(b[0], 0), c[0]);
+            c[1] = fma(a[0], sub_group_broadcast(b[0], 1), c[1]);
+            c[2] = fma(a[0], sub_group_broadcast(b[0], 2), c[2]);
+            c[3] = fma(a[0], sub_group_broadcast(b[0], 3), c[3]);
+            c[4] = fma(a[0], sub_group_broadcast(b[0], 4), c[4]);
+            c[5] = fma(a[0], sub_group_broadcast(b[0], 5), c[5]);
+            c[6] = fma(a[0], sub_group_broadcast(b[0], 6), c[6]);
+            c[7] = fma(a[0], sub_group_broadcast(b[0], 7), c[7]);
+            c[8] = fma(a[0], sub_group_broadcast(b[0], 8), c[8]);
+            c[9] = fma(a[0], sub_group_broadcast(b[0], 9), c[9]);
+            c[10] = fma(a[0], sub_group_broadcast(b[0], 10), c[10]);
+            c[11] = fma(a[0], sub_group_broadcast(b[0], 11), c[11]);
+            c[12] = fma(a[0], sub_group_broadcast(b[0], 12), c[12]);
+            c[13] = fma(a[0], sub_group_broadcast(b[0], 13), c[13]);
+            c[14] = fma(a[0], sub_group_broadcast(b[0], 14), c[14]);
+            c[15] = fma(a[0], sub_group_broadcast(b[0], 15), c[15]);
+          }
+        }
+        global double *Cb = C + (blck2 + C_stride1 * blck);
+        for (short n = 0; n < bs; ++n) {
+          if (get_sub_group_local_id() < rem2) {
+            atomic_fetch_add_explicit(
+                (global volatile atomic_double *)(Cb +
+                                                  get_sub_group_local_id()),
+                alpha * c[n], memory_order_relaxed, memory_scope_work_group);
+          }
+          Cb += C_stride1;
+        }
+      }
+    }
+  }
+}
+/* entry point: every z-index of the launch grid processes one task */
+kernel __attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void
+dbm_multiply(double alpha, int itask, global int *tasks, long tasks_shape1,
+             global double *A, long A_shape0, global double *B, long B_shape0,
+             global double *C, long C_shape0) {
+  long gid = get_global_id(2);
+  long itask_idx = (long)itask;
+  long tid = itask_idx + gid;
+  /* unpack the six ints of dbm_task_t: m, n, k, offset_a/b/c */
+  int iM = *(tasks + 0ll * 1 + tid * 6);
+  int iN = *(tasks + 1ll * 1 + tid * 6);
+  int iK = *(tasks + 2ll * 1 + tid * 6);
+  int ioffset_a = *(tasks + 3ll * 1 + tid * 6);
+  int ioffset_b = *(tasks + 4ll * 1 + tid * 6);
+  int ioffset_c = *(tasks + 5ll * 1 + tid * 6);
+  long M = (long)iM;
+  long N = (long)iN;
+  long K = (long)iK;
+  long offset_a = (long)ioffset_a;
+  long offset_b = (long)ioffset_b;
+  long offset_c = (long)ioffset_c;
+  long MK = M * K;
+  long KN = K * N;
+  long MN = M * N;
+  global double *av = A + offset_a * 1;
+  long av_shape0 = MK;
+  global double *bv = B + offset_b * 1;
+  long bv_shape0 = KN;
+  global double *cv = C + offset_c * 1;
+  long cv_shape0 = MN;
+  global double *a = av;
+  long a_shape0 = M;
+  long a_shape1 = K;
+  long a_stride1 = 1 * M;
+  global double *b = bv;
+  long b_shape0 = N;
+  long b_shape1 = K;
+  long b_stride1 = 1 * N;
+  global double *c = cv;
+  long c_shape0 = M;
+  long c_shape1 = N;
+  long c_stride1 = 1 * M;
+  gemm_atomic_f64f64f64f64f64_An_Bt_Md_Nd_Kd_Astride1_d_Bstride1_d_Cstride1_d_alphad_beta3ff0000000000000(
+      c_shape0, c_shape1, a_shape1, alpha, a, 1, a_stride1, b, 1, b_stride1,
+      0x1p+0, c, 1, c_stride1);
+}
diff --git a/src/dbm/dbm_multiply_opencl.md b/src/dbm/dbm_multiply_opencl.md
new file mode 100644
index 0000000000..3ae8087806
--- /dev/null
+++ b/src/dbm/dbm_multiply_opencl.md
@@ -0,0 +1,56 @@
+# OpenCL Backend
+
+DBM processes batches of matrix multiplications; each task of a batch is described by:
+
+```C
+typedef struct {
+  int m;
+  int n;
+  int k;
+  int offset_a;
+  int offset_b;
+  int offset_c;
+} dbm_task_t;
+```
+
+Each task is characterized by its M, N, and K parameters as well as by offsets pointing into
+compact data arrays that represent the A, B, and C matrices of an entire batch. The OpenCL
+backend provides a universal kernel for a general range of M, N, and K parameters.
+
+The OpenCL backend shares the same data structures (header files) and the same input format as
+the other backends, namely CUDA and HIP.
+
+## Tiny Tensor Compiler
+
+The [Tiny Tensor Compiler](https://github.com/intel/tiny-tensor-compiler) targets tensor
+computations on GPUs and other devices, supporting the OpenCL, Level Zero, and SYCL runtimes.
+
+The compiler takes a textual, human-readable Intermediate Representation (IR) and generates, for
+instance, an OpenCL kernel. Based on the [IR-file](dbm_multiply_opencl.ir), the
+[generated kernel](dbm_multiply_opencl.irh) is an optional code path implemented by the OpenCL
+backend (in addition to the universal kernel).
+
+If the OpenCL runtime supports the necessary features, the kernel generated by the Tiny Tensor
+Compiler is used by default (setting the environment variable `DBM_MULTIPLY_GEN=0` enforces the
+universal kernel).
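+
+## Task Data Layout
+
+For illustration, the sketch below mirrors the index arithmetic of the generated kernel
+(column-major blocks, accumulating `C += alpha * A * B^T`). The function and its argument
+names are hypothetical, i.e., not part of the backend's API:
+
+```C
+/* Reference of what a single task contributes (illustrative only). */
+static void task_reference(const dbm_task_t *t, double alpha,
+                           const double *pack_a, const double *pack_b,
+                           double *pack_c) {
+  const double *a = pack_a + t->offset_a; /* m-by-k block */
+  const double *b = pack_b + t->offset_b; /* n-by-k block */
+  double *c = pack_c + t->offset_c;       /* m-by-n block */
+  for (int n = 0; n < t->n; ++n)
+    for (int k = 0; k < t->k; ++k)
+      for (int m = 0; m < t->m; ++m)
+        c[m + n * t->m] += alpha * a[m + k * t->m] * b[n + k * t->n];
+}
+```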
diff --git a/tools/precommit/check_file_properties.py b/tools/precommit/check_file_properties.py
index 8beec138bd..d53093e7b9 100755
--- a/tools/precommit/check_file_properties.py
+++ b/tools/precommit/check_file_properties.py
@@ -26,7 +26,7 @@
     r"CUDA_VERSION",
     r"DBM_LIBXSMM_PREFETCH",
     r"DBM_VALIDATE_AGAINST_DBCSR",
-    r"OPENCL_DBM_SOURCE_MULTIPLY",
+    r"OPENCL_DBM_..*",
     r"FD_DEBUG",
     r"GRID_DO_COLLOCATE",
     r"INTEL_MKL_VERSION",