diff --git a/VERSION b/VERSION index 1a1fb46b0d8..e6387dd184c 100644 --- a/VERSION +++ b/VERSION @@ -1,8 +1,8 @@ MAJOR = 2 MINOR = 3 -PATCH = 0-rc3 +PATCH = 0 # A specific DATE (YYYY-MM-DD) fixes an official release, otherwise # it is considered Development version. -DATE = 2022-06-08 +DATE = 2022-06-26 diff --git a/src/acc/libsmm_acc/notebooks/requirements.txt b/src/acc/libsmm_acc/notebooks/requirements.txt index 7549e6ca84d..f36ef7a07a0 100644 --- a/src/acc/libsmm_acc/notebooks/requirements.txt +++ b/src/acc/libsmm_acc/notebooks/requirements.txt @@ -1,6 +1,6 @@ bokeh==1.0.4 matplotlib==3.0.2 -numpy==1.21.0 +numpy==1.22.0 pandas==0.23.4 pandas-profiling==1.4.1 seaborn==0.9.0 diff --git a/src/acc/libsmm_acc/tune/requirements.txt b/src/acc/libsmm_acc/tune/requirements.txt index 819f6377b80..23641586c2d 100644 --- a/src/acc/libsmm_acc/tune/requirements.txt +++ b/src/acc/libsmm_acc/tune/requirements.txt @@ -1 +1 @@ -numpy==1.21.0 +numpy==1.22.0 diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index 3d79cba97cd..53dda0e2c06 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -217,7 +217,7 @@ int c_dbcsr_acc_init(void) { char buffer[ACC_OPENCL_BUFFERSIZE]; const char *const env_devmatch = getenv("ACC_OPENCL_DEVMATCH"), *const env_devtype = getenv("ACC_OPENCL_DEVTYPE"); const char *const env_priority = getenv("ACC_OPENCL_PRIORITY"), *const env_xhints = getenv("ACC_OPENCL_XHINTS"); - const char *const env_nullify = getenv("ACC_OPENCL_NULLIFY"), *const env_dump_acc = getenv("ACC_OPENCL_DUMP"); + const char *const env_devcopy = getenv("ACC_OPENCL_DEVCOPY"), *const env_dump_acc = getenv("ACC_OPENCL_DUMP"); const char *const env_verbose = getenv("ACC_OPENCL_VERBOSE"), *const env_flush = getenv("ACC_OPENCL_FLUSH"); const char *const env_device = getenv("ACC_OPENCL_DEVICE"), *const env_timer = getenv("ACC_OPENCL_TIMER"); const char *const env_share = getenv("ACC_OPENCL_SHARE"), *const env_async = getenv("ACC_OPENCL_ASYNC"); @@ -233,7 +233,7 @@ int c_dbcsr_acc_init(void) { # endif c_dbcsr_acc_opencl_config.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose)); c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority)); - c_dbcsr_acc_opencl_config.nullify = (NULL == env_nullify ? /*default*/ 0 : atoi(env_nullify)); + c_dbcsr_acc_opencl_config.devcopy = (NULL == env_devcopy ? /*default*/ 0 : atoi(env_devcopy)); c_dbcsr_acc_opencl_config.xhints = (NULL == env_xhints ? /*default*/ 1 : atoi(env_xhints)); c_dbcsr_acc_opencl_config.share = (NULL == env_share ? /*default*/ 0 : atoi(env_share)); c_dbcsr_acc_opencl_config.async = (NULL == env_async ? /*default*/ 3 : atoi(env_async)); @@ -557,7 +557,7 @@ int c_dbcsr_acc_finalize(void) { for (i = 0; i < ACC_OPENCL_DEVICES_MAXCOUNT; ++i) { const cl_device_id device_id = c_dbcsr_acc_opencl_config.devices[i]; if (NULL != device_id) { -# if defined(CL_VERSION_1_2) +# if defined(CL_VERSION_1_2) && defined(_DEBUG) ACC_OPENCL_CHECK(clReleaseDevice(device_id), "release device", result); # endif /* c_dbcsr_acc_opencl_create_context scans for non-NULL devices */ diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index 021f80ce24c..0df7f790078 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -223,8 +223,8 @@ typedef struct c_dbcsr_acc_opencl_config_t { cl_int nthreads; /** How to apply/use stream priorities. */ cl_int priority; - /** How to zero device-side buffers. */ - cl_int nullify; + /** How to zero/copy device-side buffers. */ + cl_int devcopy; /** Execution-hints (command stream). */ cl_int xhints; /** Share streams across threads. */ diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c index 373480274f4..c2c6c6c5898 100644 --- a/src/acc/opencl/acc_opencl_mem.c +++ b/src/acc/opencl/acc_opencl_mem.c @@ -224,7 +224,9 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) { # endif clReleaseMemObject(buffer); # if defined(CL_VERSION_2_0) - /*if (NULL != ptr)*/ clSVMFree(context, ptr); + if (0 != c_dbcsr_acc_opencl_config.device[tid].svm_interop /*&& (NULL != ptr)*/) { + clSVMFree(context, ptr); + } # endif result = EXIT_FAILURE; } @@ -271,8 +273,10 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) { } # endif # if defined(CL_VERSION_2_0) - assert(NULL != c_dbcsr_acc_opencl_config.device[tid].context); - clSVMFree(c_dbcsr_acc_opencl_config.device[tid].context, ptr); /*if (NULL != ptr)*/ + if (0 != c_dbcsr_acc_opencl_config.device[tid].svm_interop /*&& (NULL != ptr)*/) { + assert(NULL != c_dbcsr_acc_opencl_config.device[tid].context); + clSVMFree(c_dbcsr_acc_opencl_config.device[tid].context, ptr); + } # endif } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) @@ -363,8 +367,36 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt # endif assert((NULL != devmem_src || 0 == nbytes) && (NULL != devmem_dst || 0 == nbytes) && NULL != stream); if (NULL != devmem_src && NULL != devmem_dst && 0 != nbytes) { - result = clEnqueueCopyBuffer(*ACC_OPENCL_STREAM(stream), *ACC_OPENCL_MEM(devmem_src), *ACC_OPENCL_MEM(devmem_dst), - 0 /*src_offset*/, 0 /*dst_offset*/, nbytes, 0, NULL, NULL); + const cl_mem *const src = ACC_OPENCL_MEM(devmem_src), *const dst = ACC_OPENCL_MEM(devmem_dst); + assert(NULL != *src && NULL != *dst); + if (*src != *dst) { + const cl_command_queue queue = *ACC_OPENCL_STREAM(stream); + if (0 == (2 & c_dbcsr_acc_opencl_config.devcopy)) { + result = clEnqueueCopyBuffer(queue, *src, *dst, 0 /*src_offset*/, 0 /*dst_offset*/, nbytes, 0, NULL, NULL); + } + else { + static volatile int lock; /* creating cl_kernel and clSetKernelArg must be synchronized */ + static cl_kernel kernel = NULL; + LIBXSMM_ATOMIC_ACQUIRE(&lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED); + if (NULL == kernel) { /* generate kernel */ + const char source[] = "kernel void memcpy_d2d(global uchar *restrict src, global uchar *restrict dst) {\n" + " const size_t i = get_global_id(0);\n" + " dst[i] = src[i];\n" + "}\n"; + result = c_dbcsr_acc_opencl_kernel(source, "memcpy_d2d" /*kernel_name*/, NULL /*build_params*/, NULL /*build_options*/, + NULL /*try_build_options*/, NULL /*try_ok*/, NULL /*extnames*/, 0 /*num_exts*/, &kernel); + } + if (EXIT_SUCCESS == result) { + assert(NULL != kernel); + ACC_OPENCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), src), "set src argument of memcpy_d2d kernel", result); + ACC_OPENCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), dst), "set dst argument of memcpy_d2d kernel", result); + ACC_OPENCL_CHECK(clEnqueueNDRangeKernel( + queue, kernel, 1 /*work_dim*/, NULL /*offset*/, &nbytes, NULL /*local_work_size*/, 0, NULL, NULL), + "launch memcpy_d2d kernel", result); + } + LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED); + } + } } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -385,7 +417,7 @@ int c_dbcsr_acc_memset_zero(void* dev_mem, size_t offset, size_t nbytes, void* s if (0 != nbytes) { const cl_command_queue queue = *ACC_OPENCL_STREAM(stream); const cl_mem* const buffer = ACC_OPENCL_MEM(dev_mem); - if (0 == c_dbcsr_acc_opencl_config.nullify) { + if (0 == (1 & c_dbcsr_acc_opencl_config.devcopy)) { static const cl_uchar pattern = 0; /* fill with zeros */ result = clEnqueueFillBuffer(queue, *buffer, &pattern, sizeof(pattern), offset, nbytes, 0, NULL, NULL); } diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c index f1fa28106eb..1428e50aa8f 100644 --- a/src/acc/opencl/smm/opencl_libsmm.c +++ b/src/acc/opencl/smm/opencl_libsmm.c @@ -891,8 +891,8 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v clSetKernelArg(config->kernel, 1, sizeof(int), &offset), "set offset argument of transpose kernel", result); ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 2, sizeof(cl_mem), ACC_OPENCL_MEM(dev_data)), "set matrix-data argument of transpose kernel", result); - ACC_OPENCL_CHECK( - clEnqueueNDRangeKernel(queue, config->kernel, 1 /*work_dim*/, NULL, &work_size, &config->wgsize, 0, NULL, perf_event), + ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(queue, config->kernel, 1 /*work_dim*/, NULL /*offset*/, &work_size, &config->wgsize, + 0, NULL, perf_event), "launch transpose kernel", result); /* eventually update performance counters inside of locked region */ # if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) @@ -1635,7 +1635,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, ACC_OPENCL_CHECK( clSetKernelArg(config->kernel[kernel_idx], 5, sizeof(int), &bs), "set minibatch argument of SMM-kernel", result); } - ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(queue, config->kernel[kernel_idx], 1 /*work_dim*/, NULL, &work_size, + ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(queue, config->kernel[kernel_idx], 1 /*work_dim*/, NULL /*offset*/, &work_size, config->wgsize + kernel_idx, 0, NULL, perf_event), "launch SMM-kernel", result); /* eventually update performance counters inside of locked region */