This repository has been archived by the owner on Mar 20, 2023. It is now read-only.

Fixes for building with LLVM / XL OpenMP offload #706

Draft: wants to merge 9 commits into base: master
5 changes: 5 additions & 0 deletions CMake/MakefileBuildOptions.cmake
@@ -50,6 +50,11 @@ list(REMOVE_ITEM CORENRN_LINK_LIBS "Threads::Threads")

# replicate CMake magic to transform system libs to -l<libname>
foreach(link_lib ${CORENRN_LINK_LIBS})
if(TARGET ${link_lib})
get_property(link_flag TARGET ${link_lib} PROPERTY INTERFACE_LINK_LIBRARIES)
string(APPEND CORENRN_COMMON_LDFLAGS " ${link_flag}")
continue()
endif()
if(${link_lib} MATCHES "\-l.*")
string(APPEND CORENRN_COMMON_LDFLAGS " ${link_lib}")
continue()
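For reference, a minimal standalone sketch (not part of this diff; the target name and paths are hypothetical) of what the new TARGET branch does: when a link library is a CMake target rather than a plain -l flag, its link line is read from the INTERFACE_LINK_LIBRARIES property and appended to CORENRN_COMMON_LDFLAGS.

    # An imported interface target that carries its own link line
    add_library(third_party_dep INTERFACE IMPORTED)
    set_property(TARGET third_party_dep PROPERTY INTERFACE_LINK_LIBRARIES
                 "-L/opt/third_party/lib -lthird_party")

    # What the new branch above does for such a target
    get_property(link_flag TARGET third_party_dep PROPERTY INTERFACE_LINK_LIBRARIES)
    string(APPEND CORENRN_COMMON_LDFLAGS " ${link_flag}")
    # CORENRN_COMMON_LDFLAGS now ends with " -L/opt/third_party/lib -lthird_party"

Note that the sketch assumes the property holds a single flag string; if it held a semicolon-separated list of entries, string(APPEND) would paste the semicolons into the flag string unchanged.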
81 changes: 52 additions & 29 deletions CMake/OpenAccHelper.cmake
@@ -42,45 +42,68 @@ if(CORENRN_ENABLE_GPU)
# ${CMAKE_CUDA_COMPILER_VERSION}
find_package(CUDAToolkit 9.0 REQUIRED)
# Be a bit paranoid
if(NOT ${CMAKE_CUDA_COMPILER_VERSION} STREQUAL ${CUDAToolkit_VERSION})
message(
FATAL_ERROR
"CUDA compiler (${CMAKE_CUDA_COMPILER_VERSION}) and toolkit (${CUDAToolkit_VERSION}) versions are not the same!"
)
endif()
# if(NOT ${CMAKE_CUDA_COMPILER_VERSION} STREQUAL ${CUDAToolkit_VERSION})
# message(
# FATAL_ERROR
# "CUDA compiler (${CMAKE_CUDA_COMPILER_VERSION}) and toolkit (${CUDAToolkit_VERSION}) versions are not the same!"
# )
# endif()
set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}")
endif()
# -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code
# linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
# and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
# some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
# code is compiled with the same CUDA version as the explicit CUDA code.
set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
# same default compute capabilities as each other, particularly on GPU-less build machines.
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
endforeach()
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu")
elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC")
# Only enable OpenACC offload for GPU
string(APPEND NVHPC_ACC_COMP_FLAGS " -acc")
if(CORENRN_HAVE_NVHPC_COMPILER)
# -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code
# linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
# and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
# some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
# code is compiled with the same CUDA version as the explicit CUDA code.
set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
# same default compute capabilities as each other, particularly on GPU-less build machines.
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
endforeach()
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu")
elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC")
# Only enable OpenACC offload for GPU
string(APPEND NVHPC_ACC_COMP_FLAGS " -acc")
else()
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers")
endif()
elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "XLClang")
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
set(NVHPC_ACC_COMP_FLAGS "-qsmp=omp -qoffload -qreport")
set(NVHPC_ACC_LINK_FLAGS "-qcuda -lcaliper")
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
else()
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with XLClang compilers")
endif()
elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
set(NVHPC_ACC_COMP_FLAGS "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Wno-unknown-cuda-version -I${CUDAToolkit_INCLUDE_DIRS}")
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
else()
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with Clang compilers")
endif()
else()
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers")
message(FATAL_ERROR "${CMAKE_CXX_COMPILER_ID} is not supported in GPU builds.")
endif()
# avoid PGI adding standard compliant "-A" flags
set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}")
# set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
# Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is
# especially needed when we compile with -O0 or -O1 optimisation level where we get link errors.
# Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined
# for OpenACC code generation.
set(NVHPC_CXX_INLINE_FLAGS "-Mautoinline")
set(NVHPC_CXX_INLINE_FLAGS)
endif()

# =============================================================================
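The new Clang branch assumes that the selected compiler can actually generate nvptx64 offload code with -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda. A minimal configure-time probe for that, not part of this diff and with a hypothetical result variable name, could look like the following; because the check both compiles and links a tiny offload program, it also tends to catch a missing device toolchain.

    include(CheckCXXSourceCompiles)
    set(CMAKE_REQUIRED_FLAGS "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda")
    check_cxx_source_compiles("
      int main() {
      #pragma omp target
        { }
        return 0;
      }"
      CORENRN_CXX_CAN_OFFLOAD_NVPTX)
    unset(CMAKE_REQUIRED_FLAGS)
    if(NOT CORENRN_CXX_CAN_OFFLOAD_NVPTX)
      message(FATAL_ERROR "The selected C++ compiler cannot build OpenMP target offload code for nvptx64")
    endif()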
23 changes: 13 additions & 10 deletions CMakeLists.txt
@@ -127,13 +127,13 @@ if(CORENRN_ENABLE_GPU)
# CUDA_RESOLVE_DEVICE_SYMBOLS OFF)

# Fail hard and early if we don't have the PGI/NVHPC compiler.
if(NOT CORENRN_HAVE_NVHPC_COMPILER)
message(
FATAL_ERROR
"GPU support is available via OpenACC using PGI/NVIDIA compilers."
" Use NVIDIA HPC SDK with -DCMAKE_C_COMPILER=nvc -DCMAKE_CUDA_COMPILER=nvcc -DCMAKE_CXX_COMPILER=nvc++"
)
endif()
# if(NOT CORENRN_HAVE_NVHPC_COMPILER)
# message(
# FATAL_ERROR
# "GPU support is available via OpenACC using PGI/NVIDIA compilers."
# " Use NVIDIA HPC SDK with -DCMAKE_C_COMPILER=nvc -DCMAKE_CUDA_COMPILER=nvcc -DCMAKE_CXX_COMPILER=nvc++"
# )
# endif()

# Set some sensible default CUDA architectures.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
@@ -184,9 +184,9 @@ if(CORENRN_ENABLE_GPU)
# 3057 : Warning on ignoring __host__ annotation in some functions
# 3085 : Warning on redeclaring a __host__ function as __host__ __device__
# ~~~
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085"
)
# set(CMAKE_CUDA_FLAGS
# "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085"
# )

if(CORENRN_ENABLE_NMODL)
# NMODL supports both OpenACC and OpenMP target offload
@@ -419,6 +419,9 @@ endif()
# =============================================================================
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${IGNORE_UNKNOWN_PRAGMA_FLAGS}")

# Set coreneuron specific linker flag after NMODL submodule is added
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}")

# =============================================================================
# Add main directories
# =============================================================================
13 changes: 9 additions & 4 deletions coreneuron/CMakeLists.txt
@@ -32,6 +32,9 @@ file(
"utils/*/*.c"
"utils/*/*.cpp")
file(GLOB_RECURSE CORENEURON_CUDA_FILES "*.cu")
list(REMOVE_ITEM CORENEURON_CUDA_FILES "${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu")
list(REMOVE_ITEM CORENEURON_CUDA_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/profile/cuda_profile.cu")
list(REMOVE_ITEM CORENEURON_CUDA_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cu")
set(SCOPMATH_CODE_FILES
"sim/scopmath/abort.cpp" "sim/scopmath/crout_thread.cpp" "sim/scopmath/newton_thread.cpp"
"sim/scopmath/sparse_thread.cpp" "sim/scopmath/ssimplic_thread.cpp")
@@ -129,16 +132,16 @@ if(CORENRN_ENABLE_GPU)
# CMake <v3.20 does not pass explicit -x <lang> options based on the LANGUAGE property
# (https://cmake.org/cmake/help/latest/policy/CMP0119.html), so using a single .cu file and
# setting LANGUAGE=CXX in non-GPU builds does not work.
list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp")
#list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp")
list(APPEND CORENEURON_CODE_FILES ${CORENEURON_CUDA_FILES})

# Eigen-3.5+ provides better GPU support. However, some functions cannot be called directly from
# within an OpenACC region. Therefore, we need to wrap them in a special API (decorate them with
# __device__ & acc routine tokens), which allows us to eventually call them from OpenACC. Calling
# these functions from CUDA kernels presents no issue ...
if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
endif()
# if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
# list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
# endif()
endif()
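
As a side note to the CMP0119 comment above: with CMake >= 3.20 the separate .cpp/.cu pair could in principle be replaced by a single .cu file whose language is forced to CXX in non-GPU builds. A sketch, not part of this diff:

    cmake_policy(SET CMP0119 NEW)  # needs CMake >= 3.20 so that -x c++ is passed explicitly
    if(NOT CORENRN_ENABLE_GPU)
      # Build the CUDA source as ordinary C++ when GPU support is disabled
      set_source_files_properties(utils/randoms/nrnran123.cu PROPERTIES LANGUAGE CXX)
    endif()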

# =============================================================================
Expand Down Expand Up @@ -198,6 +201,8 @@ if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC)
target_link_libraries(coreneuron ${MPI_CXX_LIBRARIES})
endif()

target_link_libraries(coreneuron CUDA::cudart)

# this is where we handle dynamic mpi library build
if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC)
# ~~~
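The CUDA::cudart target linked above is the imported target provided by CMake's FindCUDAToolkit module, so it only exists after a successful find_package(CUDAToolkit) call (OpenAccHelper.cmake does this for GPU builds). A minimal sketch of that requirement; if the unconditional link is intended to cover non-GPU builds as well, those configurations would also need the find_package call.

    # CUDA::cudart is defined by FindCUDAToolkit
    find_package(CUDAToolkit 9.0 REQUIRED)
    target_link_libraries(coreneuron CUDA::cudart)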
4 changes: 3 additions & 1 deletion coreneuron/permute/cellorder.cpp
@@ -490,6 +490,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
bool has_subtrees_to_compute = true;

// clang-format off
// OL211207: check if we need an OpenMP directive here.
nrn_pragma_acc(loop seq)
for (; has_subtrees_to_compute; ) { // ncycle loop
#ifndef CORENEURON_ENABLE_GPU
@@ -538,6 +539,7 @@ static void bksub_interleaved2(NrnThread* nt,
#ifndef CORENEURON_ENABLE_GPU
for (int i = root; i < lastroot; i += 1) {
#else
// OL211207: check if we need an OpenMP directive here.
nrn_pragma_acc(loop seq)
for (int i = root; i < lastroot; i += warpsize) {
#endif
@@ -661,7 +663,7 @@ void solve_interleaved1(int ith) {
lastnode [0:ncell],
cellsize [0:ncell]) if (nt->compute_gpu)
async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
// nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
for (int icell = 0; icell < ncell; ++icell) {
int icellsize = cellsize[icell];
triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
4 changes: 2 additions & 2 deletions coreneuron/sim/scopmath/sparse_thread.cpp
@@ -211,8 +211,8 @@ static int matsol(SparseObj* so, int _iml) {
/* Upper triangularization */
so->numop = 0;
for (unsigned i = 1; i <= so->neqn; i++) {
Elm* pivot;
if (fabs((pivot = so->diag[i])->value[_iml]) <= ROUNDOFF) {
Elm* pivot{so->diag[i]};
if (fabs(pivot->value[_iml]) <= ROUNDOFF) {
return SINGULAR;
}
/* Eliminate all elements in pivot column */
3 changes: 2 additions & 1 deletion coreneuron/utils/memory_utils.cpp
@@ -108,7 +108,8 @@ void report_mem_usage(const char* message, bool all_ranks) {
mem_avg);
#ifdef CORENEURON_ENABLE_GPU
if (corenrn_param.gpu) {
print_gpu_memory_usage();
// TODO: temporary to avoid CUDA code usage with LLVM build
//print_gpu_memory_usage();
}
#endif
}