diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake index fc0b0b551..0ebc7bea9 100644 --- a/CMake/MakefileBuildOptions.cmake +++ b/CMake/MakefileBuildOptions.cmake @@ -50,6 +50,11 @@ list(REMOVE_ITEM CORENRN_LINK_LIBS "Threads::Threads") # replicate CMake magic to transform system libs to -l foreach(link_lib ${CORENRN_LINK_LIBS}) + if(TARGET ${link_lib}) + get_property(link_flag TARGET ${link_lib} PROPERTY INTERFACE_LINK_LIBRARIES) + string(APPEND CORENRN_COMMON_LDFLAGS " ${link_flag}") + continue() + endif() if(${link_lib} MATCHES "\-l.*") string(APPEND CORENRN_COMMON_LDFLAGS " ${link_lib}") continue() diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 5838742f8..8f14fb754 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -42,45 +42,68 @@ if(CORENRN_ENABLE_GPU) # ${CMAKE_CUDA_COMPILER_VERSION} find_package(CUDAToolkit 9.0 REQUIRED) # Be a bit paranoid - if(NOT ${CMAKE_CUDA_COMPILER_VERSION} STREQUAL ${CUDAToolkit_VERSION}) - message( - FATAL_ERROR - "CUDA compiler (${CMAKE_CUDA_COMPILER_VERSION}) and toolkit (${CUDAToolkit_VERSION}) versions are not the same!" - ) - endif() + # if(NOT ${CMAKE_CUDA_COMPILER_VERSION} STREQUAL ${CUDAToolkit_VERSION}) + # message( + # FATAL_ERROR + # "CUDA compiler (${CMAKE_CUDA_COMPILER_VERSION}) and toolkit (${CUDAToolkit_VERSION}) versions are not the same!" + # ) + # endif() set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}") endif() - # -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code - # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code - # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in - # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP - # code is compiled with the same CUDA version as the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") - # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA - # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the - # same default compute capabilities as each other, particularly on GPU-less build machines. - foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES}) - string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}") - endforeach() - if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP") - # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available - # for a region then prefer OpenMP. - add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) - string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu") - elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC") - # Only enable OpenACC offload for GPU - string(APPEND NVHPC_ACC_COMP_FLAGS " -acc") + if(CORENRN_HAVE_NVHPC_COMPILER) + # -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code + # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code + # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in + # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP + # code is compiled with the same CUDA version as the explicit CUDA code. + set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") + # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA + # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the + # same default compute capabilities as each other, particularly on GPU-less build machines. + foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES}) + string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}") + endforeach() + if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP") + # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available + # for a region then prefer OpenMP. + add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) + string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu") + elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC") + # Only enable OpenACC offload for GPU + string(APPEND NVHPC_ACC_COMP_FLAGS " -acc") + else() + message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers") + endif() + elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "XLClang") + if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP") + set(NVHPC_ACC_COMP_FLAGS "-qsmp=omp -qoffload -qreport") + set(NVHPC_ACC_LINK_FLAGS "-qcuda -lcaliper") + # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available + # for a region then prefer OpenMP. + add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) + else() + message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with XLClang compilers") + endif() + elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP") + set(NVHPC_ACC_COMP_FLAGS "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Wno-unknown-cuda-version -I${CUDAToolkit_INCLUDE_DIRS}") + # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available + # for a region then prefer OpenMP. + add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) + else() + message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with Clang compilers") + endif() else() - message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers") + message(FATAL_ERROR "${CMAKE_CXX_COMPILER_ID} is not supported in GPU builds.") endif() # avoid PGI adding standard compliant "-A" flags - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) - string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}") + # set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) # Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is # especially needed when we compile with -O0 or -O1 optimisation level where we get link errors. # Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined # for OpenACC code generation. set(NVHPC_CXX_INLINE_FLAGS "-Mautoinline") + set(NVHPC_CXX_INLINE_FLAGS) endif() # ============================================================================= diff --git a/CMakeLists.txt b/CMakeLists.txt index d3e1950d0..4ceb1d147 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,13 +127,13 @@ if(CORENRN_ENABLE_GPU) # CUDA_RESOLVE_DEVICE_SYMBOLS OFF) # Fail hard and early if we don't have the PGI/NVHPC compiler. - if(NOT CORENRN_HAVE_NVHPC_COMPILER) - message( - FATAL_ERROR - "GPU support is available via OpenACC using PGI/NVIDIA compilers." - " Use NVIDIA HPC SDK with -DCMAKE_C_COMPILER=nvc -DCMAKE_CUDA_COMPILER=nvcc -DCMAKE_CXX_COMPILER=nvc++" - ) - endif() + # if(NOT CORENRN_HAVE_NVHPC_COMPILER) + # message( + # FATAL_ERROR + # "GPU support is available via OpenACC using PGI/NVIDIA compilers." + # " Use NVIDIA HPC SDK with -DCMAKE_C_COMPILER=nvc -DCMAKE_CUDA_COMPILER=nvcc -DCMAKE_CXX_COMPILER=nvc++" + # ) + # endif() # Set some sensible default CUDA architectures. if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) @@ -184,9 +184,9 @@ if(CORENRN_ENABLE_GPU) # 3057 : Warning on ignoring __host__ annotation in some functions # 3085 : Warning on redeclaring a __host__ function as __host__ __device__ # ~~~ - set(CMAKE_CUDA_FLAGS - "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085" - ) + # set(CMAKE_CUDA_FLAGS + # "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085" + # ) if(CORENRN_ENABLE_NMODL) # NMODL supports both OpenACC and OpenMP target offload @@ -419,6 +419,9 @@ endif() # ============================================================================= set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${IGNORE_UNKNOWN_PRAGMA_FLAGS}") +# Set coreneuron specific linker flag after NMODL submodule is added +string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}") + # ============================================================================= # Add main directories # ============================================================================= diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index d370df1df..c9af89553 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -32,6 +32,9 @@ file( "utils/*/*.c" "utils/*/*.cpp") file(GLOB_RECURSE CORENEURON_CUDA_FILES "*.cu") +list(REMOVE_ITEM CORENEURON_CUDA_FILES "${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu") +list(REMOVE_ITEM CORENEURON_CUDA_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/profile/cuda_profile.cu") +list(REMOVE_ITEM CORENEURON_CUDA_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cu") set(SCOPMATH_CODE_FILES "sim/scopmath/abort.cpp" "sim/scopmath/crout_thread.cpp" "sim/scopmath/newton_thread.cpp" "sim/scopmath/sparse_thread.cpp" "sim/scopmath/ssimplic_thread.cpp") @@ -129,16 +132,16 @@ if(CORENRN_ENABLE_GPU) # CMake options based on the LANGUAGE property # (https://cmake.org/cmake/help/latest/policy/CMP0119.html), so using a single .cu file and # setting LANGUAGE=CXX in non-GPU builds does not work. - list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp") + #list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp") list(APPEND CORENEURON_CODE_FILES ${CORENEURON_CUDA_FILES}) # Eigen-3.5+ provides better GPU support. However, some functions cannot be called directly from # within an OpenACC region. Therefore, we need to wrap them in a special API (decorate them with # __device__ & acc routine tokens), which allows us to eventually call them from OpenACC. Calling # these functions from CUDA kernels presents no issue ... - if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu) - list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu) - endif() + # if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu) + # list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu) + # endif() endif() # ============================================================================= @@ -198,6 +201,8 @@ if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC) target_link_libraries(coreneuron ${MPI_CXX_LIBRARIES}) endif() +target_link_libraries(coreneuron CUDA::cudart) + # this is where we handle dynamic mpi library build if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC) # ~~~ diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 14feb31de..d9e559616 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -490,6 +490,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid bool has_subtrees_to_compute = true; // clang-format off + // OL211207: check if we need an OpenMP directive here. nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop #ifndef CORENEURON_ENABLE_GPU @@ -538,6 +539,7 @@ static void bksub_interleaved2(NrnThread* nt, #ifndef CORENEURON_ENABLE_GPU for (int i = root; i < lastroot; i += 1) { #else + // OL211207: check if we need an OpenMP directive here. nrn_pragma_acc(loop seq) for (int i = root; i < lastroot; i += warpsize) { #endif @@ -661,7 +663,7 @@ void solve_interleaved1(int ith) { lastnode [0:ncell], cellsize [0:ncell]) if (nt->compute_gpu) async(nt->stream_id)) - nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) + // nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icell = 0; icell < ncell; ++icell) { int icellsize = cellsize[icell]; triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode); diff --git a/coreneuron/sim/scopmath/sparse_thread.cpp b/coreneuron/sim/scopmath/sparse_thread.cpp index 71643430a..70da9aa21 100644 --- a/coreneuron/sim/scopmath/sparse_thread.cpp +++ b/coreneuron/sim/scopmath/sparse_thread.cpp @@ -211,8 +211,8 @@ static int matsol(SparseObj* so, int _iml) { /* Upper triangularization */ so->numop = 0; for (unsigned i = 1; i <= so->neqn; i++) { - Elm* pivot; - if (fabs((pivot = so->diag[i])->value[_iml]) <= ROUNDOFF) { + Elm* pivot{so->diag[i]}; + if (fabs(pivot->value[_iml]) <= ROUNDOFF) { return SINGULAR; } /* Eliminate all elements in pivot column */ diff --git a/coreneuron/utils/memory_utils.cpp b/coreneuron/utils/memory_utils.cpp index e066e1627..da0ef9455 100644 --- a/coreneuron/utils/memory_utils.cpp +++ b/coreneuron/utils/memory_utils.cpp @@ -108,7 +108,8 @@ void report_mem_usage(const char* message, bool all_ranks) { mem_avg); #ifdef CORENEURON_ENABLE_GPU if (corenrn_param.gpu) { - print_gpu_memory_usage(); + // TODO: temporary to avoid CUDA code usage with LLVM build + //print_gpu_memory_usage(); } #endif }