This repository has been archived by the owner on Mar 20, 2023. It is now read-only.

Fixes for building with LLVM / XL OpenMP offload #706

Draft: wants to merge 9 commits into base: master
5 changes: 5 additions & 0 deletions CMake/MakefileBuildOptions.cmake
@@ -50,6 +50,11 @@ list(REMOVE_ITEM CORENRN_LINK_LIBS "Threads::Threads")

# replicate CMake magic to transform system libs to -l<libname>
foreach(link_lib ${CORENRN_LINK_LIBS})
if(TARGET ${link_lib})
get_property(link_flag TARGET ${link_lib} PROPERTY INTERFACE_LINK_LIBRARIES)
string(APPEND CORENRN_COMMON_LDFLAGS " ${link_flag}")
continue()
endif()
if(${link_lib} MATCHES "\-l.*")
string(APPEND CORENRN_COMMON_LDFLAGS " ${link_lib}")
continue()
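For reference, a minimal standalone sketch (not part of this diff; the target name and paths are hypothetical) of what the new TARGET branch does: when a link library is a CMake target rather than a plain -l flag, its link line is read from the INTERFACE_LINK_LIBRARIES property and appended to CORENRN_COMMON_LDFLAGS.

    # An imported interface target that carries its own link line
    add_library(third_party_dep INTERFACE IMPORTED)
    set_property(TARGET third_party_dep PROPERTY INTERFACE_LINK_LIBRARIES
                 "-L/opt/third_party/lib -lthird_party")

    # What the new branch above does for such a target
    get_property(link_flag TARGET third_party_dep PROPERTY INTERFACE_LINK_LIBRARIES)
    string(APPEND CORENRN_COMMON_LDFLAGS " ${link_flag}")
    # CORENRN_COMMON_LDFLAGS now ends with " -L/opt/third_party/lib -lthird_party"

Note that the sketch assumes the property holds a single flag string; if it held a semicolon-separated list of entries, string(APPEND) would paste the semicolons into the flag string unchanged.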
81 changes: 52 additions & 29 deletions CMake/OpenAccHelper.cmake
@@ -42,45 +42,68 @@ if(CORENRN_ENABLE_GPU)
# ${CMAKE_CUDA_COMPILER_VERSION}
find_package(CUDAToolkit 9.0 REQUIRED)
# Be a bit paranoid
if(NOT ${CMAKE_CUDA_COMPILER_VERSION} STREQUAL ${CUDAToolkit_VERSION})
message(
FATAL_ERROR
"CUDA compiler (${CMAKE_CUDA_COMPILER_VERSION}) and toolkit (${CUDAToolkit_VERSION}) versions are not the same!"
)
endif()
# if(NOT ${CMAKE_CUDA_COMPILER_VERSION} STREQUAL ${CUDAToolkit_VERSION})
# message(
# FATAL_ERROR
# "CUDA compiler (${CMAKE_CUDA_COMPILER_VERSION}) and toolkit (${CUDAToolkit_VERSION}) versions are not the same!"
# )
# endif()
set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}")
endif()
# -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code
# linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
# and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
# some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
# code is compiled with the same CUDA version as the explicit CUDA code.
set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
# same default compute capabilities as each other, particularly on GPU-less build machines.
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
endforeach()
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu")
elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC")
# Only enable OpenACC offload for GPU
string(APPEND NVHPC_ACC_COMP_FLAGS " -acc")
if(CORENRN_HAVE_NVHPC_COMPILER)
# -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code
# linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
# and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
# some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
# code is compiled with the same CUDA version as the explicit CUDA code.
set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
# same default compute capabilities as each other, particularly on GPU-less build machines.
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
endforeach()
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu")
elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC")
# Only enable OpenACC offload for GPU
string(APPEND NVHPC_ACC_COMP_FLAGS " -acc")
else()
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers")
endif()
elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "XLClang")
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
set(NVHPC_ACC_COMP_FLAGS "-qsmp=omp -qoffload -qreport")
set(NVHPC_ACC_LINK_FLAGS "-qcuda -lcaliper")
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
else()
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with XLClang compilers")
endif()
elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
set(NVHPC_ACC_COMP_FLAGS "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Wno-unknown-cuda-version -I${CUDAToolkit_INCLUDE_DIRS}")
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
else()
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with Clang compilers")
endif()
else()
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers")
message(FATAL_ERROR "${CMAKE_CXX_COMPILER_ID} is not supported in GPU builds.")
endif()
# avoid PGI adding standard compliant "-A" flags
set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}")
# set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
# Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is
# especially needed when we compile with -O0 or -O1 optimisation level where we get link errors.
# Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined
# for OpenACC code generation.
set(NVHPC_CXX_INLINE_FLAGS "-Mautoinline")
set(NVHPC_CXX_INLINE_FLAGS)
endif()

# =============================================================================
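The new Clang branch assumes that the selected compiler can actually generate nvptx64 offload code with -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda. A minimal configure-time probe for that, not part of this diff and with a hypothetical result variable name, could look like the following; because the check both compiles and links a tiny offload program, it also tends to catch a missing device toolchain.

    include(CheckCXXSourceCompiles)
    set(CMAKE_REQUIRED_FLAGS "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda")
    check_cxx_source_compiles("
      int main() {
      #pragma omp target
        { }
        return 0;
      }"
      CORENRN_CXX_CAN_OFFLOAD_NVPTX)
    unset(CMAKE_REQUIRED_FLAGS)
    if(NOT CORENRN_CXX_CAN_OFFLOAD_NVPTX)
      message(FATAL_ERROR "The selected C++ compiler cannot build OpenMP target offload code for nvptx64")
    endif()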
23 changes: 13 additions & 10 deletions CMakeLists.txt
@@ -127,13 +127,13 @@ if(CORENRN_ENABLE_GPU)
# CUDA_RESOLVE_DEVICE_SYMBOLS OFF)

# Fail hard and early if we don't have the PGI/NVHPC compiler.
if(NOT CORENRN_HAVE_NVHPC_COMPILER)
message(
FATAL_ERROR
"GPU support is available via OpenACC using PGI/NVIDIA compilers."
" Use NVIDIA HPC SDK with -DCMAKE_C_COMPILER=nvc -DCMAKE_CUDA_COMPILER=nvcc -DCMAKE_CXX_COMPILER=nvc++"
)
endif()
# if(NOT CORENRN_HAVE_NVHPC_COMPILER)
# message(
# FATAL_ERROR
# "GPU support is available via OpenACC using PGI/NVIDIA compilers."
# " Use NVIDIA HPC SDK with -DCMAKE_C_COMPILER=nvc -DCMAKE_CUDA_COMPILER=nvcc -DCMAKE_CXX_COMPILER=nvc++"
# )
# endif()

# Set some sensible default CUDA architectures.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
@@ -184,9 +184,9 @@ if(CORENRN_ENABLE_GPU)
# 3057 : Warning on ignoring __host__ annotation in some functions
# 3085 : Warning on redeclaring a __host__ function as __host__ __device__
# ~~~
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085"
)
# set(CMAKE_CUDA_FLAGS
# "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085"
# )

if(CORENRN_ENABLE_NMODL)
# NMODL supports both OpenACC and OpenMP target offload
@@ -419,6 +419,9 @@ endif()
# =============================================================================
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${IGNORE_UNKNOWN_PRAGMA_FLAGS}")

# Set coreneuron specific linker flag after NMODL submodule is added
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}")

# =============================================================================
# Add main directories
# =============================================================================
13 changes: 9 additions & 4 deletions coreneuron/CMakeLists.txt
@@ -32,6 +32,9 @@ file(
"utils/*/*.c"
"utils/*/*.cpp")
file(GLOB_RECURSE CORENEURON_CUDA_FILES "*.cu")
list(REMOVE_ITEM CORENEURON_CUDA_FILES "${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu")
list(REMOVE_ITEM CORENEURON_CUDA_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/profile/cuda_profile.cu")
list(REMOVE_ITEM CORENEURON_CUDA_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cu")
set(SCOPMATH_CODE_FILES
"sim/scopmath/abort.cpp" "sim/scopmath/crout_thread.cpp" "sim/scopmath/newton_thread.cpp"
"sim/scopmath/sparse_thread.cpp" "sim/scopmath/ssimplic_thread.cpp")
@@ -129,16 +132,16 @@ if(CORENRN_ENABLE_GPU)
# CMake <v3.20 does not pass explicit -x <lang> options based on the LANGUAGE property
# (https://cmake.org/cmake/help/latest/policy/CMP0119.html), so using a single .cu file and
# setting LANGUAGE=CXX in non-GPU builds does not work.
list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp")
#list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp")
list(APPEND CORENEURON_CODE_FILES ${CORENEURON_CUDA_FILES})

# Eigen-3.5+ provides better GPU support. However, some functions cannot be called directly from
# within an OpenACC region. Therefore, we need to wrap them in a special API (decorate them with
# __device__ & acc routine tokens), which allows us to eventually call them from OpenACC. Calling
# these functions from CUDA kernels presents no issue ...
if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
endif()
# if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
# list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cu)
# endif()
endif()
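
As a side note to the CMP0119 comment above: with CMake >= 3.20 the separate .cpp/.cu pair could in principle be replaced by a single .cu file whose language is forced to CXX in non-GPU builds. A sketch, not part of this diff:

    cmake_policy(SET CMP0119 NEW)  # needs CMake >= 3.20 so that -x c++ is passed explicitly
    if(NOT CORENRN_ENABLE_GPU)
      # Build the CUDA source as ordinary C++ when GPU support is disabled
      set_source_files_properties(utils/randoms/nrnran123.cu PROPERTIES LANGUAGE CXX)
    endif()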

# =============================================================================
Expand Down Expand Up @@ -198,6 +201,8 @@ if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC)
target_link_libraries(coreneuron ${MPI_CXX_LIBRARIES})
endif()

target_link_libraries(coreneuron CUDA::cudart)

# this is where we handle dynamic mpi library build
if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC)
# ~~~
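The CUDA::cudart target linked above is the imported target provided by CMake's FindCUDAToolkit module, so it only exists after a successful find_package(CUDAToolkit) call (OpenAccHelper.cmake does this for GPU builds). A minimal sketch of that requirement; if the unconditional link is intended to cover non-GPU builds as well, those configurations would also need the find_package call.

    # CUDA::cudart is defined by FindCUDAToolkit
    find_package(CUDAToolkit 9.0 REQUIRED)
    target_link_libraries(coreneuron CUDA::cudart)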
4 changes: 3 additions & 1 deletion coreneuron/permute/cellorder.cpp
@@ -490,6 +490,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
bool has_subtrees_to_compute = true;

// clang-format off
// OL211207: check if we need an OpenMP directive here.
nrn_pragma_acc(loop seq)
for (; has_subtrees_to_compute; ) { // ncycle loop
#ifndef CORENEURON_ENABLE_GPU
@@ -538,6 +539,7 @@ static void bksub_interleaved2(NrnThread* nt,
#ifndef CORENEURON_ENABLE_GPU
for (int i = root; i < lastroot; i += 1) {
#else
// OL211207: check if we need an OpenMP directive here.
nrn_pragma_acc(loop seq)
for (int i = root; i < lastroot; i += warpsize) {
#endif
@@ -661,7 +663,7 @@ void solve_interleaved1(int ith) {
lastnode [0:ncell],
cellsize [0:ncell]) if (nt->compute_gpu)
async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
// nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
for (int icell = 0; icell < ncell; ++icell) {
int icellsize = cellsize[icell];
triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
4 changes: 2 additions & 2 deletions coreneuron/sim/scopmath/sparse_thread.cpp
@@ -211,8 +211,8 @@ static int matsol(SparseObj* so, int _iml) {
/* Upper triangularization */
so->numop = 0;
for (unsigned i = 1; i <= so->neqn; i++) {
Elm* pivot;
if (fabs((pivot = so->diag[i])->value[_iml]) <= ROUNDOFF) {
Elm* pivot{so->diag[i]};
if (fabs(pivot->value[_iml]) <= ROUNDOFF) {
return SINGULAR;
}
/* Eliminate all elements in pivot column */
3 changes: 2 additions & 1 deletion coreneuron/utils/memory_utils.cpp
@@ -108,7 +108,8 @@ void report_mem_usage(const char* message, bool all_ranks) {
mem_avg);
#ifdef CORENEURON_ENABLE_GPU
if (corenrn_param.gpu) {
print_gpu_memory_usage();
// TODO: temporary to avoid CUDA code usage with LLVM build
//print_gpu_memory_usage();
}
#endif
}