Skip to content
This repository has been archived by the owner on Mar 20, 2023. It is now read-only.

Fixes and improvements from LLVM/XLC work. #716

Merged
merged 3 commits into from
Dec 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMake/MakefileBuildOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE)
set(CORENRN_CXX_FLAGS
"${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}} ${CXX14_STD_FLAGS} ${NVHPC_ACC_COMP_FLAGS} ${NVHPC_CXX_INLINE_FLAGS}"
)
set(CORENRN_LD_FLAGS "${NVHPC_ACC_LINK_FLAGS}")

# =============================================================================
# nmodl/mod2c related options : TODO
Expand Down
55 changes: 4 additions & 51 deletions coreneuron/gpu/nrn_acc_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,14 @@
#include "coreneuron/mpi/nrnmpidec.h"
#include "coreneuron/utils/utils.hpp"

#ifdef CRAYPAT
#include <pat_api.h>
#endif

#ifdef _OPENACC
#include <openacc.h>
#endif
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
#include <omp.h>
#endif

#ifdef CRAYPAT
#include <pat_api.h>
#endif
namespace coreneuron {
extern InterleaveInfo* interleave_info;
void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
Expand All @@ -43,51 +41,6 @@ void nrn_ion_global_map_delete_from_device();
void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
void nrn_VecPlay_delete_from_device(NrnThread* nt);

template <typename T>
T* cnrn_target_deviceptr(const T* h_ptr) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
#else
throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

template <typename T>
T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#pragma omp target enter data map(to:h_ptr[:len])
return cnrn_target_deviceptr(const_cast<T*>(h_ptr));
#else
throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

template <typename T>
void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
acc_delete(h_ptr, len * sizeof(T));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#pragma omp target exit data map(delete: h_ptr[:len])
#else
throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

template <typename T>
void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
omp_target_memcpy(d_ptr, const_cast<T*>(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device());
#else
throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

/* note: threads here are corresponding to global nrn_threads array */
void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
#ifdef _OPENACC
Expand Down
6 changes: 6 additions & 0 deletions coreneuron/kinderiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def write_out_kinderiv(fout):
fout.write("\n/* declarations */\n")
fout.write("\nnamespace coreneuron {\n")

if deriv or kin or euler:
fout.write('nrn_pragma_omp(declare target)\n')

for item in deriv:
fout.write('#pragma acc routine seq\n')
fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))
Expand All @@ -73,6 +76,9 @@ def write_out_kinderiv(fout):
fout.write('#pragma acc routine seq\n')
fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))

if deriv or kin or euler:
fout.write('nrn_pragma_omp(end declare target)\n')

fout.write("\n/* callback indices */\n")
derivoffset = 1
kinoffset = 1
Expand Down
2 changes: 2 additions & 0 deletions coreneuron/mechanism/eion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) {
}
}

nrn_pragma_omp(declare target)
void nrn_wrote_conc(int type,
double* p1,
int p2,
Expand All @@ -193,6 +194,7 @@ void nrn_wrote_conc(int type,
pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);
}
}
nrn_pragma_omp(end declare target)

static double efun(double x) {
if (fabs(x) < 1e-4) {
Expand Down
2 changes: 2 additions & 0 deletions coreneuron/mechanism/mech/dimplic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"
#include "_kinderiv.h"
namespace coreneuron {
nrn_pragma_omp(declare target)
int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) {
difun(fun);
return 0;
Expand All @@ -48,5 +49,6 @@ int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) {
switch (fun) { _NRN_KINETIC_CASES }
return 0;
}
nrn_pragma_omp(end declare target)

} // namespace coreneuron
2 changes: 2 additions & 0 deletions coreneuron/mechanism/register_mech.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

namespace coreneuron {
int secondorder = 0;
nrn_pragma_omp(declare target)
double t, dt, celsius, pi;
nrn_pragma_omp(end declare target)
int rev_dt;

using Pfrv = void (*)();
Expand Down
2 changes: 2 additions & 0 deletions coreneuron/network/cvodestb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,14 @@ void fixed_play_continuous(NrnThread* nt) {

// NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc"
// for the ISPC backend. If changes are required, make sure to change ISPC as well.
nrn_pragma_omp(declare target)
int at_time(NrnThread* nt, double te) {
double x = te - 1e-11;
if (x <= nt->_t && x > (nt->_t - nt->_dt)) {
return 1;
}
return 0;
}
nrn_pragma_omp(end declare target)

} // namespace coreneuron
2 changes: 1 addition & 1 deletion coreneuron/network/netcvode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method
nrn_pragma_acc(parallel loop present(
nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu))
nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
for (int i = 0; i < nt->ncell; ++i) {
PreSyn* ps = presyns + i;
PreSynHelper* psh = presyns_helper + i;
Expand Down
2 changes: 2 additions & 0 deletions coreneuron/sim/scopmath/crout_thread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ namespace coreneuron {
#define ix(arg) ((arg) *_STRIDE)

/* having a differnt permutation per instance may not be a good idea */
nrn_pragma_omp(declare target)
int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) {
int save_i = 0;

Expand Down Expand Up @@ -224,4 +225,5 @@ void nrn_scopmath_solve_thread(int n,
}
}
}
nrn_pragma_omp(end declare target)
} // namespace coreneuron
2 changes: 2 additions & 0 deletions coreneuron/sim/scopmath/newton_thread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ namespace coreneuron {
#define ix(arg) ((arg) *_STRIDE)
#define s_(arg) _p[s[arg] * _STRIDE]

nrn_pragma_omp(declare target)
int nrn_newton_thread(NewtonSpace* ns,
int n,
int* s,
Expand Down Expand Up @@ -136,6 +137,7 @@ int nrn_newton_thread(NewtonSpace* ns,

return (error);
}
nrn_pragma_omp(end declare target)

/*------------------------------------------------------------*/
/* */
Expand Down
12 changes: 6 additions & 6 deletions coreneuron/sim/treeset_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ static void nrn_rhs(NrnThread* _nt) {

nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
for (int i = i1; i < i3; ++i) {
vec_rhs[i] = 0.;
vec_d[i] = 0.;
Expand All @@ -46,7 +46,7 @@ static void nrn_rhs(NrnThread* _nt) {
nrn_pragma_acc(
parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
for (int i = i1; i < i3; ++i) {
fast_imem_d[i] = 0.;
fast_imem_rhs[i] = 0.;
Expand Down Expand Up @@ -76,7 +76,7 @@ static void nrn_rhs(NrnThread* _nt) {
double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
for (int i = i1; i < i3; ++i) {
p[i] -= vec_rhs[i];
}
Expand All @@ -93,7 +93,7 @@ static void nrn_rhs(NrnThread* _nt) {
vec_v [0:i3],
parent_index [0:i3]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
for (int i = i2; i < i3; ++i) {
double dv = vec_v[parent_index[i]] - vec_v[i];
/* our connection coefficients are negative so */
Expand Down Expand Up @@ -153,7 +153,7 @@ static void nrn_lhs(NrnThread* _nt) {
*/
double* p = _nt->nrn_fast_imem->nrn_sav_d;
nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
for (int i = i1; i < i3; ++i) {
p[i] += vec_d[i];
}
Expand All @@ -163,7 +163,7 @@ static void nrn_lhs(NrnThread* _nt) {
nrn_pragma_acc(parallel loop present(
vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
for (int i = i2; i < i3; ++i) {
nrn_pragma_acc(atomic update)
nrn_pragma_omp(atomic update)
Expand Down
53 changes: 53 additions & 0 deletions coreneuron/utils/offload.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,64 @@
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#include <omp.h>
#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
defined(_OPENACC)
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#include <openacc.h>
#else
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x)
#include <stdexcept>
#endif

#include <cstddef>

namespace coreneuron {
template <typename T>
T* cnrn_target_deviceptr(const T* h_ptr) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
#else
throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

template <typename T>
T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#pragma omp target enter data map(to:h_ptr[:len])
return cnrn_target_deviceptr(const_cast<T*>(h_ptr));
#else
throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

template <typename T>
void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
acc_delete(h_ptr, len * sizeof(T));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#pragma omp target exit data map(delete: h_ptr[:len])
#else
throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

template <typename T>
void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
omp_target_memcpy(d_ptr, const_cast<T*>(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device());
#else
throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

}
4 changes: 2 additions & 2 deletions extra/nrnivmodl_core_makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ endif

CXXFLAGS = @CORENRN_CXX_FLAGS@
CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES)
CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@
CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@
CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_EXE_LINKER_FLAGS@
CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@

# ISPC compilation and link commands
ISPC = @CMAKE_ISPC_COMPILER@
Expand Down