Added SYCL2020 compatibility
Cielo committed Feb 21, 2023
1 parent bb71dac commit 986c137
Showing 18 changed files with 232 additions and 151 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
*.swp
*.swo
/build/
/.ccls-cache/
104 changes: 72 additions & 32 deletions CMakeLists.txt
@@ -29,28 +29,59 @@ set(MYBUILDTYPE Release Debug RelWithDebInfo MinSizeRel)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build")
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${MYBUILDTYPE})

# Unfortunately, there is no proper support of SyCL in CMake as of now.
# Unfortunately, there is no proper support of SYCL in CMake as of now.
# Compilation without passing any environment variables was given as a
# requirement, hence, we set the compiler here directly (CMake discourages this).
set(SYCL oneAPI CACHE STRING "Select the SYCL target architecture")
set(MYSYCL oneAPI oneAPIold LLVM hipSYCL)
set(MYSYCL oneAPI oneAPIold LLVM OpenSYCL)
set_property(CACHE SYCL PROPERTY STRINGS ${MYSYCL})
target_compile_definitions(echo PRIVATE SYCL=${SYCL})
# TODO: So far the only target for LLVM is NVIDIA, and for hipSYCL omp/cpu. Add the others!
if (SYCL STREQUAL LLVM)
set(CMAKE_CXX_COMPILER "clang++")
message(STATUS "Configuring Echo with LLVM (NVIDIA) Toolchain")
target_compile_options(echo PUBLIC -fsycl -fsycl-targets=nvptx64-nvidia-cuda)
target_link_options (echo PUBLIC -fsycl -fsycl-targets=nvptx64-nvidia-cuda)
message(STATUS "Set device target to NVIDIA PTX64")
elseif (SYCL STREQUAL hipSYCL )
target_compile_options(echo PUBLIC -fsycl)
target_link_options (echo PUBLIC -fsycl)

# A bit cumbersome to support all!
if (SYCL STREQUAL OpenSYCL )
set(CMAKE_CXX_COMPILER "syclcc-clang")
message(STATUS "Configuring Echo with hipSycl Toolchain")
else()
set(CMAKE_CXX_COMPILER "dpcpp")
message(STATUS "Configuring Echo with LLVM (Intel) Toolchain")
target_compile_options(echo PUBLIC -fsycl -fsycl-unnamed-lambda -xHost)
target_link_options (echo PUBLIC -fsycl)
message(STATUS "Configuring Echo with OpenSycl Toolchain")
elseif (SYCL STREQUAL LLVM)
set(CMAKE_CXX_COMPILER "clang++")
set(SYCL_LLVM_CHAIN Intel CACHE STRING "Intel/NVidia/AMD")
set(MYCHAIN AMD Intel NVidia)
set_property(CACHE SYCL_LLVM_CHAIN PROPERTY STRINGS ${MYCHAIN})
message(STATUS "Configuring Echo with LLVM (${SYCL_LLVM_CHAIN}) Toolchain")
if (SYCL_LLVM_CHAIN STREQUAL AMD)
target_compile_options(echo PUBLIC -fsycl-targets=amdgcn-amd-amdhsa)
target_link_options (echo PUBLIC -fsycl-targets=amdgcn-amd-amdhsa)
set(SYCL_LLVM_ARCH gfx90a CACHE STRING "Run sycl-ls when in doubt.")
set(MYOFFLOAD_ARCH gfx90a gfx908 gfx906)
set_property(CACHE SYCL_LLVM_ARCH PROPERTY STRINGS ${MYOFFLOAD_ARCH})
target_compile_options(echo PUBLIC -Xsycl-target-backend --offload-arch=${SYCL_LLVM_ARCH})
target_link_options (echo PUBLIC -Xsycl-target-backend --offload-arch=${SYCL_LLVM_ARCH})
message(STATUS "Set device target to AMDGCN ${SYCL_LLVM_ARCH}")
elseif (SYCL_LLVM_CHAIN STREQUAL NVidia)
# Atm only AMD needs the arch. specified. Once needed for NVIDIA as well, do it here.
unset(SYCL_LLVM_ARCH CACHE)
target_compile_options(echo PUBLIC -fsycl-targets=nvptx64-nvidia-cuda)
target_link_options (echo PUBLIC -fsycl-targets=nvptx64-nvidia-cuda)
include(CheckLanguage)
check_language(CUDA)
if (CMAKE_CUDA_COMPILER)
enable_language(CUDA)
target_compile_options(echo PUBLIC --cuda-path=${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../../..)
target_link_options (echo PUBLIC --cuda-path=${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../../..)
#target_compile_options(echo PUBLIC --cuda-path=${CMAKE_CUDA_TOOLKIT_LIBRARY_ROOT})
#target_link_options (echo PUBLIC --cuda-path=${CMAKE_CUDA_TOOLKIT_LIBRARY_ROOT})
message(STATUS "Added explicit CUDA language.")
else ( )
message(WARNING "Explicit CUDA not found. With Codeplay NVidia plugin you should be fine. Otherwise...")
endif ( )
message(STATUS "Set device target to NVIDIA PTX64.")
endif ( )
else ( )
unset(SYCL_LLVM_CHAIN CACHE)
unset(SYCL_LLVM_ARCH CACHE)
set(CMAKE_CXX_COMPILER "icpx")
message(STATUS "Configuring Echo with oneAPI (Intel) Toolchain")
message(STATUS "Set device target to SPIR-V Just-in-Time Compilation")
#-- Stuff specific for oneAPI (for the moment)
option(ENABLE_MPI "Compile with MPI besides DPC++" OFF)
@@ -96,16 +127,26 @@ else()
endif()

# -- DPC++ and Runtime
option(GPU "Uses device selector rather than host" OFF)
if(GPU)
set(SUFFIX "${SUFFIX}_gpu")
target_compile_definitions(echo PRIVATE GPU)
endif(GPU)
set(SYCL_DEVICE CPU CACHE STRING "Select the primary SYCL device (default/CPU/GPU/etc.). Overridden by forceDevice in echo.par")
set(MYDEVICE DEF CPU GPU ACC FPGA)
set_property(CACHE SYCL_DEVICE PROPERTY STRINGS ${MYDEVICE})
target_compile_definitions(echo PRIVATE DEVICE=DEV_${SYCL_DEVICE})
if(NOT SYCL_DEVICE STREQUAL DEF )
string(TOLOWER ${SYCL_DEVICE} DEVSUFFIX)
set(SUFFIX "${SUFFIX}_${DEVSUFFIX}")
endif()
# TODO: Should be done way better than this! Also a10sx and s10sx were options...
if(SYCL_DEVICE STREQUAL FPGA )
message(WARNING "VERY experimental Intel FPGA workflow (probably not yet supported)!")
target_compile_options(echo PUBLIC -fsycl-link -fintelfpga -Xshardware -Xsboard-package=/glob/development-tools/oneapi/oneapi/intel_s10sx_pac -Xsboard=pac_s10_usm)
target_link_options (echo PUBLIC -fsycl-link -fintelfpga -Xshardware -Xsboard-package=/glob/development-tools/oneapi/oneapi/intel_s10sx_pac -Xsboard=pac_s10_usm)
endif()
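
Note: for preprocessor checks of the form "#if DEVICE==DEV_CPU" or "#if SYCL<=oneAPI" (seen later in DeviceConfig.cpp) to distinguish targets reliably, the tokens injected here via SYCL=${SYCL} and DEVICE=DEV_${SYCL_DEVICE} presumably expand to distinct integers in a shared header. The snippet below is only a hypothetical sketch of such a mapping, not code from this commit:

    // Hypothetical sketch (not from this commit). Distinct integer values are needed
    // because undefined identifiers all evaluate to 0 inside #if, which would make
    // every equality test appear true.
    #define DEV_DEF   0
    #define DEV_CPU   1
    #define DEV_GPU   2
    #define DEV_ACC   3
    #define DEV_FPGA  4
    #define oneAPIold 1
    #define oneAPI    2
    #define LLVM      3
    #define OpenSYCL  4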

# -- Compiling
try_compile(COMPILE_SUCCEEDED
${CMAKE_BINARY_DIR}/check
SOURCES ${CMAKE_SOURCE_DIR}/cmake/checks/fscheck.cpp
CXX_STANDARD 17
${CMAKE_BINARY_DIR}/check
SOURCES ${CMAKE_SOURCE_DIR}/cmake/checks/fscheck.cpp
CXX_STANDARD 17
)
if(COMPILE_SUCCEEDED)
message(STATUS "C++ filesystem API available. Using that.")
@@ -116,15 +157,14 @@ endif()
CHECK_INCLUDE_FILE_CXX("filesystem.h" FSH_FOUND)
CHECK_INCLUDE_FILE_CXX(filesystem FS_FOUND)


option(ENABLE_MEMORY_SANITATION "Enable memory sanitizer to find and track invalid memory accesses. Currently disabled." OFF)
#if (ENABLE_MEMORY_SANITATION AND NOT GPU)
# message(STATUS "Memory sanitation enabled. Linking appropriate libraries.")
# target_compile_options(echo PUBLIC -fno-omit-frame-pointer -fsanitize=address -fsanitize=undefined -fsanitize-address-use-after-scope -Wuninitialized -g)
# target_link_libraries(echo PUBLIC -fno-omit-frame-pointer -fsanitize=address -fsanitize=undefined -g)
#elseif(ENABLE_MEMORY_SANITATION)
# message(WARNING "Memory sanitation is only available with disabled GPU for now.")
#endif()
if (ENABLE_MEMORY_SANITATION) # AND NOT GPU)
message(STATUS "Memory sanitation enabled. Linking appropriate libraries.")
target_compile_options(echo PUBLIC -fno-omit-frame-pointer -Wuninitialized -g) # -fsanitize=address -fsanitize=undefined -fsanitize-address-use-after-scope
target_link_libraries (echo PUBLIC -fno-omit-frame-pointer -g) # -fsanitize=address -fsanitize=undefined
elseif(ENABLE_MEMORY_SANITATION)
message(WARNING "Memory sanitation is only available with disabled GPU for now.")
endif()

option(SINGLE_PRECISION "Change field from double to float" OFF)
if(SINGLE_PRECISION)
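The field type used throughout the C++ sources below presumably switches on this option (its help string says "Change field from double to float"). A minimal sketch of how such a typedef is typically wired up, assuming the option is forwarded as a compile definition; the real typedef lives elsewhere in the repository and is not shown in this diff:

    // Minimal sketch, assuming -DSINGLE_PRECISION is passed when the option is ON.
    #ifdef SINGLE_PRECISION
    using field = float;    // single precision
    #else
    using field = double;   // default
    #endif
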
63 changes: 49 additions & 14 deletions DeviceConfig.cpp
@@ -13,17 +13,54 @@
#include <sstream>

DeviceConfig::DeviceConfig() {
for (auto const &p : mysycl::platform::get_platforms()) {
for (auto dev : p.get_devices()) {
for (auto const &p : mysycl::platform::get_platforms())
for ( auto dev : p.get_devices() )
devices.push_back(dev);
}
}

device DeviceConfig::deviceWith(int id){
Logger *log = Logger::getInstance(); log->setPar(true);
device temp;
if( (id>=0) && (id<devices.size()) ){
((*log)+2)<<TAG <<"Looking at device #"<<id; log->fl();
temp = devices[id];
}else{
((*log)+2)<<TAG <<"Looking at DPEcho default device."; log->fl();
listDevices();
#if SYCL==oneAPI || SYCL==LLVM
#if DEVICE==DEV_CPU
temp = mysycl::device(mysycl::cpu_selector_v); // mysycl::cpu_selector_v fallBackSel;
#elif DEVICE==DEV_GPU
temp = mysycl::device(mysycl::gpu_selector_v); // mysycl::gpu_selector_v fallBackSel;
#elif DEVICE==DEV_ACC
temp = mysycl::device(mysycl::accelerator_selector_v); // mysycl::accelerator_selector_v fallBackSel;
#elif DEVICE==DEV_FPGA
temp = mysycl::device(mysycl::accelerator_selector_v); // mysycl::accelerator_selector_v fallBackSel;
#else // host is deprecated in SYCL2020
temp = mysycl::device(mysycl::default_selector_v); // mysycl::default_selector_v fallBackSel;
#endif
#else
#if DEVICE==DEV_CPU
mysycl::cpu_selector fallBackSel;
#elif DEVICE==DEV_GPU
mysycl::gpu_selector fallBackSel;
#elif DEVICE==DEV_ACC
mysycl::accelerator_selector fallBackSel;
#elif DEVICE==DEV_FPGA
mysycl::accelerator_selector fallBackSel;
#else // host is deprecated in SYCL2020
mysycl::default_selector fallBackSel;
#endif
temp = mysycl::device(fallBackSel);
#endif
}
printTargetInfo(temp);
return temp;
}

void DeviceConfig::listDevices() {
Logger *log = Logger::getInstance();
log->setPar(true);
((*log) + 2) << TAG <<"\n\t# SYCL devices:\t" << devices.size() ;
Logger *log = Logger::getInstance(); log->setPar(false);
((*log) + 2) << TAG <<"\n\t# Available SYCL devices:\t" << devices.size() ;
for (size_t i = 0; i < devices.size(); i++) {
bool hasDpSupport = devices[i].has(aspect::fp64);
(*log) <<"\n\t- Device #" << i << ":\t"
@@ -64,19 +101,17 @@ device DeviceConfig::debugDevice() {
throw std::runtime_error("No debug device is available on this machine!");
}

void DeviceConfig::printTargetInfo ( mysycl::queue q) {
Logger *Log = Logger::getInstance(); Log->setPar(false);
auto dev = q.get_device();
void DeviceConfig::printTargetInfo(device dev) {
Logger *Log = Logger::getInstance(); Log->setPar(true);
*Log+0<<TAG
<< "\n\tHardware " << dev.get_info<info::device::name>() << " is " << (dev.is_host()? "HOST ":"")
<< "\n\tHardware " << dev.get_info<info::device::name>() // << " is " << (dev.is_host()? "HOST ":"")
<< (dev.is_cpu()? "CPU ":"") << (dev.is_gpu()? "GPU ":"") << (dev.is_accelerator()? " ACCELERATOR ":"")
<< "\n\tMax Compute Units : " << dev.get_info<info::device::max_compute_units> ();
#if SYCL <= ONEAPI
<< "\n\tMax Compute Units : " << dev.get_info<info::device::max_compute_units> (); Log->fl();
#if SYCL<=oneAPI
*Log+0<< "\n\tMax Work Group Size: " << dev.get_info<info::device::max_work_group_size>()
<< "\n\tGlobal Memory / GB : " << dev.get_info<info::device::global_mem_size> ()/pow(1024.0, 3)
<< "\n\tLocal Memory / kB : " << dev.get_info<info::device::local_mem_size> ()/1024.0 ;
<< "\n\tLocal Memory / kB : " << dev.get_info<info::device::local_mem_size> ()/1024.0 ; Log->fl();
#else
Log->fl();
*Log+18<<"\n\tMax Work Group Size, Global and Local Memory queries are handled differently outside oneAPI.";
#endif
Log->fl();
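The SYCL 2020 change driving the #if blocks above: the SYCL 1.2.1 selector classes (cpu_selector, gpu_selector, ...) are replaced by selector objects such as sycl::cpu_selector_v, and host devices / device::is_host() are gone. A stand-alone sketch of the new style, written against plain sycl:: rather than the repository's mysycl alias:

    // Stand-alone SYCL 2020 sketch (not repository code); compile with e.g. icpx -fsycl.
    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
      // SYCL 2020: pass a *_selector_v object directly (cpu_selector_v, gpu_selector_v,
      // accelerator_selector_v, default_selector_v); the old selector classes are deprecated.
      sycl::device dev{sycl::default_selector_v};

      // Aspect query, as used in listDevices() above to flag double-precision support.
      std::cout << dev.get_info<sycl::info::device::name>() << "\n"
                << "fp64 support:      " << dev.has(sycl::aspect::fp64) << "\n"
                << "max compute units: " << dev.get_info<sycl::info::device::max_compute_units>() << "\n";
    }
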
16 changes: 8 additions & 8 deletions DeviceConfig.hpp
@@ -17,16 +17,16 @@

struct DeviceConfig {
private:
std::vector<device> devices;
std::vector<device> devices;

public:
DeviceConfig();
void printTargetInfo(mysycl::queue);
void listDevices();
device deviceWith(int id);
device debugDevice();
std::vector<device> gpus();
std::vector<device> cpus();
DeviceConfig();
void printTargetInfo(mysycl::device);
void listDevices();
device deviceWith(int imd);
device debugDevice();
std::vector<device> gpus();
std::vector<device> cpus();
};

#endif
2 changes: 0 additions & 2 deletions Domain.hpp
@@ -21,8 +21,6 @@ class Domain {
private:
Logger *Log;
mysycl::queue qq; field *bufL, *bufR; // Tools for the BCex
// Most of the following could be public const and save us the functions below
// IF we pass values from constructor instead of reading echo.par in there.
int cartDims_[3], cartPeriodic_[3], cartCoords_[3], bcType_[3];
field boxMin_[3], boxMax_[3], boxSize_[3]; // Global info, physical
field locMin_[3], locMax_[3], locSize_[3]; // This rank info, physical
5 changes: 0 additions & 5 deletions Physics.cpp
@@ -117,11 +117,6 @@ SYCL_EXTERNAL void cons2prim(id<1> myId, unsigned n, field_array u, field_array
//-- Fluxes and characteristic velocities.
// IMPORTANT: all local quantities, they have been sampled --> access simply by eg. f[VX]
void physicalFlux(int dir, Metric &g, field vD[FLD_TOT], field uD[FLD_TOT], field f[FLD_TOT], field vf[2], field vt[2] ){

static const CONSTANT char FMTA[] = "%s: %lf %lf %lf %lf %lf %lf %lf %lf %lf \n";
static const CONSTANT char FMTV[] = "%s: %lf %lf %lf\n";
static const CONSTANT char FMTF[] = "%s: %lf \n";

field alpha = g.alpha(), betai[3], gCov[9], gCon[9];
g.beta(betai);
g.g3DCov(gCov);
12 changes: 6 additions & 6 deletions Problem.cpp
@@ -14,13 +14,13 @@

using namespace std;

Problem::Problem(mysycl::queue qx, Grid *grid, Domain *D, field_array &fld ){
Problem::Problem(mysycl::queue qx, string &confFile, Grid *grid, Domain *D, field_array &fld ){
Log = Logger::getInstance(); grid_ = grid; D_ = D; N_ = grid_->nht;
iOut_ = 0; iStep_ = 0; nStep_ = 0; dumpHalos = false; locSize = 1;
tMax_ = 1.0; dt_ = 0.0; t_ = 0.0, tOut_=0.025, cfl_ = 0.8/3.0; // Divide by 3 as it's 3D
qq = qx;
stepTime_.init();
std::ifstream inFile("echo.par"); std::string key, val;
std::ifstream inFile(confFile); std::string key, val;
Log->setPar(false); *Log+3<<TAG<<"Reading input: ";
while (std::getline(inFile, key, ' ') && std::getline(inFile, val)){
if(!key.compare("tMax" )){ tMax_ = static_cast<field>(stod(val)); *Log<<"\n\ttMax "<<tMax_ ; continue;}
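
The constructor now receives the parameter-file name instead of hard-coding "echo.par", while keeping the simple one-pair-per-line "key value" format read by the loop above. A stand-alone sketch of reading that format (the file name and key are illustrative):

    // Stand-alone sketch of the "key value" parameter format (not repository code).
    #include <fstream>
    #include <iostream>
    #include <string>

    int main() {
      std::ifstream inFile("echo.par");   // e.g. a file containing the line: tMax 0.5
      std::string key, val;
      while (std::getline(inFile, key, ' ') && std::getline(inFile, val)) {
        if (key == "tMax") std::cout << "tMax = " << std::stod(val) << "\n";
      }
    }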
@@ -63,7 +63,7 @@ void Problem::dump(field_array &v){ // Asynchronous output
} else { // Manual indexing necessary
field *vt[FLD_TOT]; for (int i = 0; i < FLD_TOT; i++) vt[i] = v[i];
field *outt[FLD_TOT]; for (int i = 0; i < FLD_TOT; i++) outt[i] = out[i];
qq.parallel_for(range(gr.n[0], gr.n[1], gr.n[2]), [=](item<3> it) {
qq.parallel_for<class parForDump>(range(gr.n[0], gr.n[1], gr.n[2]), [=](item<3> it) {
auto iOut= it.get_linear_id(); // Output array has NH halo scope here
auto iV = globLinId(it.get_id(), gr.nh, gr.h); // v has WH indexing; offset by halos
for(int iVar=0; iVar<FLD_TOT; ++iVar)
@@ -83,7 +83,7 @@ void Problem::InitConstWH(field *v, field val) { // HOST CODE: kernel for initia

void Problem::InitConstWH(field *v, field val) { // HOST CODE: kernel for initialization.
if(!v){ Log->Error("%s Array was not initialized.", TAG); return; }
qq.parallel_for(range<3>(grid_->nh[0], grid_->nh[1], grid_->nh[2]), [=, gr = *(this->grid_)](item<3> it) {
qq.parallel_for<class parForInitConstWH>(range<3>(grid_->nh[0], grid_->nh[1], grid_->nh[2]), [=, gr = *(this->grid_)](item<3> it) {
int offset[3] = {0,0,0};
auto iV = globLinId(it, gr.nh, offset); // v has WH indexing; offset by halos
v[iV] = val;
@@ -92,7 +92,7 @@ void Problem::InitConstNH(field *v, field val) { // HOST CODE: kernel for initia

void Problem::InitConstNH(field *v, field val) { // HOST CODE: kernel for initialization.
if(!v){ Log->Error("%s Array was not initialized.", TAG); return; }
qq.parallel_for(range<3>(grid_->n[0], grid_->n[1], grid_->n[2]), [=, gr = *(this->grid_)](item<3> it) {
qq.parallel_for<class parForInitConstNH>(range<3>(grid_->n[0], grid_->n[1], grid_->n[2]), [=, gr = *(this->grid_)](item<3> it) {
auto iV = globLinId(it, gr.nh, gr.h); // v has WH indexing; offset by halos
v[iV] = val;
});
@@ -147,7 +147,7 @@ void Problem::Alfven(field_array &v, field_array &u){ // HOST CODE: Initializing
//-- Device code
field bS[]={D_->boxSize(0), D_->boxSize(1), D_->boxSize(2)};
Grid gr = *grid_; // For ease of lambda capture
qq.parallel_for(range(gr.n[0], gr.n[1], gr.n[2]), [=, NN=N_](item<3> it) {
qq.parallel_for<class parForProblemAlfven>(range(gr.n[0], gr.n[1], gr.n[2]), [=, NN=N_](item<3> it) {
field phi = 0.0, bx, by, bz, vx, vy, vz;
auto i = globLinId(it, gr.nh, gr.h); // Addressing fld: WH indexing

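The parallel_for calls above gain explicit kernel names (parForDump, parForInitConstWH, parForInitConstNH, parForProblemAlfven). Named kernels stay portable to SYCL implementations or build modes without unnamed-lambda support; the dpcpp invocation removed from CMakeLists.txt above passed -fsycl-unnamed-lambda presumably for that reason. A minimal stand-alone sketch of a named kernel launch:

    // Minimal named-kernel sketch (not repository code); assumes a device with USM
    // shared-allocation support. The template argument gives the kernel a stable name
    // that implementations without unnamed-lambda support can use.
    #include <sycl/sycl.hpp>

    class parForFillExample;   // hypothetical kernel name, forward-declared at namespace scope

    int main() {
      sycl::queue q;
      float *v = sycl::malloc_shared<float>(64, q);
      q.parallel_for<parForFillExample>(sycl::range<1>(64), [=](sycl::item<1> it) {
        v[it.get_linear_id()] = 1.0f;
      });
      q.wait();
      sycl::free(v, q);
    }
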
2 changes: 1 addition & 1 deletion Problem.hpp
@@ -31,7 +31,7 @@ class Problem {
bool dumpHalos;
field *out[FLD_TOT]; // Just to print

Problem(mysycl::queue q, Grid *g, Domain *f, field_array &out);
Problem(mysycl::queue q, std::string &confFile, Grid *g, Domain *f, field_array &out);
void InitRampWH (field *);
void InitRampNH (field *);
void InitConstWH(field *, field );