Skip to content

Commit

Permalink
Merge branch 'feature/sw'
Browse files Browse the repository at this point in the history
  • Loading branch information
FlorianDeconinck committed Apr 22, 2024
2 parents 07ee7b0 + 30f8c06 commit a95473c
Show file tree
Hide file tree
Showing 16 changed files with 271 additions and 3 deletions.
102 changes: 102 additions & 0 deletions src/tcn/ci/pipeline/templates/gpu-mps-launcher.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/bin/sh

# For debug of this script
#set -x

# Name of the node this rank runs on; used in log lines and per-node paths.
HOSTNAME=$(hostname)

# Resolve the node-local rank from the launcher environment: Open MPI first,
# then SLURM. `-n "${VAR:-}"` is POSIX, whereas the `${VAR:0}` substring
# expansion is a bash extension that a strict /bin/sh rejects.
if [ -n "${OMPI_COMM_WORLD_LOCAL_RANK:-}" ]; then
    LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
elif [ -n "${SLURM_LOCALID:-}" ]; then
    LOCAL_RANK=$SLURM_LOCALID
else
    # No local rank is known in this branch, so the message cannot be limited
    # to rank 0 (testing an empty LOCAL_RANK with -eq is itself an error):
    # every process reports the failure.
    echo "Unimplemented MPI environement, can't read local rank. Exiting."
    exit 1
fi

# Hardware sampling is a Python tool that reads various hardware sensors
# (power, usage, memory load...) at intervals.
# It is enabled by setting HARDWARE_SAMPLING to any non-empty value.
if [ -z "${HARDWARE_SAMPLING:-}" ]; then
    if [ "$LOCAL_RANK" -eq 0 ]; then
        echo "Hardware sampling is OFF"
    fi
else
    if [ "$LOCAL_RANK" -eq 0 ]; then
        echo "Hardware sampling is ON"
    fi
    # We restrict usage to (world) rank 0.
    # The world rank is looked up in the same order as the local rank (Open MPI
    # first, then SLURM). Relying on SLURM_PROCID alone makes the test below
    # fail on an empty operand whenever that variable is not set, and the
    # sampler then never starts.
    WORLD_RANK=${OMPI_COMM_WORLD_RANK:-${SLURM_PROCID:-}}
    # An unknown world rank defaults to 1, i.e. "not rank 0".
    if [ "${WORLD_RANK:-1}" -eq 0 ]; then
        geosongpu_hws server &
        # Presumably leaves the background server time to come up before the
        # client connects.
        sleep 10
        geosongpu_hws client start
    fi
fi

# Nvidia's Multi-Process Service (MPS) is required to run multiple processes
# at the same time on one GPU.

# We open GPU visibility to the full node at first (assumes 4 GPUs per node)
export CUDA_VISIBLE_DEVICES=0,1,2,3

if [ -z "${MPS_ON:-}" ]; then
    if [ "$LOCAL_RANK" -eq 0 ]; then
        echo "MPS is OFF"
    fi
    # No MPS, we assume rank==GPU
    GPU=$LOCAL_RANK
    export CUDA_VISIBLE_DEVICES="$GPU"
else
    if [ "$LOCAL_RANK" -eq 0 ]; then
        echo "MPS is ON"
    fi
    if [ -z "${PER_DEVICE_PROCESS:-}" ]; then
        if [ "$LOCAL_RANK" -eq 0 ]; then
            echo "PER_DEVICE_PROCESS needs to be setup on MPS. Exiting."
        fi
        exit 1
    fi
    # PER_DEVICE_PROCESS is the divisor of the rank-to-GPU mapping below:
    # anything but a strictly positive integer would end in a shell
    # arithmetic error (non-numeric value, division by zero).
    case "$PER_DEVICE_PROCESS" in
        *[!0-9]*) PER_DEVICE_PROCESS_VALID=0 ;;
        *) PER_DEVICE_PROCESS_VALID=1 ;;
    esac
    if [ "$PER_DEVICE_PROCESS_VALID" -eq 0 ] || [ "$PER_DEVICE_PROCESS" -eq 0 ]; then
        if [ "$LOCAL_RANK" -eq 0 ]; then
            echo "PER_DEVICE_PROCESS must be a positive integer (got '$PER_DEVICE_PROCESS'). Exiting." >&2
        fi
        exit 1
    fi
    # All ranks need to know where to look
    export CUDA_MPS_PIPE_DIRECTORY="./nvidia-mps/$HOSTNAME"
    export CUDA_MPS_LOG_DIRECTORY="./nvidia-log/$HOSTNAME"
    # Only 1 rank per node (local rank 0) handles the server chatter
    if [ "$LOCAL_RANK" -eq 0 ]; then
        echo "Turn nvidia-cuda-mps-control on for node $HOSTNAME"
        mkdir -p "$CUDA_MPS_PIPE_DIRECTORY"
        mkdir -p "$CUDA_MPS_LOG_DIRECTORY"
        # Per docs, we should ensure the GPU is in EXCLUSIVE mode (presumably
        # `sudo nvidia-smi -i 0 -c 3`), but HPC settings might not allow it.
        nvidia-cuda-mps-control -d
    fi
    # MPS server is socket based, leave time for the filesystem
    sleep 10
    # Server should be spun up, we restrict this rank to a single GPU:
    # PER_DEVICE_PROCESS consecutive local ranks share one device.
    GPU=$((LOCAL_RANK / PER_DEVICE_PROCESS))
    export CUDA_VISIBLE_DEVICES="$GPU"
fi

echo "Node: $HOSTNAME | Rank: $LOCAL_RANK, pinned to GPU: $CUDA_VISIBLE_DEVICES"

# Run program with or without log dump in file.
# "$@" forwards every argument as received; the unquoted $* re-splits
# arguments containing spaces and expands glob characters.
if [ -z "${LOCAL_REDIRECT_LOG:-}" ]; then
    "$@"
else
    "$@" > "log.redirect_local.$HOSTNAME.$LOCAL_RANK.out" 2>&1
fi
# Keep the program's exit status: the clean up below would otherwise replace
# it and the launcher would report success even when the program failed.
PROGRAM_STATUS=$?

# Clean up of all tools
if [ -z "${HARDWARE_SAMPLING:-}" ]; then
    echo ""
else
    # The sampler is started on (world) rank 0 only, so it is stopped by that
    # same rank. World rank lookup: Open MPI first, then SLURM; an unknown
    # rank defaults to 1, i.e. "not rank 0".
    WORLD_RANK=${OMPI_COMM_WORLD_RANK:-${SLURM_PROCID:-}}
    if [ "${WORLD_RANK:-1}" -eq 0 ]; then
        geosongpu_hws client dump
        geosongpu_hws client stop
    fi
fi
if [ -z "${MPS_ON:-}" ]; then
    echo ""
else
    # Local rank 0 started the MPS control daemon, it also shuts it down
    if [ "$LOCAL_RANK" -eq 0 ]; then
        echo quit | nvidia-cuda-mps-control
        # Per docs, we should ensure the GPU is flipped back to DEFAULT mode
        # (presumably `sudo nvidia-smi -i 0 -c 0`), but HPC settings might
        # not allow it.
    fi
fi

# Report the wrapped program's status to the MPI launcher
exit "$PROGRAM_STATUS"
10 changes: 10 additions & 0 deletions src/tcn/ci/pipeline/templates/ompi-wrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Turn MPS on
export MPS_ON=1

# The second argument is the process count; the first one is dropped without
# being read (presumably the `-np` flag - confirm against the callers).
if [ "$#" -lt 2 ]; then
    echo "Usage: $0 -np <N> <program> [args...]" >&2
    exit 1
fi

# Read `np`, then drop the flag and its value so only the program remains
NP="$2"
shift 2

# Forward to the launcher; "$@" keeps every remaining argument intact
mpirun -np "$NP" ./gpu-mps-launcher.sh "$@"
32 changes: 32 additions & 0 deletions src/tcn/py_ftn_interface/example/bridge.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
module stub_interface_mod

   use, intrinsic :: iso_c_binding, only: c_bool, c_double, c_float, c_int

   implicit none

   private
   public :: data_t, python_function_f

   ! C-interoperable record passed to the Python side; the member order
   ! matches `data_t` in data_to_be_transited.h.
   type, bind(c) :: data_t
      real(kind=c_float) :: x
      integer(kind=c_int) :: y
      logical(kind=c_bool) :: b
      ! Magic number: default-initialized here and checked by the receiver to
      ! help detect a layout that is not kept consistent through the
      ! interface. Imperfect.
      integer(kind=c_int) :: i_am_123456789 = 123456789
   end type data_t

   interface

      ! Fortran-side name for the C symbol `python_function`.
      ! Both dummy arguments are passed by reference (no `value` attribute).
      subroutine python_function_f(data, value) bind(c, name='python_function')
         import :: c_int, data_t

         implicit none
         type(data_t), intent(in) :: data
         integer(c_int), intent(in) :: value

      end subroutine python_function_f

   end interface

end module stub_interface_mod
28 changes: 28 additions & 0 deletions src/tcn/py_ftn_interface/example/bridge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# file bridge.py
import cffi

ffibuilder = cffi.FFI()

# Hand the C declarations to CFFI: lines starting with "#" (preprocessor
# directives) are dropped and the CFFI_DLLEXPORT marker is blanked out.
with open("data_to_be_transited.h") as header_file:
    kept_lines = []
    for header_line in header_file:
        if header_line.startswith("#"):
            continue
        kept_lines.append(header_line)
declarations = "".join(kept_lines).replace("CFFI_DLLEXPORT", "")
ffibuilder.embedding_api(declarations)

# C source of the generated module: it only includes the shared header.
ffibuilder.set_source("bridge", r"""#include "data_to_be_transited.h" """)

# Python code run when the embedded interpreter starts: it registers
# `python_function` as the implementation of the C entry point and forwards
# the data record to the runtime check.
ffibuilder.embedding_init_code(
    """
from bridge import ffi
from runtime_code import check_function
@ffi.def_extern()
def python_function(data:"data_t", union_v: "union_t"):
    check_function(data)
"""
)

ffibuilder.compile(target="bridge.so", verbose=True)
5 changes: 5 additions & 0 deletions src/tcn/py_ftn_interface/example/build_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

# Stop at the first failing step: without this, a failed bridge build is still
# followed by the Fortran compile and by a run of a stale or missing binary.
set -e

# 1. Generate and compile the CFFI bridge (bridge.so)
python bridge.py
# 2. Build the Fortran example against it
gfortran bridge.f90 main.f90 -o test ./bridge.so
# 3. Run it with the current directory on PYTHONPATH
PYTHONPATH=. ./test
3 changes: 3 additions & 0 deletions src/tcn/py_ftn_interface/example/clean.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Remove the build products of the example; -f keeps this silent when a file
# is already gone.
rm -f \
    bridge.so \
    bridge.o \
    stub_interface_mod.mod \
    bridge.c \
    test
10 changes: 10 additions & 0 deletions src/tcn/py_ftn_interface/example/data_desc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from dataclasses import dataclass


@dataclass
class Data_py_t:
    """Python-side description of the members of the `data_t` record.

    The field names and their order follow the Fortran and C declarations
    of `data_t`.
    """

    x: float
    y: int
    b: bool
    # Magic number: set to 123456789 on the Fortran side, see Fortran
    i_am_123456789: int
19 changes: 19 additions & 0 deletions src/tcn/py_ftn_interface/example/data_to_be_transited.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#pragma once
#include <stdbool.h>

/* Record exchanged through the bridge. The Fortran `data_t` (bridge.f90)
 * declares the same members in the same order. */
typedef struct
{
    float x;
    int y;
    bool b;
    // Magic number, see Fortran: default-initialized to 123456789 there and
    // compared against that value by the Python check.
    int i_am_123456789;
} data_t;

/* Type of the second argument of python_function: one storage slot that can
 * be read either as a pointer or as an int. */
typedef union
{
    void *void_ptr;
    int int_value;
} union_t;

/* Entry point implemented in Python (registered in bridge.py) and called from
 * Fortran as `python_function_f`.
 * NOTE(review): the Fortran interface passes an integer(c_int) by reference as
 * the second argument, so presumably only `int_value` is safe to read there -
 * confirm before using `void_ptr`. */
extern void python_function(data_t *, union_t *);
12 changes: 12 additions & 0 deletions src/tcn/py_ftn_interface/example/main.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
program test
   use stub_interface_mod, only: data_t, python_function_f

   implicit none

   type(data_t) :: d

   ! Only x, y and b are given: the magic-number component keeps the default
   ! initialization declared in the type.
   d = data_t(x=42.42, y=24, b=.true.)

   ! Hand the record and an integer over to the Python side
   call python_function_f(d, 39)

   print *, 'test'
end program test

16 changes: 16 additions & 0 deletions src/tcn/py_ftn_interface/example/runtime_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from data_desc import Data_py_t
import inspect


def check_function(data: Data_py_t):
    """Validate the record received through the bridge and print its fields.

    Args:
        data: object exposing the attributes declared by `Data_py_t`. In the
            example this is the `data_t` value handed to `python_function`
            by the bridge rather than a `Data_py_t` instance.

    Raises:
        ValueError: if the magic number does not hold 123456789.
    """
    # Function-scope import so the block does not depend on a file-level one.
    from dataclasses import fields

    # Check the magic number
    if data.i_am_123456789 != 123456789:
        raise ValueError("Magic number failure")

    print(f"Data comes as {data} of type {type(data)}")
    # dataclasses.fields() is the public way to list the declared fields, in
    # declaration order, instead of digging the private `__dataclass_fields__`
    # mapping out of inspect.getmembers().
    for field in fields(Data_py_t):
        print(f"{field.name} of value {getattr(data, field.name)}")
9 changes: 8 additions & 1 deletion sw_stack/discover/sles15/HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ All versions of the software for a given version are saved in `basics.sh`.
`build` directory is the throwaway directory where everything is downloaded then built.
`install` stores all libraries and executables once the build is done.

Last edit: _March 18th 2024_
Last edit: _March 22nd 2024_

## v2024.03.00

Expand Down Expand Up @@ -37,6 +37,13 @@ When defining `BUILD_GCC_OFFLOAD`:

Test of the stack can be done via the `osu-microbenchmark` with latency & bandwidth saved in `osu-bench.sh`.

### Python stack

Required `ndsl` dependency plus:

- cffi: latest (for fortran<>python bridge)
- cupy-cuda12x: latest

_Note:_

- [^1]: `gcc-13.2.0` fails during GEOS with an internal compiler error
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ load("nvidia/nvhpc-byo-compiler/23.9")

-- Root of the 2024.03.00 install tree; the package directories used in this
-- modulefile are built from it with pathJoin.
local install_dir = "/discover/nobackup/projects/geosongpu/sw_sles15/live/src/2024.03.00/install"

-- Fix: GT4Py expects CUDA_HOME to be set --
-- os.getenv returns nil when NVHPC_ROOT is not defined and setenv does not
-- accept a nil value: CUDA_HOME is only exported when there is a value to
-- give it, and a load without it is reported instead of failing.
do
   local nvhpc_root = os.getenv("NVHPC_ROOT")
   if nvhpc_root then
      setenv("CUDA_HOME", nvhpc_root)
   elseif mode() == "load" then
      LmodWarning("NVHPC_ROOT is not set: CUDA_HOME is left undefined.")
   end
end

-- UCX --
local ucx_pkgdir = pathJoin(install_dir, "ucx")
prepend_path("LD_LIBRARY_PATH",pathJoin(ucx_pkgdir,"lib"))
Expand Down Expand Up @@ -35,3 +38,7 @@ local py_pkgdir = pathJoin(install_dir, "python3")
prepend_path("PATH",pathJoin(py_pkgdir,"bin"))
prepend_path("LD_LIBRARY_PATH",pathJoin(py_pkgdir,"lib"))
prepend_path("LD_LIBRARY_PATH",pathJoin(py_pkgdir,"lib64"))

-- Baselibs at a BASEDIR --
-- Export the Baselibs 7.17.1 install location (below install_dir) as BASEDIR.
setenv("BASEDIR", pathJoin(install_dir, "baselibs-7.17.1/install/x86_64-pc-linux-gnu/"))
7 changes: 7 additions & 0 deletions sw_stack/discover/sles15/modulefiles/SMTStack/2024.03.00.lua
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ load("nvidia/nvhpc-byo-compiler/23.9")

-- Root of the 2024.03.00 install tree; the package directories used in this
-- modulefile are built from it with pathJoin.
local install_dir = "/discover/nobackup/projects/geosongpu/sw_sles15/live/src/2024.03.00/install"

-- Fix: GT4Py expects CUDA_HOME to be set --
-- os.getenv returns nil when NVHPC_ROOT is not defined and setenv does not
-- accept a nil value: CUDA_HOME is only exported when there is a value to
-- give it, and a load without it is reported instead of failing.
do
   local nvhpc_root = os.getenv("NVHPC_ROOT")
   if nvhpc_root then
      setenv("CUDA_HOME", nvhpc_root)
   elseif mode() == "load" then
      LmodWarning("NVHPC_ROOT is not set: CUDA_HOME is left undefined.")
   end
end

-- UCX --
local ucx_pkgdir = pathJoin(install_dir, "ucx")
prepend_path("LD_LIBRARY_PATH",pathJoin(ucx_pkgdir,"lib"))
Expand Down Expand Up @@ -40,3 +43,7 @@ prepend_path("LD_LIBRARY_PATH",pathJoin(py_pkgdir,"lib64"))
-- Python virtual environment: advertise it through VIRTUAL_ENV and put its
-- executables first on PATH. A dedicated local avoids shadowing the
-- py_pkgdir used for the base Python install above.
local venv_dir = pathJoin(install_dir, "venv")
setenv("VIRTUAL_ENV", venv_dir)
prepend_path("PATH", pathJoin(venv_dir, "bin"))

-- Baselibs at a BASEDIR --
-- Export the Baselibs 7.17.1 install location (below install_dir) as BASEDIR.
setenv("BASEDIR", pathJoin(install_dir, "baselibs-7.17.1/install/x86_64-pc-linux-gnu/"))
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ source ./basics.sh

echo " === Make NDSL venv === "
cd $DSLSW_INSTALL_DIR
./python3/bin/python3 -m venv venv
python3 -m venv venv
source ./venv/bin/activate
pip install --upgrade setuptools pip
pip install -e $DSLSW_INSTALL_DIR/ndsl
pip install mpi4py cffi
pip install mpi4py cffi cupy-cuda12x
10 changes: 10 additions & 0 deletions sw_stack/discover/sles15/src/2024.03.00/verify_baselibs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# Presumably defines the DSLSW_* variables used below - confirm in basics.sh
source ./basics.sh

# Stop if the Baselibs source tree cannot be entered: `make` would otherwise
# run from whatever directory the script was started in.
cd "$DSLSW_BASE/baselibs-$DSLSW_BASELIBS_VER" || exit 1

# Run the Baselibs `verify` target with the settings listed on the command
# line; paths are quoted so that a directory name with spaces stays one word.
make ESMF_COMM=openmpi \
    BUILD=ESSENTIALS \
    ALLOW_ARGUMENT_MISMATCH=-fallow-argument-mismatch \
    prefix="$DSLSW_INSTALL_DIR/baselibs-$DSLSW_BASELIBS_VER/install/x86_64-pc-linux-gnu/Linux" \
    verify

0 comments on commit a95473c

Please sign in to comment.