generated from GEOS-ESM/geos-template-repo
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Software stack init: sles15 + discover
- Loading branch information
1 parent
1f51399
commit df8bbac
Showing
7 changed files
with
231 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# GPU-accelerated software stack for GEOS | ||
|
||
## Methodology | ||
|
||
All software versions for a given release of the stack are defined in `basics.sh`. | ||
The `build` directory is a throwaway directory where everything is downloaded and then built. | ||
The `install` directory holds all libraries and executables once the build is done. | ||
|
||
Last edit: _December 29th 2023_ | ||
|
||
## v1.0.0 | ||
|
||
### OpenMPI | ||
|
||
We build OpenMPI through the UCX layer with CUDA enabled and with GDRCopy and GPUDirect turned on (Linux kernels are checked via `check_gpu_comms`). | ||
|
||
- GDRCOPY: 2.3 | ||
- GCC: 12.3.0 [^1] (via `comp/gcc/12.3.0` on discover) | ||
- CUDA (via `nvhpc`): 12.2 [^2] (via `nvidia/nvhpc-nompi/23.9` on discover) | ||
- UCX: 1.15.0 | ||
- OpenMPI: 4.1.6 [^3] | ||
- OSU-MICROBENCHMARK: 7.2 | ||
|
||
The stack can be tested via the `osu-microbenchmark`, with latency & bandwidth tests saved in `osu-bench.sh`. | ||
|
||
_Note:_ | ||
|
||
- [^1]: `gcc-13.2.0` fails during GEOS with an internal compiler error | ||
- [^2]: `nvhpc` ships with a prebuilt `openmpi` which can cause issues. Make sure to load the `nompi` module. | ||
- [^3]: `openmpi-5.0.0` fails at GEOS runtime on a call to `libxml2` that does a divide by zero (triggering a sigfpe). We revert to `4.1.6`. | ||
|
||
### Baselibs | ||
|
||
- LAPACK/BLAS: 3.11.0 | ||
- BASELIBS: 7.14.1 | ||
|
||
### Python | ||
|
||
- Python: 3.8.10 [^4] | ||
|
||
### Serialbox | ||
|
||
- Latest stable is 2.6.1. Development is over. | ||
|
||
_Note:_ | ||
|
||
- [^4]: `3.10.12` leads to failure in DaCe parsing. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#!/bin/bash
#
# Shared settings for the software stack: component versions, build/install
# locations, environment modules and compiler selection.
#
# This file is sourced by the other scripts. Every path below is derived from
# $PWD, so it must be sourced from the directory that should contain the
# `build` and `install` trees.

# Version
export DSLSW_GDRCOPY_VER=2.3
export DSLSW_OMPI_MAJOR_VER=4.1
export DSLSW_OMPI_VER=${DSLSW_OMPI_MAJOR_VER}.6
export DSLSW_UCX_VER=1.15.0
export DSLSW_CUDA_VER=12.2
export DSLSW_OSUMICRO_VER=7.2
export DSLSW_LAPACK_VER=3.11.0
export DSLSW_PY_VER=3.8.10
export DSLSW_BASELIBS_VER=7.14.1
export DSLSW_SERIALBOX_VER=2.6.1

# Base directory & versioning
# Quoted so that a working directory containing spaces does not split the path.
export DSLSW_BASE="$PWD/build"
mkdir -p "$DSLSW_BASE"
export DSLSW_INSTALL_DIR="$PWD/install"

# Modules
module load nvidia/nvhpc-nompi/23.9
# CUDA_DIR is a plain shell variable (not exported): it is only visible to
# scripts that source this file, not to the processes they spawn.
CUDA_DIR=/usr/local/other/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/
module load comp/gcc/12.3.0
module load other/boost/1.77.0

# Enforce proper compilers
export FC=gfortran
export CC=gcc
export CXX=g++

# Prepend the stack's own libraries and executables. The previous
# LD_LIBRARY_PATH is appended only when it is non-empty: a trailing ':' would
# add an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="$DSLSW_INSTALL_DIR/ompi/lib:$DSLSW_INSTALL_DIR/ucx/lib:$DSLSW_INSTALL_DIR/python3/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PATH="$DSLSW_INSTALL_DIR/ompi/bin:$DSLSW_INSTALL_DIR/python3/bin:$PATH"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#!/bin/bash
#
# Configure, build and install the stack components, in this order:
# UCX, OpenMPI, OSU micro-benchmarks, LAPACK, Python, Serialbox.
# Sources are expected to be already unpacked under $DSLSW_BASE; everything
# is installed under $DSLSW_INSTALL_DIR.

# Source the shared basics
source ./basics.v1.0.0.sh || {
  echo "Cannot source ./basics.v1.0.0.sh - run this script from its own directory" >&2
  exit 1
}

# Abort at the first failing step: later components are configured against
# the install prefixes of earlier ones (e.g. OpenMPI uses --with-ucx), so
# carrying on after a failed cd/configure/make only produces a broken stack.
set -e

echo " === GDR Copy (requires kernel running on the box) === "
#cd $DSLSW_BASE/gdrcopy-$DSLSW_GDRCOPY_VER
#make prefix=$DSLSW_INSTALL_DIR/gdrcopy CUDA=$CUDA_DIR all install
#exit 0

echo " === UCX === "
cd "$DSLSW_BASE/ucx-$DSLSW_UCX_VER"
./configure --prefix="$DSLSW_INSTALL_DIR/ucx" \
  --enable-optimizations \
  --disable-logging \
  --disable-debug \
  --disable-assertions \
  --disable-params-check \
  --without-xpmem \
  --without-java \
  --without-go \
  --with-cuda="$CUDA_DIR" \
  --with-gdrcopy="/usr/src/gdrdrv-$DSLSW_GDRCOPY_VER/"

make -j 32 install
#exit 0

echo " === OpenMPI === "

# The NSL lib (-lnsl) was not symlinked from libnsl.so.1, which led to issues
# (--disable-getpwuid was an attempt to squash that, which seems unsuccessful).
# Potentially, removing the LSF scheduler build would work.

# libxml2 has a divide by zero in its init
# (https://gitlab.gnome.org/GNOME/libxml2/-/blob/7846b0a677f8d3ce72486125fa281e92ac9970e8/xpath.c#L505)
# which seems to trigger a SIGFPE. We rely instead on the internal, but
# potentially wobbly, XML parser of OMPI.

cd "$DSLSW_BASE/openmpi-${DSLSW_OMPI_VER}"
./configure --prefix="$DSLSW_INSTALL_DIR/ompi" \
  --disable-libxml2 \
  --disable-wrapper-rpath \
  --disable-wrapper-runpath \
  --with-pmix \
  --with-cuda="$CUDA_DIR" \
  --with-cuda-libdir="$CUDA_DIR/lib64/stubs" \
  --with-ucx="$DSLSW_INSTALL_DIR/ucx" \
  --with-slurm \
  --enable-mpi1-compatibility

make -j32 all
make install
# Put the freshly installed OpenMPI (and UCX libraries) first for the
# remaining builds.
export PATH="$DSLSW_INSTALL_DIR/ompi/bin:$PATH"
export LD_LIBRARY_PATH="$DSLSW_INSTALL_DIR/ompi/lib:$DSLSW_INSTALL_DIR/ucx/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

echo " === OSU === "

cd "$DSLSW_BASE/osu-micro-benchmarks-$DSLSW_OSUMICRO_VER"
./configure \
  --prefix="$DSLSW_INSTALL_DIR/osu" \
  --enable-cuda \
  --with-cuda-include="$CUDA_DIR/include" \
  --with-cuda="$CUDA_DIR" \
  --with-cuda-libpath="$CUDA_DIR/lib64/stubs/"

make -j32
make install

echo " === Lapack === "
cd "$DSLSW_BASE/lapack-$DSLSW_LAPACK_VER"
# -p: do not fail when the directory is left over from a previous run
mkdir -p build
cd build
cmake .. -DCMAKE_INSTALL_PREFIX="$DSLSW_INSTALL_DIR/lapack"
make -j32 install

echo " === Python === "
cd "$DSLSW_BASE/Python-$DSLSW_PY_VER"
./configure --prefix="$DSLSW_INSTALL_DIR/python3" --enable-shared --enable-optimizations

make -j32
make install

echo " === Serialbox === "
cd "$DSLSW_BASE/serialbox-$DSLSW_SERIALBOX_VER"
# -p: do not fail when the directory is left over from a previous run
mkdir -p build
cd build
cmake -DCMAKE_INSTALL_PREFIX="$DSLSW_INSTALL_DIR/serialbox" \
  -DSERIALBOX_ENABLE_FORTRAN=ON \
  -DSERIALBOX_EXAMPLES=OFF \
  ..
make -j32 install
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash
#
# Clone ESMA-Baselibs at the tag v$DSLSW_BASELIBS_VER into
# $DSLSW_BASE/baselibs/$DSLSW_BASELIBS_VER, run its `make download` target and
# patch HDF4 out of its GNUmakefile.

source ./basics.v1.0.0.sh || {
  echo "Cannot source ./basics.v1.0.0.sh - run this script from its own directory" >&2
  exit 1
}

# Abort on the first failure: if the clone or a cd fails, `make download` and
# the in-place sed edits below must not run in the wrong directory.
set -e

cd "$DSLSW_BASE"
git clone --recurse-submodules -b "v$DSLSW_BASELIBS_VER" https://github.com/GEOS-ESM/ESMA-Baselibs.git "./baselibs/$DSLSW_BASELIBS_VER"
cd "./baselibs/$DSLSW_BASELIBS_VER"
make download
echo "=>Baselibs >> Removing HDF4 from the ESSENTIALS"
# Drop hdf4 from the ESSENTIAL_DIRS list.
sed -i 's/ESSENTIAL_DIRS = jpeg zlib szlib hdf4 hdf5/ESSENTIAL_DIRS = jpeg zlib szlib hdf5/g' GNUmakefile
# Drop /hdf from the '/zlib /szlib /jpeg /hdf5 /hdf /netcdf,\' list.
# NOTE(review): the replacement text also starts with an extra '/ ' - confirm
# this is intended.
sed -i 's/\/zlib \/szlib \/jpeg \/hdf5 \/hdf \/netcdf,\\/\/ \/zlib \/szlib \/jpeg \/hdf5 \/netcdf,\\/g' GNUmakefile
cd "$DSLSW_BASE"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/bin/bash
#
# Download and unpack the source tarballs of the stack components into
# $DSLSW_BASE. Uses bash rather than sh because the script relies on `source`.

# Source the shared basics
source ./basics.v1.0.0.sh || {
  echo "Cannot source ./basics.v1.0.0.sh - run this script from its own directory" >&2
  exit 1
}

# Abort on the first failed download or extraction instead of carrying on
# with a partial source tree.
set -e

#######################################
# Download a tarball into the current directory, unpack it there and delete
# the archive.
# Arguments:
#   $1 - URL of the tarball
#   $2 - local archive name (optional, defaults to the last URL component)
# Returns: non-zero if the download, extraction or removal fails
#######################################
fetch_and_unpack() {
  local url=$1
  local archive=${2:-${url##*/}}

  # -O pins the local file name so it never depends on server redirects or on
  # a stale copy left behind by a previous run.
  wget -O "$archive" "$url"
  # No explicit -z: tar detects the compression from the archive itself.
  tar xpf "$archive"
  rm -- "$archive"
}

cd "$DSLSW_BASE"

# GDR Copy should be present in /usr/src/gdrdrv-*
#wget -c https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v$DSLSW_GDRCOPY_VER.tar.gz
#tar zxpvf v$DSLSW_GDRCOPY_VER.tar.gz
#rm v$DSLSW_GDRCOPY_VER.tar.gz

fetch_and_unpack "https://github.com/openucx/ucx/releases/download/v${DSLSW_UCX_VER}/ucx-${DSLSW_UCX_VER}.tar.gz"

fetch_and_unpack "https://download.open-mpi.org/release/open-mpi/v$DSLSW_OMPI_MAJOR_VER/openmpi-${DSLSW_OMPI_VER}.tar.gz"

fetch_and_unpack "https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-$DSLSW_OSUMICRO_VER.tar.gz"

fetch_and_unpack "https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v$DSLSW_LAPACK_VER.tar.gz"

fetch_and_unpack "https://www.python.org/ftp/python/$DSLSW_PY_VER/Python-$DSLSW_PY_VER.tgz"

# The tag archive is only named v<version>.tar.gz; store it under an explicit
# serialbox-<version> name.
fetch_and_unpack "https://github.com/GridTools/serialbox/archive/refs/tags/v$DSLSW_SERIALBOX_VER.tar.gz" \
  "serialbox-$DSLSW_SERIALBOX_VER.tar.gz"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/bin/bash
#
# Print the environment set up by the shared basics: the install prefix, the
# resolved Fortran and C compilers, and the library search path.

source ./basics.v1.0.0.sh || {
  echo "Cannot source ./basics.v1.0.0.sh - run this script from its own directory" >&2
  exit 1
}

echo "$DSLSW_INSTALL_DIR"
# `command -v` is the shell builtin for resolving a command; report clearly
# when a compiler cannot be found instead of printing an empty line.
command -v "$FC" || echo "$FC: not found in PATH" >&2
command -v "$CC" || echo "$CC: not found in PATH" >&2

echo "$LD_LIBRARY_PATH"
|