generated from GEOS-ESM/geos-template-repo
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #25 from GEOS-ESM/feature/hardware_sampelr
Hardware Sampler & MPS
- Loading branch information
Showing
16 changed files
with
370 additions
and
119 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import os | ||
import sys | ||
import site | ||
|
||
|
||
def find_template(name: str) -> str:
    """Return the path of the ``<name>.tpl`` template file.

    Candidate locations are probed in order and the first existing file wins:

    1. ``<sys.prefix>/geosongpu/templates`` (``pip install geosongpu-ci``),
    2. ``<user base>/geosongpu/templates``
       (``pip install --user geosongpu-ci``),
    3. the directory containing this module
       (``pip install -e geosongpu-ci``).

    Args:
        name: Template name without the trailing ``.tpl`` extension.

    Returns:
        The first candidate path that is an existing regular file.

    Raises:
        FileNotFoundError: If no candidate location holds the template.
    """
    # site.USER_BASE is None until site.getuserbase() has been called (for
    # instance when the interpreter is started with -S). Going through the
    # accessor guarantees a real directory instead of the literal "None".
    candidates = (
        f"{sys.prefix}/geosongpu/templates/{name}.tpl",
        f"{site.getuserbase()}/geosongpu/templates/{name}.tpl",
        os.path.join(os.path.dirname(__file__), f"{name}.tpl"),
    )
    for candidate in candidates:
        if os.path.isfile(candidate):
            return candidate
    raise FileNotFoundError(f"Template: could not locate {name}")
76 changes: 76 additions & 0 deletions
76
geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/bin/sh

# Per-rank launch wrapper: optionally starts the hardware sampler and the
# NVIDIA MPS control daemon, pins the rank to one GPU, runs the command given
# as arguments, then shuts the helper tools down.
#
# Environment read by this script:
#   HARDWARE_SAMPLING   non-empty -> rank 0 starts/stops the geosongpu_hws tool
#   MPS_ON              non-empty -> use nvidia-cuda-mps-control
#   PER_DEVICE_PROCESS  ranks per GPU, mandatory when MPS_ON is set
#   LOCAL_REDIRECT_LOG  non-empty -> per-rank output goes to a log file
#   SLURM_PROCID / SLURM_LOCALID / SLURM_NODEID  rank and node identifiers

# We open GPU visibility to full node at first
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Hardware sampling is a python tool that reads at intervals
# various hardware sensors (power, usage, memory load...)
if [ -z "${HARDWARE_SAMPLING}" ]; then
    echo "Hardware sampling is OFF"
else
    echo "Hardware sampling is ON"
    # We restrict usage to (world) rank 0
    if [ "${SLURM_PROCID}" -eq 0 ]; then
        geosongpu_hws server &
        # Give the background server time to come up before the client talks to it
        sleep 10
        geosongpu_hws client start
    fi
fi

if [ -z "${MPS_ON}" ]; then
    echo "MPS is OFF"
    # No MPS, we assume rank==GPU
    GPU=$SLURM_LOCALID
    export CUDA_VISIBLE_DEVICES=$GPU
else
    echo "MPS is ON"
    if [ -z "${PER_DEVICE_PROCESS}" ]; then
        echo "PER_DEVICE_PROCESS needs to be setup on MPS. Exiting."
        exit 1
    fi
    # All ranks need to know where to look
    export CUDA_MPS_PIPE_DIRECTORY=./nvidia-mps/$SLURM_NODEID
    export CUDA_MPS_LOG_DIRECTORY=./nvidia-log/$SLURM_NODEID
    # Only 1 rank per node (local rank 0) handles the server chatter
    if [ "${SLURM_LOCALID}" -eq 0 ]; then
        echo "Turn nvidia-cuda-mps-control on for node $SLURM_NODEID"
        # Create both per-node directories exported above
        mkdir -p "$CUDA_MPS_PIPE_DIRECTORY"
        mkdir -p "$CUDA_MPS_LOG_DIRECTORY"
        # sudo nvidia-smi -i 0 -c 3 # Per docs, we should ensure the GPU is in EXCLUSIVE mode but we might be curtailed by HPC settings
        nvidia-cuda-mps-control -d
    fi
    # MPS server is socket based, leave time for the filesystem
    sleep 10
    # Server should be spun up, we restrict this rank to a single GPU
    GPU=$((SLURM_LOCALID/PER_DEVICE_PROCESS))
    export CUDA_VISIBLE_DEVICES=$GPU
fi

echo "Node: $SLURM_NODEID | Rank: $SLURM_PROCID, pinned to GPU: $CUDA_VISIBLE_DEVICES"

# Run program with or without log dump in file
# NOTE(review): $* is kept unquoted so a command passed as one string is still
# word-split as before; switch to "$@" if arguments containing spaces must survive.
if [ -z "${LOCAL_REDIRECT_LOG}" ]; then
    $*
else
    $* > "log.redirect_local.$SLURM_PROCID.out" 2>&1
fi

# Clean up of all tools
if [ -z "${HARDWARE_SAMPLING}" ]; then
    echo ""
else
    if [ "${SLURM_PROCID}" -eq 0 ]; then
        geosongpu_hws client dump
        geosongpu_hws client stop
    fi
fi
if [ -z "${MPS_ON}" ]; then
    echo ""
else
    if [ "${SLURM_LOCALID}" -eq 0 ]; then
        # Ask the control daemon started above to shut down
        echo quit | nvidia-cuda-mps-control
        # sudo nvidia-smi -i 0 -c 0 # Per docs, we should ensure the GPU is flipped back to DEFAULT mode but we might be curtailed by HPC settings
    fi
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.