Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WLM integration of GPU support #171

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
6 changes: 1 addition & 5 deletions extra/CI/integration/test_gpu_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,7 @@ class TestGPUSupport(unittest.TestCase):
"libnvidia-ml.so",
"libnvidia-fatbinaryloader.so",
"libnvidia-opencl.so" }
_GPU_BINS = { "nvidia-cuda-mps-control",
"nvidia-cuda-mps-server",
"nvidia-debugdump",
"nvidia-persistenced",
"nvidia-smi"}
_GPU_BINS = { "nvidia-smi" }
_GPU_ENV_LD_LIB_PATH = {"/opt/shifter/site-resources/gpu/lib", "/opt/shifter/site-resources/gpu/lib64"}
_GPU_ENV_PATH = {"/opt/shifter/site-resources/gpu/bin"}

Expand Down
34 changes: 8 additions & 26 deletions src/activate_gpu_support.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#this script with an empty environment
export PATH=/usr/local/bin:/usr/bin:/bin:/sbin

cuda_devices=
container_root_dir=
container_site_resources=
is_verbose_active=
Expand All @@ -22,11 +21,7 @@ nvidia_compute_libs="cuda \
nvidia-opencl"

#the NVIDIA binaries that will be bind mounted into the container
nvidia_binaries="nvidia-cuda-mps-control \
nvidia-cuda-mps-server \
nvidia-debugdump \
nvidia-persistenced \
nvidia-smi"
nvidia_binaries="nvidia-smi"

log()
{
Expand Down Expand Up @@ -72,19 +67,18 @@ bind_mount_file_into_container()

parse_command_line_arguments()
{
if [ ! $# -eq 4 ]; then
if [ ! $# -eq 3 ]; then
log ERROR "Internal error: received bad number of command line arguments"
exit 1
fi

cuda_devices=$1
container_root_dir=$2
container_site_resources=$3
container_root_dir=$1
container_site_resources=$2
container_bin_path=$container_site_resources/gpu/bin
container_lib_path=$container_site_resources/gpu/lib
container_lib64_path=$container_site_resources/gpu/lib64

local verbose=$4
local verbose=$3
if [ $verbose = "verbose-on" ]; then
is_verbose_active=true
elif [ $verbose = "verbose-off" ]; then
Expand Down Expand Up @@ -121,7 +115,7 @@ add_nvidia_compute_libs_to_container()
for lib in $nvidia_compute_libs; do
local libs_host=$( ldconfig -p | grep "lib${lib}.so" | awk '{print $4}' )
if [ -z "$libs_host" ]; then
log WARNING "Could not find library: $lib"
log INFO "Could not find library: $lib"
continue
fi

Expand All @@ -145,29 +139,17 @@ add_nvidia_binaries_to_container()
for bin in $nvidia_binaries; do
local bin_host="$( which $bin )"
if [ -z $bin_host ]; then
log WARNING "Could not find binary: $bin"
log INFO "Could not find binary: $bin"
continue
fi
local bin_container=$container_bin_path/$bin
bind_mount_file_into_container $bin_host $bin_container
done
}

# Make sure the /dev/nvidia-uvm device node exists, invoking nvidia-modprobe
# when it is missing. Takes no arguments. Uses the log and
# exit_if_previous_command_failed helpers, which are defined outside this function.
load_nvidia_uvm_if_necessary()
{
# /dev/nvidia-uvm is available when the NVIDIA UVM kernel module is correctly loaded.
# Load the kernel module through nvidia-modprobe if /dev/nvidia-uvm doesn't exist.
if [ ! -e /dev/nvidia-uvm ]; then
log INFO "/dev/nvidia-uvm doesn't exist. Creating it with nvidia-modprobe."
# NOTE(review): per the nvidia-modprobe man page, -u selects the unified memory
# module and -c=0 creates the device file with minor number 0 -- confirm
# against the nvidia-modprobe version installed on the host.
nvidia-modprobe -u -c=0
# Presumably aborts the script with this message if nvidia-modprobe returned
# non-zero; verify against the helper's definition.
exit_if_previous_command_failed "Cannot nvidia-modprobe -u -c=0"
fi
}

parse_command_line_arguments $*
validate_command_line_arguments
log INFO "Activating support for CUDA devices $cuda_devices."
log INFO "Activating GPU support"
check_prerequisites
add_nvidia_compute_libs_to_container
add_nvidia_binaries_to_container
load_nvidia_uvm_if_necessary
14 changes: 4 additions & 10 deletions src/gpu_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,9 @@ int parse_gpu_env(struct gpu_support_config* config) {
if( cuda_visible_devices != NULL
&& strcmp(cuda_visible_devices, "") != 0
&& strcmp(cuda_visible_devices, "NoDevFiles") != 0) {
config->gpu_ids = strdup(cuda_visible_devices);
config->is_gpu_support_enabled = 1;
}
else {
config->gpu_ids = NULL;
config->is_gpu_support_enabled = 0;
}
return 0;
Expand All @@ -37,11 +35,10 @@ int execute_hook_to_activate_gpu_support(int verbose, const UdiRootConfig* udiCo
char* args[8];
args[0] = strdup("/bin/bash");
args[1] = script_path;
args[2] = strdup(udiConfig->gpu_config.gpu_ids);
args[3] = strdup(udiConfig->udiMountPoint);
args[4] = strdup(udiConfig->siteResources);
args[5] = verbose ? strdup("verbose-on") : strdup("verbose-off");
args[6] = NULL;
args[2] = strdup(udiConfig->udiMountPoint);
args[3] = strdup(udiConfig->siteResources);
args[4] = verbose ? strdup("verbose-on") : strdup("verbose-off");
args[5] = NULL;

ret = forkAndExecv(args);

Expand All @@ -58,12 +55,9 @@ int fprint_gpu_support_config(FILE* fp, const struct gpu_support_config* config)
{
size_t written = 0;
written += fprintf(fp, "***** GPU support config *****\n");
written += fprintf(fp, "gpu_ids = %s\n", config->gpu_ids);
written += fprintf(fp, "is_gpu_support_enabled = %d\n", config->is_gpu_support_enabled);
return written;
}

/*
 * Release the heap storage referenced by a gpu_support_config.
 *
 * Frees config->gpu_ids and resets the pointer to NULL, so a second call on
 * the same struct passes NULL to free() instead of freeing the same pointer
 * twice. The struct itself is not freed here.
 *
 * config is dereferenced unconditionally and therefore must not be NULL.
 */
void free_gpu_support_config(struct gpu_support_config* config) {
free(config->gpu_ids);
config->gpu_ids = NULL;
}
1 change: 0 additions & 1 deletion src/gpu_support.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ extern "C" {
struct _UdiRootConfig;

/* GPU support settings filled in by parse_gpu_env(). */
struct gpu_support_config {
/* strdup'd copy of the cuda_visible_devices string when GPU support is
 * enabled, NULL otherwise; released by free_gpu_support_config(). */
char* gpu_ids;
/* Set to 1 when GPU support is enabled, 0 when it is not. */
int is_gpu_support_enabled;
};

Expand Down
4 changes: 3 additions & 1 deletion src/setupRoot.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ int main(int argc, char **argv) {
UdiRootConfig udiConfig;
SetupRootConfig config;
ImageData image;
struct gpu_support_config gpu_config = {};

memset(&udiConfig, 0, sizeof(UdiRootConfig));
memset(&config, 0, sizeof(SetupRootConfig));
Expand All @@ -112,6 +111,9 @@ int main(int argc, char **argv) {
fprintf(stderr, "FAILED to parse udiRoot configuration. Exiting.\n");
exit(1);
}

udiConfig.gpu_config.is_gpu_support_enabled = 1; //always attempt to enable GPU support

udiConfig.target_uid = config.uid;
udiConfig.target_gid = config.gid;
udiConfig.auxiliary_gids = shifter_getgrouplist(config.user, udiConfig.target_gid, &(udiConfig.nauxiliary_gids));
Expand Down
5 changes: 0 additions & 5 deletions src/test/test_gpu_support.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,39 +15,34 @@ TEST(GPUSupportTestGroup, parseGPUenv_test) {
{
parse_gpu_env(&config);
CHECK(config.is_gpu_support_enabled == 0);
CHECK(config.gpu_ids == NULL);
free_gpu_support_config(&config);
}
// CUDA_VISIBLE_DEVICES= (no value)
{
setenv("CUDA_VISIBLE_DEVICES", "", 1);
parse_gpu_env(&config);
CHECK(config.is_gpu_support_enabled == 0);
CHECK(config.gpu_ids == NULL);
free_gpu_support_config(&config);
}
// CUDA_VISIBLE_DEVICES=NoDevFiles
{
setenv("CUDA_VISIBLE_DEVICES", "NoDevFiles", 1);
parse_gpu_env(&config);
CHECK(config.is_gpu_support_enabled == 0);
CHECK(config.gpu_ids == NULL);
free_gpu_support_config(&config);
}
// CUDA_VISIBLE_DEVICES=0
{
setenv("CUDA_VISIBLE_DEVICES", "0", 1);
parse_gpu_env(&config);
CHECK(config.is_gpu_support_enabled == 1);
CHECK(config.gpu_ids == std::string("0"));
free_gpu_support_config(&config);
}
// CUDA_VISIBLE_DEVICES=0,1
{
setenv("CUDA_VISIBLE_DEVICES", "0,1", 1);
parse_gpu_env(&config);
CHECK(config.is_gpu_support_enabled == 1);
CHECK(config.gpu_ids == std::string("0,1"));
free_gpu_support_config(&config);
}
}
Expand Down