#include <cuda_runtime.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]) {
  int myrank;
  float *val_device, *val_host;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  /* Assumes exactly two ranks on a node with at least two GPUs. */
  int otherRank = myrank == 0 ? 1 : 0;

  /* The part in question: first make current a device this rank does not
     otherwise use, then select the device that is meant to be mapped to
     this rank for all MPI invocations. */
  cudaSetDevice(otherRank);
  cudaSetDevice(myrank);

  int num = 1000000;
  val_host = (float *)malloc(sizeof(float) * num);
  cudaMalloc((void **)&val_device, sizeof(float) * num);

  for (int i = 0; i < 1; i++) {
    *val_host = -1.0;
    if (myrank != 0) {
      if (i == 0)
        printf("%s %d %s %f\n", "I am rank", myrank,
               "and my initial value is:", *val_host);
    }
    if (myrank == 0) {
      *val_host = 42.0;
      cudaMemcpy(val_device, val_host, sizeof(float), cudaMemcpyHostToDevice);
      if (i == 0)
        printf("%s %d %s %f\n", "I am rank", myrank,
               "and will broadcast value:", *val_host);
    }
    /* CUDA-aware broadcast: the device pointer is handed to MPI directly. */
    MPI_Bcast(val_device, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (myrank != 0) {
      cudaMemcpy(val_host, val_device, sizeof(float), cudaMemcpyDeviceToHost);
      if (i == 0)
        printf("%s %d %s %f\n", "I am rank", myrank,
               "and received broadcasted value:", *val_host);
    }
  }

  cudaFree(val_device);
  free(val_host);
  MPI_Finalize();
  return 0;
}
Then execute the above using two MPI ranks on a node with at least two GPUs available.
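For concreteness, a hypothetical way to build and launch the reproducer (the file name, wrapper names, and CUDA_HOME path are my assumptions, not part of the report; adjust for your MPI installation and scheduler):

# assumed build/launch commands; adapt include/library paths as needed
mpicc reproducer.c -I"${CUDA_HOME}/include" -L"${CUDA_HOME}/lib64" -lcudart -o reproducer
mpirun -np 2 ./reproducer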
It seems that the above example leads to data leaks for all CUDA-aware MPI implementations I have tried (Cray MPI, Open MPI, and MPICH, built with UCX or, I think but am not certain, OpenFabrics), and this is not specific to MPI_Bcast; using MPI_Send etc. has the same effect.
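For illustration, a minimal sketch (not part of the original reproducer) of the point-to-point variant: the MPI_Bcast in the loop would be replaced by a send/receive pair that still passes the device pointer straight to MPI.

/* Hypothetical variant of the loop body: same device buffers, but using
   point-to-point calls instead of the broadcast. */
if (myrank == 0)
  MPI_Send(val_device, 1, MPI_FLOAT, /*dest=*/1, /*tag=*/0, MPI_COMM_WORLD);
else
  MPI_Recv(val_device, 1, MPI_FLOAT, /*src=*/0, /*tag=*/0, MPI_COMM_WORLD,
           MPI_STATUS_IGNORE);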
A few things I have read seem to imply that standard MPI does not support mapping multiple GPUs to a single rank, and that something like MPI endpoints or NCCL (https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html#example-3-multiple-devices-per-thread) would have to be used in such a case; a rough sketch of that NCCL pattern is included below for reference.
However, I cannot find anything that states this explicitly, so it would be good if someone could confirm it.
If that turns out not to be the case, then the above reproducer is a bug report for this situation.
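For reference, my rough understanding of the "multiple devices per thread" pattern from the linked NCCL example is sketched below. This is a hedged reconstruction based on the NCCL documentation, not code from this report; error checking and data initialization are omitted.

/* Sketch of NCCL's multiple-devices-per-thread pattern: one process owns
   both GPUs and drives them through a communicator clique. */
#include <cuda_runtime.h>
#include <nccl.h>

int main(void) {
  const int nDev = 2;
  int devs[2] = {0, 1};
  ncclComm_t comms[2];
  cudaStream_t streams[2];
  float *buf[2];

  /* One communicator per device, all owned by this single process. */
  ncclCommInitAll(comms, nDev, devs);

  for (int i = 0; i < nDev; i++) {
    cudaSetDevice(devs[i]);
    cudaMalloc((void **)&buf[i], sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  /* In-place broadcast from clique rank 0 (device 0) to both devices; the
     group calls let one thread issue work for several communicators. */
  ncclGroupStart();
  for (int i = 0; i < nDev; i++)
    ncclBcast(buf[i], 1, ncclFloat, 0, comms[i], streams[i]);
  ncclGroupEnd();

  for (int i = 0; i < nDev; i++) {
    cudaSetDevice(devs[i]);
    cudaStreamSynchronize(streams[i]);
    cudaFree(buf[i]);
    cudaStreamDestroy(streams[i]);
  }
  for (int i = 0; i < nDev; i++)
    ncclCommDestroy(comms[i]);
  return 0;
}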
Thanks
To be precise: is the code example above, which maps two GPUs to a single MPI rank, valid, or is it undefined behavior under standard CUDA-aware MPI?
In particular, note the part where I set a CUDA device that I do not use in that rank before setting the device that I actually want mapped to this MPI rank for MPI invocations (the two calls are quoted below). According to e.g. these docs https://docs.open-mpi.org/en/v5.0.x/tuning-apps/networking/cuda.html#when-do-i-need-to-select-a-cuda-device
nothing is specified that tells me this is invalid.
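For clarity, these are the two calls in question, copied from the example above:

cudaSetDevice(otherRank); /* a device this rank never uses for its own buffers */
cudaSetDevice(myrank);    /* the device actually intended for this rank's MPI buffers */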