Use PyTorch's p2p access enable function (pytorch#2000)

Summary: Pull Request resolved: pytorch#2000 We split the diff after adding a needed lazy CUDA init call in the enable-p2p-access function. Diff 1: D48939723 [PyTorch] Add the lazy init call for p2p access function *Prior context* cudaDeviceEnablePeerAccess only enables cross-device access for memory allocated with cudaMalloc. When using other CUDA APIs such as cuMemMap, peer access is managed differently. expandable_segments:True in PyTorch uses cuMemMap, so code that just calls cudaDeviceEnablePeerAccess is not sufficient to enable cross-device copies. This patch switches the p2p access enabling functions to use PyTorch's `get_p2p_access`, which lets its allocator figure out how to correctly enable p2p access for that memory. In the normal case (expandable_segments:False), this code performs exactly the same CUDA calls as before. Reviewed By: zdevito Differential Revision: D49021817 fbshipit-source-id: 7ffb4b477b1d1cddccc891dd9fc8f9a2a986585e
q10 · Sep 14, 2023 · 14cf6f2 · 14cf6f2
1 parent 66a53cc
commit 14cf6f2
Showing 1 changed file with 2 additions and 9 deletions.
diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp
@@ -10,6 +10,7 @@
 #include <ATen/core/op_registration/op_registration.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAEvent.h>
+#include <ATen/cuda/PeerToPeerAccess.h>
 #include <ATen/native/TensorAdvancedIndexing.h>
 #include <c10/core/Device.h>
 #include <c10/core/TensorOptions.h>
@@ -562,15 +563,7 @@ void init_p2p_access() {
     for (const auto i : c10::irange(at::cuda::getNumGPUs())) {
       for (const auto j : c10::irange(at::cuda::getNumGPUs())) {
         if (i != j) {
-          at::cuda::CUDAGuard g(i);
-          const auto err =
-              C10_CUDA_ERROR_HANDLED(cudaDeviceEnablePeerAccess(j, 0));
-          if (err == cudaErrorPeerAccessAlreadyEnabled) {
-            // ignore and clear the error if access was already enabled
-            C10_CUDA_CLEAR_ERROR();
-          } else {
-            AT_CUDA_CHECK(err);
-          }
+          AT_ASSERT(at::cuda::get_p2p_access(i, j));
         }
       }
     }