Optimise CudaCompute::GetMaximalLength() #92

KCL-BMEIS · Oct 31, 2023 · 1c315f1 · 1c315f1
1 parent 5eb3163
commit 1c315f1
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 33 deletions.
diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt
@@ -1 +1 @@
-355
+356
diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu
@@ -132,7 +132,9 @@ void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int ac
 double CudaCompute::GetMaximalLength(bool optimiseX, bool optimiseY, bool optimiseZ) {
     if (!optimiseX && !optimiseY && !optimiseZ) return 0;
     CudaF3dContent& con = dynamic_cast<CudaF3dContent&>(this->con);
-    const size_t voxelsPerVolume = NiftiImage::calcVoxelNumber(con.F3dContent::GetTransformationGradient(), 3);
+    nifti_image *transGrad = con.F3dContent::GetTransformationGradient();
+    const size_t voxelsPerVolume = NiftiImage::calcVoxelNumber(transGrad, 3);
+    if (transGrad->nz <= 1) optimiseZ = false;
     return Cuda::GetMaximalLength(con.GetTransformationGradientCuda(), voxelsPerVolume, optimiseX, optimiseY, optimiseZ);
 }
 /* *************************************************************** */

diff --git a/reg-lib/cuda/CudaNormaliseGradient.cu b/reg-lib/cuda/CudaNormaliseGradient.cu
@@ -2,44 +2,47 @@
 #include "_reg_tools_gpu.h"
 
 /* *************************************************************** */
-__global__ static void GetMaximalLengthKernel(float *dists,
-                                              cudaTextureObject_t imageTexture,
-                                              const unsigned nVoxels,
-                                              const bool optimiseX,
-                                              const bool optimiseY,
-                                              const bool optimiseZ) {
-    const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    if (tid < nVoxels) {
-        float4 gradValue = tex1Dfetch<float4>(imageTexture, tid);
-        dists[tid] = sqrtf((optimiseX ? Square(gradValue.x) : 0) +
-                           (optimiseY ? Square(gradValue.y) : 0) +
-                           (optimiseZ ? Square(gradValue.z) : 0));
-    }
+template<bool optimiseX, bool optimiseY, bool optimiseZ>
+float GetMaximalLength(const float4 *imageCuda, const size_t nVoxels) {
+    auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear,
+                                                     nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4);
+    auto imageTexture = *imageTexturePtr;
+    thrust::counting_iterator<unsigned> index(0);
+    return thrust::transform_reduce(thrust::device, index, index + nVoxels, [=]__device__(const unsigned index) {
+        const float4& val = tex1Dfetch<float4>(imageTexture, index);
+        return sqrtf((optimiseX ? Square(val.x) : 0) +
+                     (optimiseY ? Square(val.y) : 0) +
+                     (optimiseZ ? Square(val.z) : 0));
+    }, 0.f, thrust::maximum<float>());
+}
+/* *************************************************************** */
+template<bool optimiseX, bool optimiseY>
+static inline float GetMaximalLength(const float4 *imageCuda,
+                                     const size_t nVoxels,
+                                     const bool optimiseZ) {
+    auto getMaximalLength = GetMaximalLength<optimiseX, optimiseY, true>;
+    if (!optimiseZ) getMaximalLength = GetMaximalLength<optimiseX, optimiseY, false>;
+    return getMaximalLength(imageCuda, nVoxels);
+}
+/* *************************************************************** */
+template<bool optimiseX>
+static inline float GetMaximalLength(const float4 *imageCuda,
+                                     const size_t nVoxels,
+                                     const bool optimiseY,
+                                     const bool optimiseZ) {
+    auto getMaximalLength = GetMaximalLength<optimiseX, true>;
+    if (!optimiseY) getMaximalLength = GetMaximalLength<optimiseX, false>;
+    return getMaximalLength(imageCuda, nVoxels, optimiseZ);
 }
 /* *************************************************************** */
 float NiftyReg::Cuda::GetMaximalLength(const float4 *imageCuda,
                                        const size_t nVoxels,
                                        const bool optimiseX,
                                        const bool optimiseY,
                                        const bool optimiseZ) {
-    // Create a texture object for the imageCuda
-    auto imageTexture = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear,
-                                                  nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4);
-
-    float *dists = nullptr;
-    NR_CUDA_SAFE_CALL(cudaMalloc(&dists, nVoxels * sizeof(float)));
-
-    const unsigned threads = CudaContext::GetBlockSize()->GetMaximalLength;
-    const unsigned blocks = static_cast<unsigned>(Ceil(sqrtf(static_cast<float>(nVoxels) / static_cast<float>(threads))));
-    dim3 blockDims(threads, 1, 1);
-    dim3 gridDims(blocks, blocks, 1);
-    GetMaximalLengthKernel<<<gridDims, blockDims>>>(dists, *imageTexture, static_cast<unsigned>(nVoxels), optimiseX, optimiseY, optimiseZ);
-    NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
-
-    const float maxDistance = reg_maxReduction_gpu(dists, nVoxels);
-    NR_CUDA_SAFE_CALL(cudaFree(dists));
-
-    return maxDistance;
+    auto getMaximalLength = ::GetMaximalLength<true>;
+    if (!optimiseX) getMaximalLength = ::GetMaximalLength<false>;
+    return getMaximalLength(imageCuda, nVoxels, optimiseY, optimiseZ);
 }
 /* *************************************************************** */
 template<bool optimiseX, bool optimiseY, bool optimiseZ>