diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt index 8941db59..53d5a5ad 100644 --- a/niftyreg_build_version.txt +++ b/niftyreg_build_version.txt @@ -1 +1 @@ -355 +356 diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu index cae2fd12..f255b635 100644 --- a/reg-lib/cuda/CudaCompute.cu +++ b/reg-lib/cuda/CudaCompute.cu @@ -132,7 +132,9 @@ void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int ac double CudaCompute::GetMaximalLength(bool optimiseX, bool optimiseY, bool optimiseZ) { if (!optimiseX && !optimiseY && !optimiseZ) return 0; CudaF3dContent& con = dynamic_cast(this->con); - const size_t voxelsPerVolume = NiftiImage::calcVoxelNumber(con.F3dContent::GetTransformationGradient(), 3); + nifti_image *transGrad = con.F3dContent::GetTransformationGradient(); + const size_t voxelsPerVolume = NiftiImage::calcVoxelNumber(transGrad, 3); + if (transGrad->nz <= 1) optimiseZ = false; return Cuda::GetMaximalLength(con.GetTransformationGradientCuda(), voxelsPerVolume, optimiseX, optimiseY, optimiseZ); } /* *************************************************************** */ diff --git a/reg-lib/cuda/CudaNormaliseGradient.cu b/reg-lib/cuda/CudaNormaliseGradient.cu index 62b2aa64..c61ecb13 100644 --- a/reg-lib/cuda/CudaNormaliseGradient.cu +++ b/reg-lib/cuda/CudaNormaliseGradient.cu @@ -2,19 +2,37 @@ #include "_reg_tools_gpu.h" /* *************************************************************** */ -__global__ static void GetMaximalLengthKernel(float *dists, - cudaTextureObject_t imageTexture, - const unsigned nVoxels, - const bool optimiseX, - const bool optimiseY, - const bool optimiseZ) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < nVoxels) { - float4 gradValue = tex1Dfetch(imageTexture, tid); - dists[tid] = sqrtf((optimiseX ? Square(gradValue.x) : 0) + - (optimiseY ? Square(gradValue.y) : 0) + - (optimiseZ ? Square(gradValue.z) : 0)); - } +template +float GetMaximalLength(const float4 *imageCuda, const size_t nVoxels) { + auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear, + nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto imageTexture = *imageTexturePtr; + thrust::counting_iterator index(0); + return thrust::transform_reduce(thrust::device, index, index + nVoxels, [=]__device__(const unsigned index) { + const float4& val = tex1Dfetch(imageTexture, index); + return sqrtf((optimiseX ? Square(val.x) : 0) + + (optimiseY ? Square(val.y) : 0) + + (optimiseZ ? Square(val.z) : 0)); + }, 0.f, thrust::maximum()); +} +/* *************************************************************** */ +template +static inline float GetMaximalLength(const float4 *imageCuda, + const size_t nVoxels, + const bool optimiseZ) { + auto getMaximalLength = GetMaximalLength; + if (!optimiseZ) getMaximalLength = GetMaximalLength; + return getMaximalLength(imageCuda, nVoxels); +} +/* *************************************************************** */ +template +static inline float GetMaximalLength(const float4 *imageCuda, + const size_t nVoxels, + const bool optimiseY, + const bool optimiseZ) { + auto getMaximalLength = GetMaximalLength; + if (!optimiseY) getMaximalLength = GetMaximalLength; + return getMaximalLength(imageCuda, nVoxels, optimiseZ); } /* *************************************************************** */ float NiftyReg::Cuda::GetMaximalLength(const float4 *imageCuda, @@ -22,24 +40,9 @@ float NiftyReg::Cuda::GetMaximalLength(const float4 *imageCuda, const bool optimiseX, const bool optimiseY, const bool optimiseZ) { - // Create a texture object for the imageCuda - auto imageTexture = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear, - nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); - - float *dists = nullptr; - NR_CUDA_SAFE_CALL(cudaMalloc(&dists, nVoxels * sizeof(float))); - - const unsigned threads = CudaContext::GetBlockSize()->GetMaximalLength; - const unsigned blocks = static_cast(Ceil(sqrtf(static_cast(nVoxels) / static_cast(threads)))); - dim3 blockDims(threads, 1, 1); - dim3 gridDims(blocks, blocks, 1); - GetMaximalLengthKernel<<>>(dists, *imageTexture, static_cast(nVoxels), optimiseX, optimiseY, optimiseZ); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - - const float maxDistance = reg_maxReduction_gpu(dists, nVoxels); - NR_CUDA_SAFE_CALL(cudaFree(dists)); - - return maxDistance; + auto getMaximalLength = ::GetMaximalLength; + if (!optimiseX) getMaximalLength = ::GetMaximalLength; + return getMaximalLength(imageCuda, nVoxels, optimiseY, optimiseZ); } /* *************************************************************** */ template