Skip to content

Commit

Permalink
Optimise CudaCompute::GetMaximalLength() #92
Browse files Browse the repository at this point in the history
  • Loading branch information
onurulgen committed Oct 31, 2023
1 parent 5eb3163 commit 1c315f1
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 33 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
355
356
4 changes: 3 additions & 1 deletion reg-lib/cuda/CudaCompute.cu
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,9 @@ void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int ac
double CudaCompute::GetMaximalLength(bool optimiseX, bool optimiseY, bool optimiseZ) {
if (!optimiseX && !optimiseY && !optimiseZ) return 0;
CudaF3dContent& con = dynamic_cast<CudaF3dContent&>(this->con);
const size_t voxelsPerVolume = NiftiImage::calcVoxelNumber(con.F3dContent::GetTransformationGradient(), 3);
nifti_image *transGrad = con.F3dContent::GetTransformationGradient();
const size_t voxelsPerVolume = NiftiImage::calcVoxelNumber(transGrad, 3);
if (transGrad->nz <= 1) optimiseZ = false;
return Cuda::GetMaximalLength(con.GetTransformationGradientCuda(), voxelsPerVolume, optimiseX, optimiseY, optimiseZ);
}
/* *************************************************************** */
Expand Down
65 changes: 34 additions & 31 deletions reg-lib/cuda/CudaNormaliseGradient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,44 +2,47 @@
#include "_reg_tools_gpu.h"

/* *************************************************************** */
__global__ static void GetMaximalLengthKernel(float *dists,
cudaTextureObject_t imageTexture,
const unsigned nVoxels,
const bool optimiseX,
const bool optimiseY,
const bool optimiseZ) {
const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (tid < nVoxels) {
float4 gradValue = tex1Dfetch<float4>(imageTexture, tid);
dists[tid] = sqrtf((optimiseX ? Square(gradValue.x) : 0) +
(optimiseY ? Square(gradValue.y) : 0) +
(optimiseZ ? Square(gradValue.z) : 0));
}
template<bool optimiseX, bool optimiseY, bool optimiseZ>
float GetMaximalLength(const float4 *imageCuda, const size_t nVoxels) {
auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear,
nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4);
auto imageTexture = *imageTexturePtr;
thrust::counting_iterator<unsigned> index(0);
return thrust::transform_reduce(thrust::device, index, index + nVoxels, [=]__device__(const unsigned index) {
const float4& val = tex1Dfetch<float4>(imageTexture, index);
return sqrtf((optimiseX ? Square(val.x) : 0) +
(optimiseY ? Square(val.y) : 0) +
(optimiseZ ? Square(val.z) : 0));
}, 0.f, thrust::maximum<float>());
}
/* *************************************************************** */
template<bool optimiseX, bool optimiseY>
static inline float GetMaximalLength(const float4 *imageCuda,
const size_t nVoxels,
const bool optimiseZ) {
auto getMaximalLength = GetMaximalLength<optimiseX, optimiseY, true>;
if (!optimiseZ) getMaximalLength = GetMaximalLength<optimiseX, optimiseY, false>;
return getMaximalLength(imageCuda, nVoxels);
}
/* *************************************************************** */
template<bool optimiseX>
static inline float GetMaximalLength(const float4 *imageCuda,
const size_t nVoxels,
const bool optimiseY,
const bool optimiseZ) {
auto getMaximalLength = GetMaximalLength<optimiseX, true>;
if (!optimiseY) getMaximalLength = GetMaximalLength<optimiseX, false>;
return getMaximalLength(imageCuda, nVoxels, optimiseZ);
}
/* *************************************************************** */
float NiftyReg::Cuda::GetMaximalLength(const float4 *imageCuda,
const size_t nVoxels,
const bool optimiseX,
const bool optimiseY,
const bool optimiseZ) {
// Create a texture object for the imageCuda
auto imageTexture = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear,
nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4);

float *dists = nullptr;
NR_CUDA_SAFE_CALL(cudaMalloc(&dists, nVoxels * sizeof(float)));

const unsigned threads = CudaContext::GetBlockSize()->GetMaximalLength;
const unsigned blocks = static_cast<unsigned>(Ceil(sqrtf(static_cast<float>(nVoxels) / static_cast<float>(threads))));
dim3 blockDims(threads, 1, 1);
dim3 gridDims(blocks, blocks, 1);
GetMaximalLengthKernel<<<gridDims, blockDims>>>(dists, *imageTexture, static_cast<unsigned>(nVoxels), optimiseX, optimiseY, optimiseZ);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);

const float maxDistance = reg_maxReduction_gpu(dists, nVoxels);
NR_CUDA_SAFE_CALL(cudaFree(dists));

return maxDistance;
auto getMaximalLength = ::GetMaximalLength<true>;
if (!optimiseX) getMaximalLength = ::GetMaximalLength<false>;
return getMaximalLength(imageCuda, nVoxels, optimiseY, optimiseZ);
}
/* *************************************************************** */
template<bool optimiseX, bool optimiseY, bool optimiseZ>
Expand Down

0 comments on commit 1c315f1

Please sign in to comment.