Skip to content

Commit

Permalink
Optimise Cuda::VoxelCentricToNodeCentric() #92
Browse files Browse the repository at this point in the history
  • Loading branch information
onurulgen committed Jan 9, 2024
1 parent 29647ad commit a73014e
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 30 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
380
381
6 changes: 0 additions & 6 deletions reg-lib/cuda/BlockSize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ struct BlockSize {
unsigned reg_affine_getDeformationField;
unsigned GetApproxJacobianValues2d;
unsigned GetApproxJacobianValues3d;
unsigned ApproxLinearEnergyGradient;
unsigned GetJacobianValues2d;
unsigned GetJacobianValues3d;
unsigned LogSquaredValues;
Expand All @@ -30,7 +29,6 @@ struct BlockSize {
unsigned DefFieldCompose2d;
unsigned DefFieldCompose3d;
unsigned GetJacobianMatrix;
unsigned VoxelCentricToNodeCentric;
unsigned ConvertNmiGradientFromVoxelToRealSpace;
unsigned ApplyConvolutionWindowAlongX;
unsigned ApplyConvolutionWindowAlongY;
Expand All @@ -42,7 +40,6 @@ struct BlockSize100: public BlockSize {
reg_affine_getDeformationField = 512; // 16 reg - 24 smem
GetApproxJacobianValues2d = 384; // 17 reg - 104 smem - 36 cmem
GetApproxJacobianValues3d = 256; // 27 reg - 356 smem - 108 cmem
ApproxLinearEnergyGradient = 384; // 40 reg
GetJacobianValues2d = 256; // 29 reg - 32 smem - 16 cmem - 32 lmem
GetJacobianValues3d = 192; // 41 reg - 6176 smem - 20 cmem - 32 lmem
LogSquaredValues = 384; // 07 reg - 24 smem - 36 cmem
Expand All @@ -55,7 +52,6 @@ struct BlockSize100: public BlockSize {
DefFieldCompose2d = 512; // 15 reg - 24 smem - 08 cmem - 16 lmem
DefFieldCompose3d = 384; // 21 reg - 24 smem - 08 cmem - 24 lmem
GetJacobianMatrix = 512; // 16 reg - 24 smem - 04 cmem
VoxelCentricToNodeCentric = 320; // 11 reg - 24 smem - 16 cmem
ConvertNmiGradientFromVoxelToRealSpace = 512; // 16 reg - 24 smem
ApplyConvolutionWindowAlongX = 512; // 14 reg - 28 smem - 08 cmem
ApplyConvolutionWindowAlongY = 512; // 14 reg - 28 smem - 08 cmem
Expand All @@ -69,7 +65,6 @@ struct BlockSize300: public BlockSize {
reg_affine_getDeformationField = 1024; // 23 reg
GetApproxJacobianValues2d = 768; // 34 reg
GetApproxJacobianValues3d = 640; // 46 reg
ApproxLinearEnergyGradient = 768; // 40 reg
GetJacobianValues2d = 768; // 34 reg
GetJacobianValues3d = 768; // 34 reg
LogSquaredValues = 1024; // 23 reg
Expand All @@ -82,7 +77,6 @@ struct BlockSize300: public BlockSize {
DefFieldCompose2d = 1024; // 23 reg
DefFieldCompose3d = 1024; // 24 reg
GetJacobianMatrix = 768; // 34 reg
VoxelCentricToNodeCentric = 1024; // 23 reg
ConvertNmiGradientFromVoxelToRealSpace = 1024; // 23 reg
ApplyConvolutionWindowAlongX = 1024; // 25 reg
ApplyConvolutionWindowAlongY = 1024; // 25 reg
Expand Down
15 changes: 9 additions & 6 deletions reg-lib/cuda/CudaCompute.cu
Original file line number Diff line number Diff line change
Expand Up @@ -268,12 +268,15 @@ void CudaCompute::ConvolveImage(const nifti_image *image, float4 *imageCuda) {
void CudaCompute::VoxelCentricToNodeCentric(float weight) {
CudaF3dContent& con = dynamic_cast<CudaF3dContent&>(this->con);
const mat44 *reorientation = Content::GetIJKMatrix(*con.Content::GetFloating());
Cuda::VoxelCentricToNodeCentric(con.F3dContent::GetTransformationGradient(),
con.F3dContent::GetVoxelBasedMeasureGradient(),
con.GetTransformationGradientCuda(),
con.GetVoxelBasedMeasureGradientCuda(),
weight,
reorientation);
const nifti_image *transGrad = con.F3dContent::GetTransformationGradient();
auto voxelCentricToNodeCentric = transGrad->nz > 1 ? Cuda::VoxelCentricToNodeCentric<true> :
Cuda::VoxelCentricToNodeCentric<false>;
voxelCentricToNodeCentric(transGrad,
con.F3dContent::GetVoxelBasedMeasureGradient(),
con.GetTransformationGradientCuda(),
con.GetVoxelBasedMeasureGradientCuda(),
weight,
reorientation);
}
/* *************************************************************** */
void CudaCompute::ConvolveVoxelBasedMeasureGradient(float weight) {
Expand Down
18 changes: 8 additions & 10 deletions reg-lib/cuda/CudaTools.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,19 @@
/* *************************************************************** */
namespace NiftyReg::Cuda {
/* *************************************************************** */
template<bool is3d>
void VoxelCentricToNodeCentric(const nifti_image *nodeImage,
const nifti_image *voxelImage,
float4 *nodeImageCuda,
float4 *voxelImageCuda,
float weight,
const mat44 *voxelToMillimetre) {
const bool is3d = nodeImage->nz > 1;
const size_t nodeNumber = NiftiImage::calcVoxelNumber(nodeImage, 3);
const size_t voxelNumber = NiftiImage::calcVoxelNumber(voxelImage, 3);
const int3 nodeImageDims = make_int3(nodeImage->nx, nodeImage->ny, nodeImage->nz);
const int3 voxelImageDims = make_int3(voxelImage->nx, voxelImage->ny, voxelImage->nz);
auto voxelImageTexture = Cuda::CreateTextureObject(voxelImageCuda, voxelNumber, cudaChannelFormatKindFloat, 4);
auto voxelImageTexturePtr = Cuda::CreateTextureObject(voxelImageCuda, voxelNumber, cudaChannelFormatKindFloat, 4);
auto voxelImageTexture = *voxelImageTexturePtr;

// The transformation between the image and the grid
mat44 transformation;
Expand Down Expand Up @@ -69,15 +70,12 @@ void VoxelCentricToNodeCentric(const nifti_image *nodeImage,
weight *= ratio[i];
}

const unsigned blocks = CudaContext::GetBlockSize()->VoxelCentricToNodeCentric;
const unsigned grids = (unsigned)Ceil(sqrtf((float)nodeNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
auto voxelCentricToNodeCentricKernel = is3d ? VoxelCentricToNodeCentricKernel<true> : VoxelCentricToNodeCentricKernel<false>;
voxelCentricToNodeCentricKernel<<<gridDims, blockDims>>>(nodeImageCuda, *voxelImageTexture, (unsigned)nodeNumber, nodeImageDims,
voxelImageDims, weight, transformation, reorientation);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nodeNumber, [=]__device__(const int index) {
VoxelCentricToNodeCentricKernel<is3d>(nodeImageCuda, voxelImageTexture, nodeImageDims, voxelImageDims, weight, transformation, reorientation, index);
});
}
template void VoxelCentricToNodeCentric<false>(const nifti_image*, const nifti_image*, float4*, float4*, float, const mat44*);
template void VoxelCentricToNodeCentric<true>(const nifti_image*, const nifti_image*, float4*, float4*, float, const mat44*);
/* *************************************************************** */
void ConvertNmiGradientFromVoxelToRealSpace(const mat44 *sourceMatrixXYZ,
const nifti_image *controlPointImage,
Expand Down
1 change: 1 addition & 0 deletions reg-lib/cuda/CudaTools.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
/* *************************************************************** */
namespace NiftyReg::Cuda {
/* *************************************************************** */
template<bool is3d>
void VoxelCentricToNodeCentric(const nifti_image *nodeImage,
const nifti_image *voxelImage,
float4 *nodeImageCuda,
Expand Down
12 changes: 5 additions & 7 deletions reg-lib/cuda/CudaToolsKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,16 @@
namespace NiftyReg::Cuda {
/* *************************************************************** */
template<bool is3d>
__global__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda,
__device__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda,
cudaTextureObject_t voxelImageTexture,
const unsigned nodeNumber,
const int3 nodeImageDims,
const int3 voxelImageDims,
const float weight,
const mat44 transformation,
const mat33 reorientation) {
const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (tid >= nodeNumber) return;
const mat33 reorientation,
const int index) {
// Calculate the node coordinates
const auto [x, y, z] = reg_indexToDims_cuda<is3d>(tid, nodeImageDims);
const auto [x, y, z] = reg_indexToDims_cuda<is3d>(index, nodeImageDims);
// Transform into voxel coordinates
float voxelCoord[3], nodeCoord[3] = { static_cast<float>(x), static_cast<float>(y), static_cast<float>(z) };
reg_mat44_mul_cuda<is3d>(transformation, nodeCoord, voxelCoord);
Expand Down Expand Up @@ -67,7 +65,7 @@ __global__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda,

float reorientedValue[3];
reg_mat33_mul_cuda<is3d>(reorientation, interpolatedValue, weight, reorientedValue);
nodeImageCuda[tid] = { reorientedValue[0], reorientedValue[1], reorientedValue[2], 0 };
nodeImageCuda[index] = { reorientedValue[0], reorientedValue[1], reorientedValue[2], 0 };
}
/* *************************************************************** */
__global__ void ConvertNmiGradientFromVoxelToRealSpaceKernel(float4 *gradient, const mat44 matrix, const unsigned nodeNumber) {
Expand Down

0 comments on commit a73014e

Please sign in to comment.