diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt index c2f53117..fae51388 100644 --- a/niftyreg_build_version.txt +++ b/niftyreg_build_version.txt @@ -1 +1 @@ -380 +381 diff --git a/reg-lib/cuda/BlockSize.hpp b/reg-lib/cuda/BlockSize.hpp index c72420e8..a0d2ea14 100644 --- a/reg-lib/cuda/BlockSize.hpp +++ b/reg-lib/cuda/BlockSize.hpp @@ -17,7 +17,6 @@ struct BlockSize { unsigned reg_affine_getDeformationField; unsigned GetApproxJacobianValues2d; unsigned GetApproxJacobianValues3d; - unsigned ApproxLinearEnergyGradient; unsigned GetJacobianValues2d; unsigned GetJacobianValues3d; unsigned LogSquaredValues; @@ -30,7 +29,6 @@ struct BlockSize { unsigned DefFieldCompose2d; unsigned DefFieldCompose3d; unsigned GetJacobianMatrix; - unsigned VoxelCentricToNodeCentric; unsigned ConvertNmiGradientFromVoxelToRealSpace; unsigned ApplyConvolutionWindowAlongX; unsigned ApplyConvolutionWindowAlongY; @@ -42,7 +40,6 @@ struct BlockSize100: public BlockSize { reg_affine_getDeformationField = 512; // 16 reg - 24 smem GetApproxJacobianValues2d = 384; // 17 reg - 104 smem - 36 cmem GetApproxJacobianValues3d = 256; // 27 reg - 356 smem - 108 cmem - ApproxLinearEnergyGradient = 384; // 40 reg GetJacobianValues2d = 256; // 29 reg - 32 smem - 16 cmem - 32 lmem GetJacobianValues3d = 192; // 41 reg - 6176 smem - 20 cmem - 32 lmem LogSquaredValues = 384; // 07 reg - 24 smem - 36 cmem @@ -55,7 +52,6 @@ struct BlockSize100: public BlockSize { DefFieldCompose2d = 512; // 15 reg - 24 smem - 08 cmem - 16 lmem DefFieldCompose3d = 384; // 21 reg - 24 smem - 08 cmem - 24 lmem GetJacobianMatrix = 512; // 16 reg - 24 smem - 04 cmem - VoxelCentricToNodeCentric = 320; // 11 reg - 24 smem - 16 cmem ConvertNmiGradientFromVoxelToRealSpace = 512; // 16 reg - 24 smem ApplyConvolutionWindowAlongX = 512; // 14 reg - 28 smem - 08 cmem ApplyConvolutionWindowAlongY = 512; // 14 reg - 28 smem - 08 cmem @@ -69,7 +65,6 @@ struct BlockSize300: public BlockSize { reg_affine_getDeformationField = 1024; // 23 reg GetApproxJacobianValues2d = 768; // 34 reg GetApproxJacobianValues3d = 640; // 46 reg - ApproxLinearEnergyGradient = 768; // 40 reg GetJacobianValues2d = 768; // 34 reg GetJacobianValues3d = 768; // 34 reg LogSquaredValues = 1024; // 23 reg @@ -82,7 +77,6 @@ struct BlockSize300: public BlockSize { DefFieldCompose2d = 1024; // 23 reg DefFieldCompose3d = 1024; // 24 reg GetJacobianMatrix = 768; // 34 reg - VoxelCentricToNodeCentric = 1024; // 23 reg ConvertNmiGradientFromVoxelToRealSpace = 1024; // 23 reg ApplyConvolutionWindowAlongX = 1024; // 25 reg ApplyConvolutionWindowAlongY = 1024; // 25 reg diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu index 5d663a4f..7b49be10 100644 --- a/reg-lib/cuda/CudaCompute.cu +++ b/reg-lib/cuda/CudaCompute.cu @@ -268,12 +268,15 @@ void CudaCompute::ConvolveImage(const nifti_image *image, float4 *imageCuda) { void CudaCompute::VoxelCentricToNodeCentric(float weight) { CudaF3dContent& con = dynamic_cast(this->con); const mat44 *reorientation = Content::GetIJKMatrix(*con.Content::GetFloating()); - Cuda::VoxelCentricToNodeCentric(con.F3dContent::GetTransformationGradient(), - con.F3dContent::GetVoxelBasedMeasureGradient(), - con.GetTransformationGradientCuda(), - con.GetVoxelBasedMeasureGradientCuda(), - weight, - reorientation); + const nifti_image *transGrad = con.F3dContent::GetTransformationGradient(); + auto voxelCentricToNodeCentric = transGrad->nz > 1 ? Cuda::VoxelCentricToNodeCentric : + Cuda::VoxelCentricToNodeCentric; + voxelCentricToNodeCentric(transGrad, + con.F3dContent::GetVoxelBasedMeasureGradient(), + con.GetTransformationGradientCuda(), + con.GetVoxelBasedMeasureGradientCuda(), + weight, + reorientation); } /* *************************************************************** */ void CudaCompute::ConvolveVoxelBasedMeasureGradient(float weight) { diff --git a/reg-lib/cuda/CudaTools.cu b/reg-lib/cuda/CudaTools.cu index a8ee68ad..c84cf344 100644 --- a/reg-lib/cuda/CudaTools.cu +++ b/reg-lib/cuda/CudaTools.cu @@ -17,18 +17,19 @@ /* *************************************************************** */ namespace NiftyReg::Cuda { /* *************************************************************** */ +template void VoxelCentricToNodeCentric(const nifti_image *nodeImage, const nifti_image *voxelImage, float4 *nodeImageCuda, float4 *voxelImageCuda, float weight, const mat44 *voxelToMillimetre) { - const bool is3d = nodeImage->nz > 1; const size_t nodeNumber = NiftiImage::calcVoxelNumber(nodeImage, 3); const size_t voxelNumber = NiftiImage::calcVoxelNumber(voxelImage, 3); const int3 nodeImageDims = make_int3(nodeImage->nx, nodeImage->ny, nodeImage->nz); const int3 voxelImageDims = make_int3(voxelImage->nx, voxelImage->ny, voxelImage->nz); - auto voxelImageTexture = Cuda::CreateTextureObject(voxelImageCuda, voxelNumber, cudaChannelFormatKindFloat, 4); + auto voxelImageTexturePtr = Cuda::CreateTextureObject(voxelImageCuda, voxelNumber, cudaChannelFormatKindFloat, 4); + auto voxelImageTexture = *voxelImageTexturePtr; // The transformation between the image and the grid mat44 transformation; @@ -69,15 +70,12 @@ void VoxelCentricToNodeCentric(const nifti_image *nodeImage, weight *= ratio[i]; } - const unsigned blocks = CudaContext::GetBlockSize()->VoxelCentricToNodeCentric; - const unsigned grids = (unsigned)Ceil(sqrtf((float)nodeNumber / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - auto voxelCentricToNodeCentricKernel = is3d ? VoxelCentricToNodeCentricKernel : VoxelCentricToNodeCentricKernel; - voxelCentricToNodeCentricKernel<<>>(nodeImageCuda, *voxelImageTexture, (unsigned)nodeNumber, nodeImageDims, - voxelImageDims, weight, transformation, reorientation); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); + thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nodeNumber, [=]__device__(const int index) { + VoxelCentricToNodeCentricKernel(nodeImageCuda, voxelImageTexture, nodeImageDims, voxelImageDims, weight, transformation, reorientation, index); + }); } +template void VoxelCentricToNodeCentric(const nifti_image*, const nifti_image*, float4*, float4*, float, const mat44*); +template void VoxelCentricToNodeCentric(const nifti_image*, const nifti_image*, float4*, float4*, float, const mat44*); /* *************************************************************** */ void ConvertNmiGradientFromVoxelToRealSpace(const mat44 *sourceMatrixXYZ, const nifti_image *controlPointImage, diff --git a/reg-lib/cuda/CudaTools.hpp b/reg-lib/cuda/CudaTools.hpp index 14e68a24..8dfcbf6d 100644 --- a/reg-lib/cuda/CudaTools.hpp +++ b/reg-lib/cuda/CudaTools.hpp @@ -18,6 +18,7 @@ /* *************************************************************** */ namespace NiftyReg::Cuda { /* *************************************************************** */ +template void VoxelCentricToNodeCentric(const nifti_image *nodeImage, const nifti_image *voxelImage, float4 *nodeImageCuda, diff --git a/reg-lib/cuda/CudaToolsKernels.cu b/reg-lib/cuda/CudaToolsKernels.cu index 54a415ba..fc38446e 100644 --- a/reg-lib/cuda/CudaToolsKernels.cu +++ b/reg-lib/cuda/CudaToolsKernels.cu @@ -14,18 +14,16 @@ namespace NiftyReg::Cuda { /* *************************************************************** */ template -__global__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda, +__device__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda, cudaTextureObject_t voxelImageTexture, - const unsigned nodeNumber, const int3 nodeImageDims, const int3 voxelImageDims, const float weight, const mat44 transformation, - const mat33 reorientation) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid >= nodeNumber) return; + const mat33 reorientation, + const int index) { // Calculate the node coordinates - const auto [x, y, z] = reg_indexToDims_cuda(tid, nodeImageDims); + const auto [x, y, z] = reg_indexToDims_cuda(index, nodeImageDims); // Transform into voxel coordinates float voxelCoord[3], nodeCoord[3] = { static_cast(x), static_cast(y), static_cast(z) }; reg_mat44_mul_cuda(transformation, nodeCoord, voxelCoord); @@ -67,7 +65,7 @@ __global__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda, float reorientedValue[3]; reg_mat33_mul_cuda(reorientation, interpolatedValue, weight, reorientedValue); - nodeImageCuda[tid] = { reorientedValue[0], reorientedValue[1], reorientedValue[2], 0 }; + nodeImageCuda[index] = { reorientedValue[0], reorientedValue[1], reorientedValue[2], 0 }; } /* *************************************************************** */ __global__ void ConvertNmiGradientFromVoxelToRealSpaceKernel(float4 *gradient, const mat44 matrix, const unsigned nodeNumber) {