diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt
index c2f53117..fae51388 100644
--- a/niftyreg_build_version.txt
+++ b/niftyreg_build_version.txt
@@ -1 +1 @@
-380
+381
diff --git a/reg-lib/cuda/BlockSize.hpp b/reg-lib/cuda/BlockSize.hpp
index c72420e8..a0d2ea14 100644
--- a/reg-lib/cuda/BlockSize.hpp
+++ b/reg-lib/cuda/BlockSize.hpp
@@ -17,7 +17,6 @@ struct BlockSize {
     unsigned reg_affine_getDeformationField;
     unsigned GetApproxJacobianValues2d;
     unsigned GetApproxJacobianValues3d;
-    unsigned ApproxLinearEnergyGradient;
     unsigned GetJacobianValues2d;
     unsigned GetJacobianValues3d;
     unsigned LogSquaredValues;
@@ -30,7 +29,6 @@ struct BlockSize {
     unsigned DefFieldCompose2d;
     unsigned DefFieldCompose3d;
     unsigned GetJacobianMatrix;
-    unsigned VoxelCentricToNodeCentric;
     unsigned ConvertNmiGradientFromVoxelToRealSpace;
     unsigned ApplyConvolutionWindowAlongX;
     unsigned ApplyConvolutionWindowAlongY;
@@ -42,7 +40,6 @@ struct BlockSize100: public BlockSize {
         reg_affine_getDeformationField = 512; // 16 reg - 24 smem
         GetApproxJacobianValues2d = 384; // 17 reg - 104 smem - 36 cmem
         GetApproxJacobianValues3d = 256; // 27 reg - 356 smem - 108 cmem
-        ApproxLinearEnergyGradient = 384; // 40 reg
         GetJacobianValues2d = 256; // 29 reg - 32 smem - 16 cmem - 32 lmem
         GetJacobianValues3d = 192; // 41 reg - 6176 smem - 20 cmem - 32 lmem
         LogSquaredValues = 384; // 07 reg - 24 smem - 36 cmem
@@ -55,7 +52,6 @@ struct BlockSize100: public BlockSize {
         DefFieldCompose2d = 512; // 15 reg - 24 smem - 08 cmem - 16 lmem
         DefFieldCompose3d = 384; // 21 reg - 24 smem - 08 cmem - 24 lmem
         GetJacobianMatrix = 512; // 16 reg - 24 smem - 04 cmem
-        VoxelCentricToNodeCentric = 320; // 11 reg - 24 smem - 16 cmem
         ConvertNmiGradientFromVoxelToRealSpace = 512; // 16 reg - 24 smem
         ApplyConvolutionWindowAlongX = 512; // 14 reg - 28 smem - 08 cmem
         ApplyConvolutionWindowAlongY = 512; // 14 reg - 28 smem - 08 cmem
@@ -69,7 +65,6 @@ struct BlockSize300: public BlockSize {
         reg_affine_getDeformationField = 1024; // 23 reg
         GetApproxJacobianValues2d = 768; // 34 reg
         GetApproxJacobianValues3d = 640; // 46 reg
-        ApproxLinearEnergyGradient = 768; // 40 reg
         GetJacobianValues2d = 768; // 34 reg
         GetJacobianValues3d = 768; // 34 reg
         LogSquaredValues = 1024; // 23 reg
@@ -82,7 +77,6 @@ struct BlockSize300: public BlockSize {
         DefFieldCompose2d = 1024; // 23 reg
         DefFieldCompose3d = 1024; // 24 reg
         GetJacobianMatrix = 768; // 34 reg
-        VoxelCentricToNodeCentric = 1024; // 23 reg
         ConvertNmiGradientFromVoxelToRealSpace = 1024; // 23 reg
         ApplyConvolutionWindowAlongX = 1024; // 25 reg
         ApplyConvolutionWindowAlongY = 1024; // 25 reg
diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu
index 5d663a4f..7b49be10 100644
--- a/reg-lib/cuda/CudaCompute.cu
+++ b/reg-lib/cuda/CudaCompute.cu
@@ -268,12 +268,15 @@ void CudaCompute::ConvolveImage(const nifti_image *image, float4 *imageCuda) {
 void CudaCompute::VoxelCentricToNodeCentric(float weight) {
     CudaF3dContent& con = dynamic_cast<CudaF3dContent&>(this->con);
     const mat44 *reorientation = Content::GetIJKMatrix(*con.Content::GetFloating());
-    Cuda::VoxelCentricToNodeCentric(con.F3dContent::GetTransformationGradient(),
-                                    con.F3dContent::GetVoxelBasedMeasureGradient(),
-                                    con.GetTransformationGradientCuda(),
-                                    con.GetVoxelBasedMeasureGradientCuda(),
-                                    weight,
-                                    reorientation);
+    const nifti_image *transGrad = con.F3dContent::GetTransformationGradient();
+    auto voxelCentricToNodeCentric = transGrad->nz > 1 ? Cuda::VoxelCentricToNodeCentric<true> :
+                                                         Cuda::VoxelCentricToNodeCentric<false>;
+    voxelCentricToNodeCentric(transGrad,
+                              con.F3dContent::GetVoxelBasedMeasureGradient(),
+                              con.GetTransformationGradientCuda(),
+                              con.GetVoxelBasedMeasureGradientCuda(),
+                              weight,
+                              reorientation);
 }
 /* *************************************************************** */
 void CudaCompute::ConvolveVoxelBasedMeasureGradient(float weight) {
diff --git a/reg-lib/cuda/CudaTools.cu b/reg-lib/cuda/CudaTools.cu
index a8ee68ad..c84cf344 100644
--- a/reg-lib/cuda/CudaTools.cu
+++ b/reg-lib/cuda/CudaTools.cu
@@ -17,18 +17,19 @@
 /* *************************************************************** */
 namespace NiftyReg::Cuda {
 /* *************************************************************** */
+template<bool is3d>
 void VoxelCentricToNodeCentric(const nifti_image *nodeImage,
                                const nifti_image *voxelImage,
                                float4 *nodeImageCuda,
                                float4 *voxelImageCuda,
                                float weight,
                                const mat44 *voxelToMillimetre) {
-    const bool is3d = nodeImage->nz > 1;
     const size_t nodeNumber = NiftiImage::calcVoxelNumber(nodeImage, 3);
     const size_t voxelNumber = NiftiImage::calcVoxelNumber(voxelImage, 3);
     const int3 nodeImageDims = make_int3(nodeImage->nx, nodeImage->ny, nodeImage->nz);
     const int3 voxelImageDims = make_int3(voxelImage->nx, voxelImage->ny, voxelImage->nz);
-    auto voxelImageTexture = Cuda::CreateTextureObject(voxelImageCuda, voxelNumber, cudaChannelFormatKindFloat, 4);
+    auto voxelImageTexturePtr = Cuda::CreateTextureObject(voxelImageCuda, voxelNumber, cudaChannelFormatKindFloat, 4);
+    auto voxelImageTexture = *voxelImageTexturePtr;
 
     // The transformation between the image and the grid
     mat44 transformation;
@@ -69,15 +70,12 @@ void VoxelCentricToNodeCentric(const nifti_image *nodeImage,
         weight *= ratio[i];
     }
 
-    const unsigned blocks = CudaContext::GetBlockSize()->VoxelCentricToNodeCentric;
-    const unsigned grids = (unsigned)Ceil(sqrtf((float)nodeNumber / (float)blocks));
-    const dim3 gridDims(grids, grids, 1);
-    const dim3 blockDims(blocks, 1, 1);
-    auto voxelCentricToNodeCentricKernel = is3d ? VoxelCentricToNodeCentricKernel<true> : VoxelCentricToNodeCentricKernel<false>;
-    voxelCentricToNodeCentricKernel<<<gridDims, blockDims>>>(nodeImageCuda, *voxelImageTexture, (unsigned)nodeNumber, nodeImageDims,
-                                                             voxelImageDims, weight, transformation, reorientation);
-    NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
+    thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nodeNumber, [=]__device__(const int index) {
+        VoxelCentricToNodeCentricKernel<is3d>(nodeImageCuda, voxelImageTexture, nodeImageDims, voxelImageDims, weight, transformation, reorientation, index);
+    });
 }
+template void VoxelCentricToNodeCentric<false>(const nifti_image*, const nifti_image*, float4*, float4*, float, const mat44*);
+template void VoxelCentricToNodeCentric<true>(const nifti_image*, const nifti_image*, float4*, float4*, float, const mat44*);
 /* *************************************************************** */
 void ConvertNmiGradientFromVoxelToRealSpace(const mat44 *sourceMatrixXYZ,
                                             const nifti_image *controlPointImage,
diff --git a/reg-lib/cuda/CudaTools.hpp b/reg-lib/cuda/CudaTools.hpp
index 14e68a24..8dfcbf6d 100644
--- a/reg-lib/cuda/CudaTools.hpp
+++ b/reg-lib/cuda/CudaTools.hpp
@@ -18,6 +18,7 @@
 /* *************************************************************** */
 namespace NiftyReg::Cuda {
 /* *************************************************************** */
+template<bool is3d>
 void VoxelCentricToNodeCentric(const nifti_image *nodeImage,
                                const nifti_image *voxelImage,
                                float4 *nodeImageCuda,
diff --git a/reg-lib/cuda/CudaToolsKernels.cu b/reg-lib/cuda/CudaToolsKernels.cu
index 54a415ba..fc38446e 100644
--- a/reg-lib/cuda/CudaToolsKernels.cu
+++ b/reg-lib/cuda/CudaToolsKernels.cu
@@ -14,18 +14,16 @@
 namespace NiftyReg::Cuda {
 /* *************************************************************** */
 template<bool is3d>
-__global__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda,
+__device__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda,
                                                 cudaTextureObject_t voxelImageTexture,
-                                                const unsigned nodeNumber,
                                                 const int3 nodeImageDims,
                                                 const int3 voxelImageDims,
                                                 const float weight,
                                                 const mat44 transformation,
-                                                const mat33 reorientation) {
-    const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    if (tid >= nodeNumber) return;
+                                                const mat33 reorientation,
+                                                const int index) {
     // Calculate the node coordinates
-    const auto [x, y, z] = reg_indexToDims_cuda<is3d>(tid, nodeImageDims);
+    const auto [x, y, z] = reg_indexToDims_cuda<is3d>(index, nodeImageDims);
     // Transform into voxel coordinates
     float voxelCoord[3], nodeCoord[3] = { static_cast<float>(x), static_cast<float>(y), static_cast<float>(z) };
     reg_mat44_mul_cuda<is3d>(transformation, nodeCoord, voxelCoord);
@@ -67,7 +65,7 @@ __global__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda,
 
     float reorientedValue[3];
     reg_mat33_mul_cuda<is3d>(reorientation, interpolatedValue, weight, reorientedValue);
-    nodeImageCuda[tid] = { reorientedValue[0], reorientedValue[1], reorientedValue[2], 0 };
+    nodeImageCuda[index] = { reorientedValue[0], reorientedValue[1], reorientedValue[2], 0 };
 }
 /* *************************************************************** */
 __global__ void ConvertNmiGradientFromVoxelToRealSpaceKernel(float4 *gradient, const mat44 matrix, const unsigned nodeNumber) {