From e568f9574fc5da690ec5616b2167c4f81b05a4c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Onur=20=C3=9Clgen?= <onur.ulgen@kcl.ac.uk>
Date: Tue, 5 Mar 2024 13:52:49 +0000
Subject: [PATCH] Refactor _reg_maths* and _reg_common_cuda_kernels

---
 niftyreg_build_version.txt                    |   2 +-
 reg-apps/reg_average.cpp                      |  21 +-
 reg-apps/reg_benchmark.cpp                    |   2 +-
 reg-apps/reg_gpuinfo.cpp                      |   2 +-
 reg-apps/reg_jacobian.cpp                     |   2 +-
 reg-apps/reg_resample.cpp                     |   2 +-
 reg-apps/reg_tools.cpp                        |   4 +-
 reg-apps/reg_transform.cpp                    |  35 +-
 reg-io/_reg_ReadWriteMatrix.cpp               |  10 +-
 reg-io/nrrd/reg_nrrd.cpp                      |   4 +-
 reg-lib/CMakeLists.txt                        |   3 +-
 reg-lib/Debug.hpp                             |   4 +-
 reg-lib/_reg_aladin.cpp                       |  10 +-
 reg-lib/_reg_aladin_sym.cpp                   |  11 +-
 reg-lib/_reg_base.cpp                         |   4 +-
 reg-lib/cl/ClAffineDeformationFieldKernel.cpp |   2 +-
 reg-lib/cpu/{_reg_maths.cpp => Maths.cpp}     | 559 +++++++-----------
 reg-lib/cpu/Maths.hpp                         | 363 ++++++++++++
 reg-lib/cpu/_reg_blockMatching.cpp            |  14 +-
 reg-lib/cpu/_reg_blockMatching.h              |   2 +-
 reg-lib/cpu/_reg_globalTrans.cpp              | 206 ++++---
 reg-lib/cpu/_reg_localTrans.cpp               |  77 ++-
 reg-lib/cpu/_reg_localTrans_jac.cpp           |  72 ++-
 reg-lib/cpu/_reg_localTrans_regul.cpp         |  23 +-
 reg-lib/cpu/_reg_maths.h                      | 200 -------
 reg-lib/cpu/_reg_maths_eigen.cpp              | 205 -------
 reg-lib/cpu/_reg_maths_eigen.h                |  36 --
 reg-lib/cpu/_reg_resampling.cpp               |  45 +-
 reg-lib/cpu/_reg_ssd.cpp                      |   4 +-
 reg-lib/cpu/_reg_tools.cpp                    |   8 +-
 reg-lib/cpu/_reg_tools.h                      |   2 +-
 reg-lib/cuda/CudaCommon.hpp                   |  16 +
 reg-lib/cuda/CudaGlobalTransformation.cu      |   7 +-
 reg-lib/cuda/CudaLocalTransformation.cu       |  20 +-
 .../cuda/CudaLocalTransformationKernels.cu    | 147 ++++-
 reg-lib/cuda/CudaOptimiser.cu                 |   1 -
 reg-lib/cuda/CudaResampling.cu                |  15 +-
 reg-lib/cuda/CudaTools.cu                     |  10 +-
 reg-lib/cuda/CudaToolsKernels.cu              |   8 +-
 reg-lib/cuda/_reg_common_cuda_kernels.cu      | 157 -----
 reg-lib/cuda/_reg_nmi_gpu.cu                  |   1 -
 reg-lib/cuda/affineDeformationKernel.cu       |   2 +-
 reg-lib/cuda/blockMatchingKernel.cu           |   2 +-
 reg-lib/cuda/resampleKernel.cu                |  14 +-
 reg-test/reg_test_affineDeformationField.cpp  |   6 +-
 reg-test/reg_test_be.cpp                      |   4 +-
 reg-test/reg_test_blockMatching.cpp           |   2 +-
 reg-test/reg_test_regr_lts.cpp                |   4 +-
 .../reg_test_voxelCentricToNodeCentric.cpp    |  12 +-
 49 files changed, 1026 insertions(+), 1336 deletions(-)
 rename reg-lib/cpu/{_reg_maths.cpp => Maths.cpp} (50%)
 create mode 100644 reg-lib/cpu/Maths.hpp
 delete mode 100644 reg-lib/cpu/_reg_maths.h
 delete mode 100644 reg-lib/cpu/_reg_maths_eigen.cpp
 delete mode 100644 reg-lib/cpu/_reg_maths_eigen.h
 delete mode 100644 reg-lib/cuda/_reg_common_cuda_kernels.cu

diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt
index 29aae8ee..7b53aa00 100644
--- a/niftyreg_build_version.txt
+++ b/niftyreg_build_version.txt
@@ -1 +1 @@
-418
+419
diff --git a/reg-apps/reg_average.cpp b/reg-apps/reg_average.cpp
index 0b57a922..07446e4d 100644
--- a/reg-apps/reg_average.cpp
+++ b/reg-apps/reg_average.cpp
@@ -18,7 +18,6 @@
 #include "_reg_resampling.h"
 #include "_reg_globalTrans.h"
 #include "_reg_localTrans.h"
-#include "_reg_maths_eigen.h"
 
 using PrecisionType = float;
 
@@ -112,7 +111,7 @@ mat44 compute_average_matrices(size_t matrixNumber,
    // Input matrices are logged in place
    for(size_t m=0; m<matrixNumber; ++m)
    {
-      matrices[m] = reg_mat44_logm(&matrices[m]);
+      matrices[m] = Mat44Logm(&matrices[m]);
       matrixWeight[m]=1.;
    }
    // The number of iteration to perform is defined based on the use of lts
@@ -196,7 +195,7 @@ mat44 compute_average_matrices(size_t matrixNumber,
             matrixIndexSorted[m]=m;
          }
          // Sort the computed distances
-         reg_heapSort(matrixWeight, matrixIndexSorted, matrixNumber);
+         HeapSort(matrixWeight, matrixIndexSorted, matrixNumber);
          // Re-assign the weights for the next iteration
          memset(matrixWeight, 0, matrixNumber*sizeof(float));
          for(size_t m=matrixNumber-1; m>lts_inlier * matrixNumber; --m)
@@ -205,7 +204,7 @@ mat44 compute_average_matrices(size_t matrixNumber,
          }
       }
       // The average matrix is exponentiated
-      average_matrix = reg_mat44_expm(&average_matrix);
+      average_matrix = Mat44Expm(&average_matrix);
    } // iteration number
    // Free the allocated array
    free(matrixWeight);
@@ -230,15 +229,15 @@ mat44 compute_affine_demean(size_t matrixNumber,
       tempMatrix=nifti_quatern_to_mat44(qb,qc,qd,qx,qy,qz,1.f,1.f,1.f,qfac);
       // remove the rigid componenent from the affine matrix
       tempMatrix=nifti_mat44_inverse(tempMatrix);
-      tempMatrix=reg_mat44_mul(&tempMatrix,&current_affine);
+      tempMatrix=tempMatrix * current_affine;
       // sum up all the affine matrices
-      tempMatrix = reg_mat44_logm(&tempMatrix);
+      tempMatrix = Mat44Logm(&tempMatrix);
       demeanMatrix = demeanMatrix + tempMatrix;
    }
    // The average matrix is normalised
-   demeanMatrix = reg_mat44_mul(&demeanMatrix,1.f/(float)matrixNumber);
+   demeanMatrix = demeanMatrix * (1.f / (float)matrixNumber);
    // The average matrix is exponentiated
-   demeanMatrix = reg_mat44_expm(&demeanMatrix);
+   demeanMatrix = Mat44Expm(&demeanMatrix);
    // The average matrix is inverted
    demeanMatrix = nifti_mat44_inverse(demeanMatrix);
    return demeanMatrix;
@@ -293,11 +292,7 @@ int compute_nrr_demean(nifti_image *demean_field,
             affineTransformation=*reinterpret_cast<mat44 *>(transformation->ext_list[0].edata);
             // Note that if the transformation is a flow field, only half-of the affine has be used
             if(transformation->num_ext>1 && deformationField->intent_p1!=DEF_VEL_FIELD)
-            {
-               affineTransformation=reg_mat44_mul(
-                                       reinterpret_cast<mat44 *>(transformation->ext_list[1].edata),
-                                       &affineTransformation);
-            }
+               affineTransformation=reinterpret_cast<mat44&>(*transformation->ext_list[1].edata) * affineTransformation;
          }
          else reg_tool_ReadAffineFile(&affineTransformation,inputAffName[t]);
          // The affine component is substracted
diff --git a/reg-apps/reg_benchmark.cpp b/reg-apps/reg_benchmark.cpp
index 828b050e..dd439f62 100644
--- a/reg-apps/reg_benchmark.cpp
+++ b/reg-apps/reg_benchmark.cpp
@@ -153,7 +153,7 @@ int main(int argc, char **argv)
    originIndex[0] = -1.0f;
    originIndex[1] = -1.0f;
    originIndex[2] = -1.0f;
-   reg_mat44_mul(&(controlPointImage->qto_xyz), originIndex, originReal);
+   Mat44Mul(controlPointImage->qto_xyz, originIndex, originReal);
    controlPointImage->qto_xyz.m[0][3] = controlPointImage->qoffset_x = originReal[0];
    controlPointImage->qto_xyz.m[1][3] = controlPointImage->qoffset_y = originReal[1];
    controlPointImage->qto_xyz.m[2][3] = controlPointImage->qoffset_z = originReal[2];
diff --git a/reg-apps/reg_gpuinfo.cpp b/reg-apps/reg_gpuinfo.cpp
index d4858ead..3f051047 100644
--- a/reg-apps/reg_gpuinfo.cpp
+++ b/reg-apps/reg_gpuinfo.cpp
@@ -1,4 +1,4 @@
-#include "_reg_maths.h"
+#include "Maths.hpp"
 #include "Platform.h"
 
 #ifdef USE_CUDA
diff --git a/reg-apps/reg_jacobian.cpp b/reg-apps/reg_jacobian.cpp
index 27b517bf..06507407 100644
--- a/reg-apps/reg_jacobian.cpp
+++ b/reg-apps/reg_jacobian.cpp
@@ -233,7 +233,7 @@ int main(int argc, char **argv)
       if(!reg_isAnImageFileName(param->inputTransName)){
          mat44 *affineTransformation=(mat44 *)malloc(sizeof(mat44));
          reg_tool_ReadAffineFile(affineTransformation,param->inputTransName);
-         NR_COUT << reg_mat44_det<double>(affineTransformation) << std::endl;
+         NR_COUT << Mat44Det<double>(affineTransformation) << std::endl;
          return EXIT_SUCCESS;
       }
 
diff --git a/reg-apps/reg_resample.cpp b/reg-apps/reg_resample.cpp
index e2fe543d..9ab79df2 100755
--- a/reg-apps/reg_resample.cpp
+++ b/reg-apps/reg_resample.cpp
@@ -307,7 +307,7 @@ int main(int argc, char **argv)
    else
    {
       // No transformation is specified, an identity transformation is used
-      reg_mat44_eye(&inputAffineTransformation);
+      Mat44Eye(&inputAffineTransformation);
    }
 
    // Create a deformation field
diff --git a/reg-apps/reg_tools.cpp b/reg-apps/reg_tools.cpp
index 70ff5741..76b55ba5 100755
--- a/reg-apps/reg_tools.cpp
+++ b/reg-apps/reg_tools.cpp
@@ -498,7 +498,7 @@ int main(int argc, char **argv)
     {
         reg_tools_changeDatatype<float>(image);
         nifti_image *normImage = nifti_dup(*image);
-        reg_heapSort(static_cast<float *>(normImage->data), normImage->nvox);
+        HeapSort(static_cast<float *>(normImage->data), normImage->nvox);
         float minValue = static_cast<float *>(normImage->data)[Floor(03*(int)normImage->nvox/100)];
         float maxValue = static_cast<float *>(normImage->data)[Floor(97*(int)normImage->nvox/100)];
         reg_tools_subtractValueFromImage(image,normImage,minValue);
@@ -892,7 +892,7 @@ int main(int argc, char **argv)
         const size_t jacobianVoxelNumber = NiftiImage::calcVoxelNumber(def, 3);
         mat33 *jacobian = (mat33 *)malloc(jacobianVoxelNumber * sizeof(mat33));
         for (size_t i = 0; i < jacobianVoxelNumber; ++i)
-            reg_mat33_eye(&jacobian[i]);
+            Mat33Eye(&jacobian[i]);
         // resample the original image into the space of the new image
         if(flag->interpFlag == 0){
             param->interpOrder = 3;
diff --git a/reg-apps/reg_transform.cpp b/reg-apps/reg_transform.cpp
index 4cf0bfe5..ba427d31 100755
--- a/reg-apps/reg_transform.cpp
+++ b/reg-apps/reg_transform.cpp
@@ -16,7 +16,6 @@
 #include "_reg_globalTrans.h"
 #include "_reg_localTrans.h"
 #include "_reg_tools.h"
-#include "_reg_maths_eigen.h"
 
 #include "reg_transform.h"
 
@@ -532,7 +531,7 @@ int main(int argc, char **argv) {
         if (affine1Trans != nullptr && affine2Trans != nullptr) {
             NR_INFO("Transformation 2 is an affine parametrisation:");
             NR_INFO(param->input2TransName);
-            *affine1Trans = reg_mat44_mul(affine2Trans, affine1Trans);
+            *affine1Trans = *affine2Trans * *affine1Trans;
             reg_tool_WriteAffineFile(affine1Trans, param->outputTransName);
         } else {
             // Check if the reference image is required
@@ -955,10 +954,10 @@ int main(int argc, char **argv) {
 
         // Update the sform
         if (image->sform_code > 0) {
-            image->sto_xyz = reg_mat44_mul(affineTransformation, &(image->sto_xyz));
+            image->sto_xyz = *affineTransformation * image->sto_xyz;
         } else {
             image->sform_code = 1;
-            image->sto_xyz = reg_mat44_mul(affineTransformation, &(image->qto_xyz));
+            image->sto_xyz = *affineTransformation * image->qto_xyz;
         }
         image->sto_ijk = nifti_mat44_inverse(image->sto_xyz);
 
@@ -980,9 +979,9 @@ int main(int argc, char **argv) {
             affineTrans = (mat44 *)malloc(sizeof(mat44));
             reg_tool_ReadAffineFile(affineTrans, param->inputTransName);
             // The affine transformation is halfed
-            *affineTrans = reg_mat44_logm(affineTrans);
-            *affineTrans = reg_mat44_mul(affineTrans, 0.5);
-            *affineTrans = reg_mat44_expm(affineTrans);
+            *affineTrans = Mat44Logm(affineTrans);
+            *affineTrans = *affineTrans * 0.5;
+            *affineTrans = Mat44Expm(affineTrans);
             // The affine transformation is saved
             reg_tool_WriteAffineFile(affineTrans, param->outputTransName);
         } else {
@@ -1183,17 +1182,17 @@ int main(int argc, char **argv) {
     if (flag->makeAffFlag) {
         // Create all the required matrices
         mat44 rotationX;
-        reg_mat44_eye(&rotationX);
+        Mat44Eye(&rotationX);
         mat44 translation;
-        reg_mat44_eye(&translation);
+        Mat44Eye(&translation);
         mat44 rotationY;
-        reg_mat44_eye(&rotationY);
+        Mat44Eye(&rotationY);
         mat44 rotationZ;
-        reg_mat44_eye(&rotationZ);
+        Mat44Eye(&rotationZ);
         mat44 scaling;
-        reg_mat44_eye(&scaling);
+        Mat44Eye(&scaling);
         mat44 shearing;
-        reg_mat44_eye(&shearing);
+        Mat44Eye(&shearing);
         // Set up the rotation matrix along the YZ plane
         rotationX.m[1][1] = cosf(param->affTransParam[0]);
         rotationX.m[1][2] = -sinf(param->affTransParam[0]);
@@ -1222,11 +1221,11 @@ int main(int argc, char **argv) {
         shearing.m[2][0] = param->affTransParam[10];
         shearing.m[2][1] = param->affTransParam[11];
         // Combine all the transformations
-        mat44 affine = reg_mat44_mul(&rotationY, &rotationZ);
-        affine = reg_mat44_mul(&rotationX, &affine);
-        affine = reg_mat44_mul(&scaling, &affine);
-        affine = reg_mat44_mul(&shearing, &affine);
-        affine = reg_mat44_mul(&translation, &affine);
+        mat44 affine = rotationY * rotationZ;
+        affine = rotationX * affine;
+        affine = scaling * affine;
+        affine = shearing * affine;
+        affine = translation * affine;
         // Save the new matrix
         reg_tool_WriteAffineFile(&affine, param->outputTransName);
     }
diff --git a/reg-io/_reg_ReadWriteMatrix.cpp b/reg-io/_reg_ReadWriteMatrix.cpp
index 8b399680..e37fc34c 100644
--- a/reg-io/_reg_ReadWriteMatrix.cpp
+++ b/reg-io/_reg_ReadWriteMatrix.cpp
@@ -70,11 +70,11 @@ void reg_tool_ReadAffineFile(mat44 *mat,
         absoluteFloating = nifti_mat44_inverse(absoluteFloating);
         *mat = nifti_mat44_inverse(*mat);
 
-        *mat = reg_mat44_mul(&absoluteFloating, mat);
-        *mat = reg_mat44_mul(mat, &absoluteReference);
-        *mat = reg_mat44_mul(floatingMatrix, mat);
+        *mat = absoluteFloating * *mat;
+        *mat = *mat * absoluteReference;
+        *mat = *floatingMatrix * *mat;
         mat44 tmp = nifti_mat44_inverse(*referenceMatrix);
-        *mat = reg_mat44_mul(mat, &tmp);
+        *mat = *mat * tmp;
     }
 
     NR_MAT44_DEBUG(*mat, "Affine matrix");
@@ -168,7 +168,7 @@ template<class T>
 T** reg_tool_ReadMatrixFile(char *filename, size_t nbLine, size_t nbColumn) {
     //THEN CONSTRUCT THE MATRIX
     // Allocate the matrices
-    T** mat = reg_matrix2DAllocate<T>(nbLine, nbColumn);
+    T** mat = Matrix2dAlloc<T>(nbLine, nbColumn);
     //STORE THE VALUES
     std::string line;
     std::ifstream matrixFile(filename);
diff --git a/reg-io/nrrd/reg_nrrd.cpp b/reg-io/nrrd/reg_nrrd.cpp
index 225d6f11..9462a634 100644
--- a/reg-io/nrrd/reg_nrrd.cpp
+++ b/reg-io/nrrd/reg_nrrd.cpp
@@ -167,7 +167,7 @@ nifti_image *reg_io_nrdd2nifti(Nrrd *nrrdImage)
 
    // The space orientation is extracted and converted into a matrix
    mat44 qform_orientation_matrix;
-   reg_mat44_eye(&qform_orientation_matrix);
+   Mat44Eye(&qform_orientation_matrix);
    if(nrrdImage->space==nrrdSpaceRightAnteriorSuperior ||
          nrrdImage->space==nrrdSpaceRightAnteriorSuperiorTime ||
          nrrdImage->space==nrrdSpace3DRightHanded ||
@@ -251,7 +251,7 @@ nifti_image *reg_io_nrdd2nifti(Nrrd *nrrdImage)
    if(nrrdImage->axis[1].spaceDirection[0]!=std::numeric_limits<double>::quiet_NaN())
    {
       niiImage->sform_code=1;
-      reg_mat44_eye(&niiImage->sto_xyz);
+      Mat44Eye(&niiImage->sto_xyz);
       for(int i=0; i<(niiImage->ndim<3?niiImage->ndim:3); ++i)
       {
          for(int j=0; j<(niiImage->ndim<3?niiImage->ndim:3); ++j)
diff --git a/reg-lib/CMakeLists.txt b/reg-lib/CMakeLists.txt
index ac7a34b1..039e26ee 100755
--- a/reg-lib/CMakeLists.txt
+++ b/reg-lib/CMakeLists.txt
@@ -12,8 +12,7 @@ endif(USE_OPENCL)
 ##BUILD THE CPU LIBRARIES
 #-----------------------------------------------------------------------------
 add_library(_reg_maths ${NIFTYREG_LIBRARY_TYPE}
-  cpu/_reg_maths.cpp
-  cpu/_reg_maths_eigen.cpp
+  cpu/Maths.cpp
 )
 install(TARGETS _reg_maths
   RUNTIME DESTINATION bin
diff --git a/reg-lib/Debug.hpp b/reg-lib/Debug.hpp
index 93e452eb..826b13ed 100644
--- a/reg-lib/Debug.hpp
+++ b/reg-lib/Debug.hpp
@@ -68,11 +68,11 @@ inline std::string StripFunctionName(const std::string& funcName) {
 #define NR_INFO(msg)        NR_COUT << "[NiftyReg INFO] " << msg << std::endl
 /* *************************************************************** */
 #ifndef NDEBUG
-#define NR_MAT44(mat, title)          reg_mat44_disp(mat, "[NiftyReg DEBUG] "s + (title))
+#define NR_MAT44(mat, title)          Mat44Disp(mat, "[NiftyReg DEBUG] "s + (title))
 #define NR_MAT44_DEBUG(mat, title)    NR_MAT44(mat, title)
 #define NR_MAT44_VERBOSE(mat, title)  NR_MAT44(mat, title)
 #else
-#define NR_MAT44(mat, title)          reg_mat44_disp(mat, title)
+#define NR_MAT44(mat, title)          Mat44Disp(mat, title)
 #define NR_MAT44_DEBUG(mat, title)
 #define NR_MAT44_VERBOSE(mat, title)  if (this->verbose) NR_MAT44(mat, "[NiftyReg INFO] "s + (title))
 #endif
diff --git a/reg-lib/_reg_aladin.cpp b/reg-lib/_reg_aladin.cpp
index 35b5a2dd..032aeb97 100644
--- a/reg-lib/_reg_aladin.cpp
+++ b/reg-lib/_reg_aladin.cpp
@@ -177,9 +177,9 @@ void reg_aladin<T>::InitialiseRegistration() {
             referenceCenter[2] = (float)(this->inputReference->nz) / 2.0f;
             //From pixel coordinates to real coordinates
             float floatingRealPosition[3];
-            reg_mat44_mul(floatingMatrix, floatingCenter, floatingRealPosition);
+            Mat44Mul(*floatingMatrix, floatingCenter, floatingRealPosition);
             float referenceRealPosition[3];
-            reg_mat44_mul(referenceMatrix, referenceCenter, referenceRealPosition);
+            Mat44Mul(*referenceMatrix, referenceCenter, referenceRealPosition);
             //Set translation to the transformation matrix
             this->affineTransformation->m[0][3] = floatingRealPosition[0] - referenceRealPosition[0];
             this->affineTransformation->m[1][3] = floatingRealPosition[1] - referenceRealPosition[1];
@@ -207,7 +207,7 @@ void reg_aladin<T>::InitialiseRegistration() {
             referenceCentre[2] /= referenceCount;
             float refCOM[3]{};
             if (this->inputReference->sform_code > 0)
-                reg_mat44_mul(&this->inputReference->sto_xyz, referenceCentre, refCOM);
+                Mat44Mul(this->inputReference->sto_xyz, referenceCentre, refCOM);
 
             float floatingCentre[3] = { 0, 0, 0 };
             float floatingCount = 0;
@@ -231,8 +231,8 @@ void reg_aladin<T>::InitialiseRegistration() {
             floatingCentre[2] /= floatingCount;
             float floCOM[3]{};
             if (this->inputFloating->sform_code > 0)
-                reg_mat44_mul(&this->inputFloating->sto_xyz, floatingCentre, floCOM);
-            reg_mat44_eye(this->affineTransformation.get());
+                Mat44Mul(this->inputFloating->sto_xyz, floatingCentre, floCOM);
+            Mat44Eye(this->affineTransformation.get());
             this->affineTransformation->m[0][3] = floCOM[0] - refCOM[0];
             this->affineTransformation->m[1][3] = floCOM[1] - refCOM[1];
             this->affineTransformation->m[2][3] = floCOM[2] - refCOM[2];
diff --git a/reg-lib/_reg_aladin_sym.cpp b/reg-lib/_reg_aladin_sym.cpp
index 610405bd..62cdd753 100644
--- a/reg-lib/_reg_aladin_sym.cpp
+++ b/reg-lib/_reg_aladin_sym.cpp
@@ -1,5 +1,4 @@
 #include "_reg_aladin_sym.h"
-#include "_reg_maths_eigen.h"
 
 /* *************************************************************** */
 template <class T>
@@ -81,7 +80,7 @@ void reg_aladin_sym<T>::InitialiseRegistration() {
         referenceCentre[2] /= referenceCount;
         float refCOG[3]{};
         if (this->inputReference->sform_code > 0)
-            reg_mat44_mul(&(this->inputReference->sto_xyz), referenceCentre, refCOG);
+            Mat44Mul(this->inputReference->sto_xyz, referenceCentre, refCOG);
 
         float floatingCentre[3] = { 0, 0, 0 };
         float floatingCount = 0;
@@ -106,8 +105,8 @@ void reg_aladin_sym<T>::InitialiseRegistration() {
         floatingCentre[2] /= floatingCount;
         float floCOG[3]{};
         if (this->inputFloating->sform_code > 0)
-            reg_mat44_mul(&(this->inputFloating->sto_xyz), floatingCentre, floCOG);
-        reg_mat44_eye(this->affineTransformation.get());
+            Mat44Mul(this->inputFloating->sto_xyz, floatingCentre, floCOG);
+        Mat44Eye(this->affineTransformation.get());
         this->affineTransformation->m[0][3] = floCOG[0] - refCOG[0];
         this->affineTransformation->m[1][3] = floCOG[1] - refCOG[1];
         this->affineTransformation->m[2][3] = floCOG[2] - refCOG[2];
@@ -143,9 +142,9 @@ void reg_aladin_sym<T>::UpdateTransformationMatrix(int type) {
     mat44 bInverted = nifti_mat44_inverse(*this->affineTransformationBw);
 
     // We average the forward and inverted backward matrix
-    *this->affineTransformation = reg_mat44_avg2(this->affineTransformation.get(), &bInverted);
+    *this->affineTransformation = Mat44Avg2(this->affineTransformation.get(), &bInverted);
     // We average the inverted forward and backward matrix
-    *this->affineTransformationBw = reg_mat44_avg2(&fInverted, this->affineTransformationBw.get());
+    *this->affineTransformationBw = Mat44Avg2(&fInverted, this->affineTransformationBw.get());
     for (int i = 0; i < 3; ++i) {
         this->affineTransformation->m[3][i] = 0.f;
         this->affineTransformationBw->m[3][i] = 0.f;
diff --git a/reg-lib/_reg_base.cpp b/reg-lib/_reg_base.cpp
index cc5e8be5..4eb441ef 100644
--- a/reg-lib/_reg_base.cpp
+++ b/reg-lib/_reg_base.cpp
@@ -402,7 +402,7 @@ void reg_base<T>::Initialise() {
         reg_tools_changeDatatype<T>(tmpReference);
         // Extract the robust range of the reference image
         T *refDataPtr = static_cast<T *>(tmpReference->data);
-        reg_heapSort(refDataPtr, tmpReference->nvox);
+        HeapSort(refDataPtr, tmpReference->nvox);
         // Update the reference threshold values if no value has been setup by the user
         if (referenceThresholdLow[0] == std::numeric_limits<T>::lowest())
             referenceThresholdLow[0] = refDataPtr[Round((float)tmpReference->nvox * 0.02f)];
@@ -414,7 +414,7 @@ void reg_base<T>::Initialise() {
         reg_tools_changeDatatype<T>(tmpFloating);
         // Extract the robust range of the floating image
         T *floDataPtr = static_cast<T *>(tmpFloating->data);
-        reg_heapSort(floDataPtr, tmpFloating->nvox);
+        HeapSort(floDataPtr, tmpFloating->nvox);
         // Update the floating threshold values if no value has been setup by the user
         if (floatingThresholdLow[0] == std::numeric_limits<T>::lowest())
             floatingThresholdLow[0] = floDataPtr[Round((float)tmpFloating->nvox * 0.02f)];
diff --git a/reg-lib/cl/ClAffineDeformationFieldKernel.cpp b/reg-lib/cl/ClAffineDeformationFieldKernel.cpp
index 073fcaa6..8314a51f 100644
--- a/reg-lib/cl/ClAffineDeformationFieldKernel.cpp
+++ b/reg-lib/cl/ClAffineDeformationFieldKernel.cpp
@@ -89,7 +89,7 @@ void ClAffineDeformationFieldKernel::Calculate(bool compose) {
     const size_t globalWorkSize[dims] = {xBlocks * xThreads, yBlocks * yThreads, zBlocks * zThreads};
     const size_t localWorkSize[dims] = {xThreads, yThreads, zThreads};
 
-    mat44 transformationMatrix = compose ? *affineTransformation : reg_mat44_mul(affineTransformation, referenceMatrix);
+    mat44 transformationMatrix = compose ? *affineTransformation : *affineTransformation * *referenceMatrix;
 
     float* trans = (float *)malloc(16 * sizeof(float));
     mat44ToCptr(transformationMatrix, trans);
diff --git a/reg-lib/cpu/_reg_maths.cpp b/reg-lib/cpu/Maths.cpp
similarity index 50%
rename from reg-lib/cpu/_reg_maths.cpp
rename to reg-lib/cpu/Maths.cpp
index 19ed9210..6abb2f72 100644
--- a/reg-lib/cpu/_reg_maths.cpp
+++ b/reg-lib/cpu/Maths.cpp
@@ -1,86 +1,29 @@
-#include "_reg_tools.h"
+#define USE_EIGEN
 
-#define mat(i,j,dim) mat[i*dim+j]
+#include "_reg_tools.h"
+#include "Eigen/Core"
+#include "unsupported/Eigen/MatrixFunctions"
 
 /* *************************************************************** */
-template<class T>
-T* reg_matrix1DAllocate(size_t arraySize) {
-    T* res = (T*)malloc(arraySize * sizeof(T));
-    return res;
-}
-template bool* reg_matrix1DAllocate<bool>(size_t arraySize);
-template float* reg_matrix1DAllocate<float>(size_t arraySize);
-template double* reg_matrix1DAllocate<double>(size_t arraySize);
-/* *************************************************************** */
-template<class T>
-void reg_matrix1DDeallocate(T* mat) {
-    free(mat);
-}
-template void reg_matrix1DDeallocate<bool>(bool* mat);
-template void reg_matrix1DDeallocate<float>(float* mat);
-template void reg_matrix1DDeallocate<double>(double* mat);
-/* *************************************************************** */
-template<class T>
-T** reg_matrix2DAllocate(size_t arraySizeX, size_t arraySizeY) {
-    T** res;
-    res = (T**)malloc(arraySizeX * sizeof(T*));
-    for (size_t i = 0; i < arraySizeX; i++) {
-        res[i] = (T*)malloc(arraySizeY * sizeof(T));
-    }
-    return res;
-}
-template float** reg_matrix2DAllocate<float>(size_t arraySizeX, size_t arraySizeY);
-template double** reg_matrix2DAllocate<double>(size_t arraySizeX, size_t arraySizeY);
+namespace NiftyReg {
 /* *************************************************************** */
 template<class T>
-void reg_matrix2DDeallocate(size_t arraySizeX, T** mat) {
-    for (size_t i = 0; i < arraySizeX; i++) {
-        free(mat[i]);
-    }
-    free(mat);
-}
-template void reg_matrix2DDeallocate<float>(size_t arraySizeX, float** mat);
-template void reg_matrix2DDeallocate<double>(size_t arraySizeX, double** mat);
-/* *************************************************************** */
-template<class T>
-T** reg_matrix2DTranspose(T** mat, size_t arraySizeX, size_t arraySizeY) {
-    T** res;
-    res = (T**)malloc(arraySizeY * sizeof(T*));
-    for (size_t i = 0; i < arraySizeY; i++) {
-        res[i] = (T*)malloc(arraySizeX * sizeof(T));
-    }
-    for (size_t i = 0; i < arraySizeX; i++) {
-        for (size_t j = 0; j < arraySizeY; j++) {
-            res[j][i] = mat[i][j];
-        }
-    }
-    return res;
-}
-template float** reg_matrix2DTranspose<float>(float** mat, size_t arraySizeX, size_t arraySizeY);
-template double** reg_matrix2DTranspose<double>(double** mat, size_t arraySizeX, size_t arraySizeY);
-/* *************************************************************** */
-template<class T>
-T** reg_matrix2DMultiply(T** mat1, size_t mat1X, size_t mat1Y, T** mat2, size_t mat2X, size_t mat2Y, bool transposeMat2) {
+T** Matrix2dMultiply(T **mat1, const size_t mat1X, const size_t mat1Y, T **mat2, const size_t mat2X, const size_t mat2Y, const bool transposeMat2) {
     if (transposeMat2 == false) {
         // First check that the dimension are appropriate
         if (mat1Y != mat2X)
             NR_FATAL_ERROR("Matrices can not be multiplied due to their size: [" + std::to_string(mat1X) + " " +
                            std::to_string(mat1Y) + "] [" + std::to_string(mat2X) + " " + std::to_string(mat2Y) + "]");
 
-        size_t nbElement = mat1Y;
-        double resTemp = 0;
-        T** res = reg_matrix2DAllocate<T>(mat1X, mat2Y);
-
+        T **res = Matrix2dAlloc<T>(mat1X, mat2Y);
         for (size_t i = 0; i < mat1X; i++) {
             for (size_t j = 0; j < mat2Y; j++) {
-                resTemp = 0;
-                for (size_t k = 0; k < nbElement; k++) {
+                double resTemp = 0;
+                for (size_t k = 0; k < mat1Y; k++)
                     resTemp += static_cast<double>(mat1[i][k]) * static_cast<double>(mat2[k][j]);
-                }
                 res[i][j] = static_cast<T>(resTemp);
             }
         }
-        //Output
         return res;
     } else {
         // First check that the dimension are appropriate
@@ -88,43 +31,34 @@ T** reg_matrix2DMultiply(T** mat1, size_t mat1X, size_t mat1Y, T** mat2, size_t
             NR_FATAL_ERROR("Matrices can not be multiplied due to their size: [" + std::to_string(mat1X) + " " +
                            std::to_string(mat1Y) + "] [" + std::to_string(mat2Y) + " " + std::to_string(mat2X) + "]");
 
-        size_t nbElement = mat1Y;
-        double resTemp = 0;
-        T** res = reg_matrix2DAllocate<T>(mat1X, mat2X);
-
+        T **res = Matrix2dAlloc<T>(mat1X, mat2X);
         for (size_t i = 0; i < mat1X; i++) {
             for (size_t j = 0; j < mat2X; j++) {
-                resTemp = 0;
-                for (size_t k = 0; k < nbElement; k++) {
+                double resTemp = 0;
+                for (size_t k = 0; k < mat1Y; k++)
                     resTemp += static_cast<double>(mat1[i][k]) * static_cast<double>(mat2[j][k]);
-                }
                 res[i][j] = static_cast<T>(resTemp);
             }
         }
-        //Output
         return res;
     }
 }
-template float** reg_matrix2DMultiply<float>(float** mat1, size_t mat1X, size_t mat1Y, float** mat2, size_t mat2X, size_t mat2Y, bool transposeMat2);
-template double** reg_matrix2DMultiply<double>(double** mat1, size_t mat1X, size_t mat1Y, double** mat2, size_t mat2X, size_t mat2Y, bool transposeMat2);
+template float** Matrix2dMultiply<float>(float** mat1, const size_t mat1X, const size_t mat1Y, float** mat2, const size_t mat2X, const size_t mat2Y, const bool transposeMat2);
+template double** Matrix2dMultiply<double>(double** mat1, const size_t mat1X, const size_t mat1Y, double** mat2, const size_t mat2X, const size_t mat2Y, const bool transposeMat2);
 /* *************************************************************** */
 template<class T>
-void reg_matrix2DMultiply(T** mat1, size_t mat1X, size_t mat1Y, T** mat2, size_t mat2X, size_t mat2Y, T** resT, bool transposeMat2) {
+void Matrix2dMultiply(T **mat1, const size_t mat1X, const size_t mat1Y, T **mat2, const size_t mat2X, const size_t mat2Y, T **resT, const bool transposeMat2) {
     if (transposeMat2 == false) {
         // First check that the dimension are appropriate
         if (mat1Y != mat2X)
             NR_FATAL_ERROR("Matrices can not be multiplied due to their size: [" + std::to_string(mat1X) + " " +
                            std::to_string(mat1Y) + "] [" + std::to_string(mat2X) + " " + std::to_string(mat2Y) + "]");
 
-        size_t nbElement = mat1Y;
-        double resTemp;
-
         for (size_t i = 0; i < mat1X; i++) {
             for (size_t j = 0; j < mat2Y; j++) {
-                resTemp = 0;
-                for (size_t k = 0; k < nbElement; k++) {
+                double resTemp = 0;
+                for (size_t k = 0; k < mat1Y; k++)
                     resTemp += static_cast<double>(mat1[i][k]) * static_cast<double>(mat2[k][j]);
-                }
                 resT[i][j] = static_cast<T>(resTemp);
             }
         }
@@ -134,27 +68,23 @@ void reg_matrix2DMultiply(T** mat1, size_t mat1X, size_t mat1Y, T** mat2, size_t
             NR_FATAL_ERROR("Matrices can not be multiplied due to their size: [" + std::to_string(mat1X) + " " +
                            std::to_string(mat1Y) + "] [" + std::to_string(mat2Y) + " " + std::to_string(mat2X) + "]");
 
-        size_t nbElement = mat1Y;
-        double resTemp;
-
         for (size_t i = 0; i < mat1X; i++) {
             for (size_t j = 0; j < mat2X; j++) {
-                resTemp = 0;
-                for (size_t k = 0; k < nbElement; k++) {
+                double resTemp = 0;
+                for (size_t k = 0; k < mat1Y; k++)
                     resTemp += static_cast<double>(mat1[i][k]) * static_cast<double>(mat2[j][k]);
-                }
                 resT[i][j] = static_cast<T>(resTemp);
             }
         }
     }
 }
-template void reg_matrix2DMultiply<float>(float** mat1, size_t mat1X, size_t mat1Y, float** mat2, size_t mat2X, size_t mat2Y, float** resT, bool transposeMat2);
-template void reg_matrix2DMultiply<double>(double** mat1, size_t mat1X, size_t mat1Y, double** mat2, size_t mat2X, size_t mat2Y, double** resT, bool transposeMat2);
+template void Matrix2dMultiply<float>(float** mat1, const size_t mat1X, const size_t mat1Y, float** mat2, const size_t mat2X, const size_t mat2Y, float** resT, const bool transposeMat2);
+template void Matrix2dMultiply<double>(double** mat1, const size_t mat1X, const size_t mat1Y, double** mat2, const size_t mat2X, const size_t mat2Y, double** resT, const bool transposeMat2);
 /* *************************************************************** */
 // Multiply a matrix with a vector - we assume correct dimension
 template<class T>
-T* reg_matrix2DVectorMultiply(T** mat, size_t m, size_t n, T* vect) {
-    T* res = reg_matrix1DAllocate<T>(m);
+T* Matrix2dVectorMultiply(T **mat, const size_t m, const size_t n, T* vect) {
+    T* res = Matrix1dAlloc<T>(m);
     for (size_t i = 0; i < m; i++) {
         double resTemp = 0;
         for (size_t k = 0; k < n; k++) {
@@ -164,11 +94,11 @@ T* reg_matrix2DVectorMultiply(T** mat, size_t m, size_t n, T* vect) {
     }
     return res;
 }
-template float* reg_matrix2DVectorMultiply<float>(float** mat, size_t m, size_t n, float* vect);
-template double* reg_matrix2DVectorMultiply<double>(double** mat, size_t m, size_t n, double* vect);
+template float* Matrix2dVectorMultiply<float>(float** mat, const size_t m, const size_t n, float* vect);
+template double* Matrix2dVectorMultiply<double>(double** mat, const size_t m, const size_t n, double* vect);
 /* *************************************************************** */
 template<class T>
-void reg_matrix2DVectorMultiply(T** mat, size_t m, size_t n, T* vect, T* res) {
+void Matrix2dVectorMultiply(T **mat, const size_t m, const size_t n, T* vect, T* res) {
     for (size_t i = 0; i < m; i++) {
         double resTemp = 0;
         for (size_t k = 0; k < n; k++) {
@@ -177,11 +107,11 @@ void reg_matrix2DVectorMultiply(T** mat, size_t m, size_t n, T* vect, T* res) {
         res[i] = static_cast<T>(resTemp);
     }
 }
-template void reg_matrix2DVectorMultiply<float>(float** mat, size_t m, size_t n, float* vect, float* res);
-template void reg_matrix2DVectorMultiply<double>(double** mat, size_t m, size_t n, double* vect, double* res);
+template void Matrix2dVectorMultiply<float>(float** mat, const size_t m, const size_t n, float* vect, float* res);
+template void Matrix2dVectorMultiply<double>(double** mat, const size_t m, const size_t n, double* vect, double* res);
 /* *************************************************************** */
 // Heap sort
-void reg_heapSort(float *array_tmp, int *index_tmp, int blockNum) {
+void HeapSort(float *array_tmp, int *index_tmp, int blockNum) {
     float *array = &array_tmp[-1];
     int *index = &index_tmp[-1];
     int l = (blockNum >> 1) + 1;
@@ -223,7 +153,7 @@ void reg_heapSort(float *array_tmp, int *index_tmp, int blockNum) {
 /* *************************************************************** */
 // Heap sort
 template<class DataType>
-void reg_heapSort(DataType *array_tmp, int blockNum) {
+void HeapSort(DataType *array_tmp, int blockNum) {
     DataType *array = &array_tmp[-1];
     int l = (blockNum >> 1) + 1;
     int ir = blockNum;
@@ -254,31 +184,11 @@ void reg_heapSort(DataType *array_tmp, int blockNum) {
         array[i] = val;
     }
 }
-template void reg_heapSort<float>(float *array_tmp, int blockNum);
-template void reg_heapSort<double>(double *array_tmp, int blockNum);
-/* *************************************************************** */
-bool operator==(mat44 A, mat44 B) {
-    for (unsigned i = 0; i < 4; ++i) {
-        for (unsigned j = 0; j < 4; ++j) {
-            if (A.m[i][j] != B.m[i][j])
-                return false;
-        }
-    }
-    return true;
-}
-/* *************************************************************** */
-bool operator!=(mat44 A, mat44 B) {
-    for (unsigned i = 0; i < 4; ++i) {
-        for (unsigned j = 0; j < 4; ++j) {
-            if (A.m[i][j] != B.m[i][j])
-                return true;
-        }
-    }
-    return false;
-}
+template void HeapSort<float>(float *array_tmp, int blockNum);
+template void HeapSort<double>(double *array_tmp, int blockNum);
 /* *************************************************************** */
 template<class T>
-T reg_mat44_det(mat44 const* A) {
+T Mat44Det(const mat44 *A) {
     double D =
         static_cast<double>(A->m[0][0]) * static_cast<double>(A->m[1][1]) * static_cast<double>(A->m[2][2]) * static_cast<double>(A->m[3][3])
         - static_cast<double>(A->m[0][0]) * static_cast<double>(A->m[1][1]) * static_cast<double>(A->m[3][2]) * static_cast<double>(A->m[2][3])
@@ -306,129 +216,10 @@ T reg_mat44_det(mat44 const* A) {
         + static_cast<double>(A->m[3][0]) * static_cast<double>(A->m[2][1]) * static_cast<double>(A->m[1][2]) * static_cast<double>(A->m[0][3]);
     return static_cast<T>(D);
 }
-template float reg_mat44_det<float>(mat44 const* A);
-template double reg_mat44_det<double>(mat44 const* A);
-/* *************************************************************** */
-void reg_mat33_to_nan(mat33 *A) {
-    for (int i = 0; i < 3; ++i)
-        for (int j = 0; j < 3; ++j)
-            A->m[i][j] = std::numeric_limits<float>::quiet_NaN();
-}
-/* *************************************************************** */
-mat33 reg_mat44_to_mat33(mat44 const* A) {
-    mat33 out;
-    out.m[0][0] = A->m[0][0];
-    out.m[0][1] = A->m[0][1];
-    out.m[0][2] = A->m[0][2];
-    out.m[1][0] = A->m[1][0];
-    out.m[1][1] = A->m[1][1];
-    out.m[1][2] = A->m[1][2];
-    out.m[2][0] = A->m[2][0];
-    out.m[2][1] = A->m[2][1];
-    out.m[2][2] = A->m[2][2];
-    return out;
-}
-/* *************************************************************** */
-mat44 reg_mat44_mul(mat44 const* A, mat44 const* B) {
-    mat44 R;
-    for (int i = 0; i < 4; i++) {
-        for (int j = 0; j < 4; j++) {
-            R.m[i][j] = static_cast<float>(static_cast<double>(A->m[i][0]) * static_cast<double>(B->m[0][j]) +
-                                           static_cast<double>(A->m[i][1]) * static_cast<double>(B->m[1][j]) +
-                                           static_cast<double>(A->m[i][2]) * static_cast<double>(B->m[2][j]) +
-                                           static_cast<double>(A->m[i][3]) * static_cast<double>(B->m[3][j]));
-        }
-    }
-    return R;
-}
-/* *************************************************************** */
-mat44 operator*(mat44 A, mat44 B) {
-    return reg_mat44_mul(&A, &B);
-}
-/* *************************************************************** */
-void reg_mat33_mul(mat44 const* mat, float const* in, float *out) {
-    out[0] = static_cast<float>(static_cast<double>(in[0]) * static_cast<double>(mat->m[0][0]) +
-                                static_cast<double>(in[1]) * static_cast<double>(mat->m[0][1]) +
-                                static_cast<double>(mat->m[0][3]));
-    out[1] = static_cast<float>(static_cast<double>(in[0]) * static_cast<double>(mat->m[1][0]) +
-                                static_cast<double>(in[1]) * static_cast<double>(mat->m[1][1]) +
-                                static_cast<double>(mat->m[1][3]));
-}
-/* *************************************************************** */
-void reg_mat33_mul(mat33 const* mat, float const* in, float *out) {
-    out[0] = static_cast<float>(static_cast<double>(in[0]) * static_cast<double>(mat->m[0][0]) +
-                                static_cast<double>(in[1]) * static_cast<double>(mat->m[0][1]) +
-                                static_cast<double>(mat->m[0][2]));
-    out[1] = static_cast<float>(static_cast<double>(in[0]) * static_cast<double>(mat->m[1][0]) +
-                                static_cast<double>(in[1]) * static_cast<double>(mat->m[1][1]) +
-                                static_cast<double>(mat->m[1][2]));
-}
-/* *************************************************************** */
-mat33 reg_mat33_mul(mat33 const* A, mat33 const* B) {
-    mat33 R;
-    for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
-            R.m[i][j] = static_cast<float>(static_cast<double>(A->m[i][0]) * static_cast<double>(B->m[0][j]) +
-                                           static_cast<double>(A->m[i][1]) * static_cast<double>(B->m[1][j]) +
-                                           static_cast<double>(A->m[i][2]) * static_cast<double>(B->m[2][j]));
-        }
-    }
-    return R;
-}
-/* *************************************************************** */
-mat33 operator*(mat33 A, mat33 B) {
-    return reg_mat33_mul(&A, &B);
-}
-/* *************************************************************** */
-mat33 reg_mat33_add(mat33 const* A, mat33 const* B) {
-    mat33 R;
-    for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
-            R.m[i][j] = static_cast<float>(static_cast<double>(A->m[i][j]) + static_cast<double>(B->m[i][j]));
-        }
-    }
-    return R;
-}
+template float Mat44Det<float>(const mat44 *A);
+template double Mat44Det<double>(const mat44 *A);
 /* *************************************************************** */
-mat33 reg_mat33_trans(mat33 A) {
-    mat33 R;
-    for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
-            R.m[j][i] = A.m[i][j];
-        }
-    }
-    return R;
-}
-/* *************************************************************** */
-mat33 operator+(mat33 A, mat33 B) {
-    return reg_mat33_add(&A, &B);
-}
-/* *************************************************************** */
-mat44 reg_mat44_add(mat44 const* A, mat44 const* B) {
-    mat44 R;
-    for (int i = 0; i < 4; i++) {
-        for (int j = 0; j < 4; j++) {
-            R.m[i][j] = static_cast<float>(static_cast<double>(A->m[i][j]) + static_cast<double>(B->m[i][j]));
-        }
-    }
-    return R;
-}
-/* *************************************************************** */
-mat44 operator+(mat44 A, mat44 B) {
-    return reg_mat44_add(&A, &B);
-}
-/* *************************************************************** */
-mat33 reg_mat33_minus(mat33 const* A, mat33 const* B) {
-    mat33 R;
-    for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
-            R.m[i][j] = static_cast<float>(static_cast<double>(A->m[i][j]) - static_cast<double>(B->m[i][j]));
-        }
-    }
-    return R;
-}
-/* *************************************************************** */
-void reg_mat33_diagonalize(mat33 const* A, mat33 * Q, mat33 * D) {
+void Mat33Diagonalize(const mat33 *A, mat33 *Q, mat33 *D) {
     // A must be a symmetric matrix.
     // returns Q and D such that
     // Diagonal matrix D = QT * A * Q;  and  A = Q*D*QT
@@ -522,126 +313,188 @@ void reg_mat33_diagonalize(mat33 const* A, mat33 * Q, mat33 * D) {
         q[3] /= mq;
     }
 }
-
-/* *************************************************************** */
-mat33 operator-(mat33 A, mat33 B) {
-    return reg_mat33_minus(&A, &B);
-}
 /* *************************************************************** */
-void reg_mat33_eye(mat33 *mat) {
-    mat->m[0][0] = 1.f;
-    mat->m[0][1] = mat->m[0][2] = 0.f;
-    mat->m[1][1] = 1.f;
-    mat->m[1][0] = mat->m[1][2] = 0.f;
-    mat->m[2][2] = 1.f;
-    mat->m[2][0] = mat->m[2][1] = 0.f;
+void Mat44Disp(const mat44 mat, const std::string& title) {
+    NR_COUT << title << ":\n"
+        << mat.m[0][0] << "\t" << mat.m[0][1] << "\t" << mat.m[0][2] << "\t" << mat.m[0][3] << "\n"
+        << mat.m[1][0] << "\t" << mat.m[1][1] << "\t" << mat.m[1][2] << "\t" << mat.m[1][3] << "\n"
+        << mat.m[2][0] << "\t" << mat.m[2][1] << "\t" << mat.m[2][2] << "\t" << mat.m[2][3] << "\n"
+        << mat.m[3][0] << "\t" << mat.m[3][1] << "\t" << mat.m[3][2] << "\t" << mat.m[3][3] << std::endl;
 }
 /* *************************************************************** */
-mat44 reg_mat44_minus(mat44 const* A, mat44 const* B) {
-    mat44 R;
-    for (int i = 0; i < 4; i++) {
-        for (int j = 0; j < 4; j++) {
-            R.m[i][j] = static_cast<float>(static_cast<double>(A->m[i][j]) - static_cast<double>(B->m[i][j]));
-        }
+/** @brief SVD
+* @param in input matrix to decompose - in place
+* @param size_m row
+* @param size_n colomn
+* @param w diagonal term
+* @param v rotation part
+*/
+template<class T>
+void Svd(T **in, const size_t size_m, const size_t size_n, T * w, T **v) {
+    if (size_m == 0 || size_n == 0)
+        NR_FATAL_ERROR("The specified matrix is empty");
+
+#ifdef _WIN32
+    long sm, sn, sn2;
+    long size__m = (long)size_m, size__n = (long)size_n;
+#else
+    size_t sm, sn, sn2;
+    size_t size__m = size_m, size__n = size_n;
+#endif
+    Eigen::MatrixXd m(size_m, size_n);
+
+    //Convert to Eigen matrix
+#ifdef _OPENMP
+#pragma omp parallel for default(none) \
+   shared(in,m, size__m, size__n) \
+   private(sn)
+#endif
+    for (sm = 0; sm < size__m; sm++)
+        for (sn = 0; sn < size__n; sn++)
+            m(sm, sn) = static_cast<double>(in[sm][sn]);
+
+    Eigen::JacobiSVD<Eigen::MatrixXd> Svd(m, Eigen::ComputeThinU | Eigen::ComputeThinV);
+
+#ifdef _OPENMP
+#pragma omp parallel for default(none) \
+   shared(in,Svd,v,w, size__n,size__m) \
+   private(sn2, sm)
+#endif
+    for (sn = 0; sn < size__n; sn++) {
+        w[sn] = static_cast<T>(Svd.singularValues()(sn));
+        for (sn2 = 0; sn2 < size__n; sn2++)
+            v[sn2][sn] = static_cast<T>(Svd.matrixV()(sn2, sn));
+        for (sm = 0; sm < size__m; sm++)
+            in[sm][sn] = static_cast<T>(Svd.matrixU()(sm, sn));
     }
-    return R;
-}
-/* *************************************************************** */
-mat44 operator-(mat44 A, mat44 B) {
-    return reg_mat44_minus(&A, &B);
-}
-/* *************************************************************** */
-void reg_mat44_eye(mat44 *mat) {
-    mat->m[0][0] = 1.f;
-    mat->m[0][1] = mat->m[0][2] = mat->m[0][3] = 0.f;
-    mat->m[1][1] = 1.f;
-    mat->m[1][0] = mat->m[1][2] = mat->m[1][3] = 0.f;
-    mat->m[2][2] = 1.f;
-    mat->m[2][0] = mat->m[2][1] = mat->m[2][3] = 0.f;
-    mat->m[3][3] = 1.f;
-    mat->m[3][0] = mat->m[3][1] = mat->m[3][2] = 0.f;
 }
+template void Svd<float>(float **in, const size_t m, const size_t n, float * w, float **v);
+template void Svd<double>(double **in, const size_t m, const size_t n, double * w, double **v);
 /* *************************************************************** */
-void reg_mat44_mul(mat44 const* mat,
-                   float const* in,
-                   float *out) {
-    out[0] = static_cast<float>(static_cast<double>(mat->m[0][0]) * static_cast<double>(in[0]) +
-                                static_cast<double>(mat->m[0][1]) * static_cast<double>(in[1]) +
-                                static_cast<double>(mat->m[0][2]) * static_cast<double>(in[2]) +
-                                static_cast<double>(mat->m[0][3]));
-    out[1] = static_cast<float>(static_cast<double>(mat->m[1][0]) * static_cast<double>(in[0]) +
-                                static_cast<double>(mat->m[1][1]) * static_cast<double>(in[1]) +
-                                static_cast<double>(mat->m[1][2]) * static_cast<double>(in[2]) +
-                                static_cast<double>(mat->m[1][3]));
-    out[2] = static_cast<float>(static_cast<double>(mat->m[2][0]) * static_cast<double>(in[0]) +
-                                static_cast<double>(mat->m[2][1]) * static_cast<double>(in[1]) +
-                                static_cast<double>(mat->m[2][2]) * static_cast<double>(in[2]) +
-                                static_cast<double>(mat->m[2][3]));
+template<class T>
+T Matrix2dDet(T **mat, const size_t m, const size_t n) {
+    if (m != n)
+        NR_FATAL_ERROR("The matrix have to be square: [" + std::to_string(m) + " " + std::to_string(n) + "]");
+
+    double res;
+    if (m == 2) {
+        res = static_cast<double>(mat[0][0]) * static_cast<double>(mat[1][1]) - static_cast<double>(mat[1][0]) * static_cast<double>(mat[0][1]);
+    } else if (m == 3) {
+        res = (static_cast<double>(mat[0][0]) * (static_cast<double>(mat[1][1]) * static_cast<double>(mat[2][2]) - static_cast<double>(mat[1][2]) * static_cast<double>(mat[2][1]))) -
+            (static_cast<double>(mat[0][1]) * (static_cast<double>(mat[1][0]) * static_cast<double>(mat[2][2]) - static_cast<double>(mat[1][2]) * static_cast<double>(mat[2][0]))) +
+            (static_cast<double>(mat[0][2]) * (static_cast<double>(mat[1][0]) * static_cast<double>(mat[2][1]) - static_cast<double>(mat[1][1]) * static_cast<double>(mat[2][0])));
+    } else {
+        // Convert to Eigen format
+        Eigen::MatrixXd eigenRes(m, n);
+        for (size_t i = 0; i < m; i++)
+            for (size_t j = 0; j < n; j++)
+                eigenRes(i, j) = static_cast<double>(mat[i][j]);
+        res = eigenRes.determinant();
+    }
+    return static_cast<T>(res);
 }
+template float Matrix2dDet<float>(float **mat, const size_t m, const size_t n);
+template double Matrix2dDet<double>(double **mat, const size_t m, const size_t n);
 /* *************************************************************** */
-void reg_mat44_mul(mat44 const* mat,
-                   double const* in,
-                   double *out) {
-    double matD[4][4];
-    for (int i = 0; i < 4; ++i)
-        for (int j = 0; j < 4; ++j)
-            matD[i][j] = static_cast<double>(mat->m[i][j]);
+void Mat33Expm(mat33 *tensorIn) {
+    int sm, sn;
+    Eigen::Matrix3d tensor;
+
+    // Convert to Eigen format
+    for (sm = 0; sm < 3; sm++) {
+        for (sn = 0; sn < 3; sn++) {
+            float val = tensorIn->m[sm][sn];
+            if (val != val) return;
+            tensor(sm, sn) = static_cast<double>(val);
+        }
+    }
+
+    // Compute exp(E)
+    tensor = tensor.exp();
+
+    // Convert the result to mat33 format
+    for (sm = 0; sm < 3; sm++)
+        for (sn = 0; sn < 3; sn++)
+            tensorIn->m[sm][sn] = static_cast<float>(tensor(sm, sn));
+}
+/* *************************************************************** */
+mat44 Mat44Expm(const mat44 *mat) {
+    mat44 X;
+    Eigen::Matrix4d m;
+    for (size_t i = 0; i < 4; ++i)
+        for (size_t j = 0; j < 4; ++j)
+            m(i, j) = static_cast<double>(mat->m[i][j]);
 
-    out[0] = matD[0][0] * in[0] +
-        matD[0][1] * in[1] +
-        matD[0][2] * in[2] +
-        matD[0][3];
-    out[1] = matD[1][0] * in[0] +
-        matD[1][1] * in[1] +
-        matD[1][2] * in[2] +
-        matD[1][3];
-    out[2] = matD[2][0] * in[0] +
-        matD[2][1] * in[1] +
-        matD[2][2] * in[2] +
-        matD[2][3];
-    return;
+    m = m.exp();
+
+    for (size_t i = 0; i < 4; ++i)
+        for (size_t j = 0; j < 4; ++j)
+            X.m[i][j] = static_cast<float>(m(i, j));
+
+    return X;
 }
 /* *************************************************************** */
-mat44 reg_mat44_mul(mat44 const* A, double scalar) {
+void Mat33Logm(mat33 *tensorIn) {
+    int sm, sn;
+    Eigen::Matrix3d tensor;
+
+    // Convert to Eigen format
+    bool all_zeros = true;
+    double det = 0;
+    for (sm = 0; sm < 3; sm++) {
+        for (sn = 0; sn < 3; sn++) {
+            float val = tensorIn->m[sm][sn];
+            if (val != 0.f) all_zeros = false;
+            if (val != val) return;
+            tensor(sm, sn) = static_cast<double>(val);
+        }
+    }
+    // Actually R case requires invertible and no negative real ev,
+    // but the only observed case so far was non-invertible.
+    // determinant is not a perfect check for invertibility and
+    // identity with zero not great either, but the alternative
+    // is a general eigensolver and the logarithm function should
+    // suceed unless convergence just isn't happening.
+    det = tensor.determinant();
+    if (all_zeros || det == 0) {
+        Mat33ToNan(tensorIn);
+        return;
+    }
+
+    // Compute the actual matrix log
+    tensor = tensor.log();
+
+    // Convert the result to mat33 format
+    for (sm = 0; sm < 3; sm++)
+        for (sn = 0; sn < 3; sn++)
+            tensorIn->m[sm][sn] = static_cast<float>(tensor(sm, sn));
+}
+/* *************************************************************** */
+mat44 Mat44Logm(const mat44 *mat) {
+    mat44 X;
+    Eigen::Matrix4d m;
+    for (char i = 0; i < 4; ++i)
+        for (char j = 0; j < 4; ++j)
+            m(i, j) = static_cast<double>(mat->m[i][j]);
+    m = m.log();
+    for (char i = 0; i < 4; ++i)
+        for (char j = 0; j < 4; ++j)
+            X.m[i][j] = static_cast<float>(m(i, j));
+    return X;
+}
+/* *************************************************************** */
+mat44 Mat44Avg2(const mat44 *A, const mat44 *B) {
     mat44 out;
-    out.m[0][0] = A->m[0][0] * scalar;
-    out.m[0][1] = A->m[0][1] * scalar;
-    out.m[0][2] = A->m[0][2] * scalar;
-    out.m[0][3] = A->m[0][3] * scalar;
-    out.m[1][0] = A->m[1][0] * scalar;
-    out.m[1][1] = A->m[1][1] * scalar;
-    out.m[1][2] = A->m[1][2] * scalar;
-    out.m[1][3] = A->m[1][3] * scalar;
-    out.m[2][0] = A->m[2][0] * scalar;
-    out.m[2][1] = A->m[2][1] * scalar;
-    out.m[2][2] = A->m[2][2] * scalar;
-    out.m[2][3] = A->m[2][3] * scalar;
-    out.m[3][0] = A->m[3][0] * scalar;
-    out.m[3][1] = A->m[3][1] * scalar;
-    out.m[3][2] = A->m[3][2] * scalar;
-    out.m[3][3] = A->m[3][3] * scalar;
-    return out;
-}
-/* *************************************************************** */
-void reg_mat44_disp(const mat44& mat, const std::string& title) {
-    NR_COUT << title << ":\n"
-        << mat.m[0][0] << "\t" << mat.m[0][1] << "\t" << mat.m[0][2] << "\t" << mat.m[0][3] << "\n"
-        << mat.m[1][0] << "\t" << mat.m[1][1] << "\t" << mat.m[1][2] << "\t" << mat.m[1][3] << "\n"
-        << mat.m[2][0] << "\t" << mat.m[2][1] << "\t" << mat.m[2][2] << "\t" << mat.m[2][3] << "\n"
-        << mat.m[3][0] << "\t" << mat.m[3][1] << "\t" << mat.m[3][2] << "\t" << mat.m[3][3] << std::endl;
-}
-/* *************************************************************** */
-//is it square distance or just distance?
-// Helper function: Get the square of the Euclidean distance
-double get_square_distance3D(float * first_point3D, float * second_point3D) {
-    return sqrt(Square(first_point3D[0] - second_point3D[0]) +
-                Square(first_point3D[1] - second_point3D[1]) +
-                Square(first_point3D[2] - second_point3D[2]));
+    mat44 logA = Mat44Logm(A);
+    mat44 logB = Mat44Logm(B);
+    for (int i = 0; i < 4; ++i) {
+        logA.m[3][i] = 0.f;
+        logB.m[3][i] = 0.f;
+    }
+    logA = logA + logB;
+    out = logA * 0.5;
+    return Mat44Expm(&out);
 }
 /* *************************************************************** */
-//is it square distance or just distance?
-double get_square_distance2D(float * first_point2D, float * second_point2D) {
-    return sqrt(Square(first_point2D[0] - second_point2D[0]) +
-                Square(first_point2D[1] - second_point2D[1]));
-}
+} // namespace NiftyReg
 /* *************************************************************** */
diff --git a/reg-lib/cpu/Maths.hpp b/reg-lib/cpu/Maths.hpp
new file mode 100644
index 00000000..56782eda
--- /dev/null
+++ b/reg-lib/cpu/Maths.hpp
@@ -0,0 +1,363 @@
+/**
+ * @file Maths.hpp
+ * @brief Library that contains small math routines
+ * @author Marc Modat
+ * @date 25/03/2009
+ *
+ *  Created by Marc Modat on 25/03/2009.
+ *  Copyright (c) 2009-2018, University College London
+ *  Copyright (c) 2018, NiftyReg Developers.
+ *  All rights reserved.
+ *  See the LICENSE.txt file in the nifty_reg root folder
+ *
+ */
+
+#pragma once
+
+#include "RNifti.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#if USE_SSE
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+#endif
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+
+#ifdef __CUDACC__
+#define DEVICE  __host__ __device__
+#else
+#define DEVICE
+#endif
+
+typedef enum {
+    DEF_FIELD,
+    DISP_FIELD,
+    CUB_SPLINE_GRID,
+    DEF_VEL_FIELD,
+    DISP_VEL_FIELD,
+    SPLINE_VEL_GRID,
+    LIN_SPLINE_GRID
+} NREG_TRANS_TYPE;
+
+/* *************************************************************** */
+namespace NiftyReg {
+/* *************************************************************** */
+// The functions in the standard library are slower; so, these are implemented
+template<typename T>
+DEVICE inline T Square(const T& x) {
+    return x * x;
+}
+template<typename T>
+DEVICE inline T Cube(const T& x) {
+    return x * x * x;
+}
+template<typename T>
+DEVICE inline int Floor(const T& x) {
+    const int i = static_cast<int>(x);
+    return i - (x < i);
+}
+template<typename T>
+DEVICE inline int Ceil(const T& x) {
+    const int i = static_cast<int>(x);
+    return i + (x > i);
+}
+template<typename T>
+DEVICE inline int Round(const T& x) {
+    return static_cast<int>(x + (x >= 0 ? 0.5 : -0.5));
+}
+/* *************************************************************** */
+DEVICE inline void Divide(const int num, const int denom, int& quot, int& rem) {
+    // This will be optimised by the compiler into a single div instruction
+    quot = num / denom;
+    rem = num % denom;
+}
+/* *************************************************************** */
+template<class T>
+DEVICE inline T* Matrix1dAlloc(const size_t arraySize) {
+    return static_cast<T*>(malloc(arraySize * sizeof(T)));
+}
+/* *************************************************************** */
+template<class T>
+DEVICE inline void Matrix1dDealloc(T *mat) {
+    free(mat);
+}
+/* *************************************************************** */
+template<class T>
+DEVICE inline T** Matrix2dAlloc(const size_t arraySizeX, const size_t arraySizeY) {
+    T **res;
+    res = static_cast<T**>(malloc(arraySizeX * sizeof(T*)));
+    for (size_t i = 0; i < arraySizeX; i++)
+        res[i] = static_cast<T*>(malloc(arraySizeY * sizeof(T)));
+    return res;
+}
+/* *************************************************************** */
+template<class T>
+DEVICE inline void Matrix2dDealloc(const size_t arraySizeX, T **mat) {
+    for (size_t i = 0; i < arraySizeX; i++)
+        free(mat[i]);
+    free(mat);
+}
+/* *************************************************************** */
+template<class T>
+DEVICE inline T** Matrix2dTranspose(T **mat, const size_t arraySizeX, const size_t arraySizeY) {
+    T **res;
+    res = static_cast<T**>(malloc(arraySizeY * sizeof(T*)));
+    for (size_t i = 0; i < arraySizeY; i++)
+        res[i] = static_cast<T*>(malloc(arraySizeX * sizeof(T)));
+    for (size_t i = 0; i < arraySizeX; i++)
+        for (size_t j = 0; j < arraySizeY; j++)
+            res[j][i] = mat[i][j];
+    return res;
+}
+/* *************************************************************** */
+template<class T>
+T** Matrix2dMultiply(T **mat1, const size_t mat1X, const size_t mat1Y, T **mat2, const size_t mat2X, const size_t mat2Y, const bool transposeMat2);
+template<class T>
+void Matrix2dMultiply(T **mat1, const size_t mat1X, const size_t mat1Y, T **mat2, const size_t mat2X, const size_t mat2Y, T **res, const bool transposeMat2);
+/* *************************************************************** */
+template<class T>
+T* Matrix2dVectorMultiply(T **mat, const size_t m, const size_t n, T *vect);
+template<class T>
+void Matrix2dVectorMultiply(T **mat, const size_t m, const size_t n, T *vect, T *res);
+/* *************************************************************** */
+/// @brief Subtract two 3-by-3 matrices
+DEVICE inline mat33 operator-(const mat33 A, const mat33 B) {
+    mat33 R;
+    for (int i = 0; i < 3; i++)
+        for (int j = 0; j < 3; j++)
+            R.m[i][j] = static_cast<float>(static_cast<double>(A.m[i][j]) - static_cast<double>(B.m[i][j]));
+    return R;
+}
+/* *************************************************************** */
+/// @brief Multiply two 3-by-3 matrices
+DEVICE inline mat33 operator*(const mat33 A, const mat33 B) {
+    mat33 R;
+    for (int i = 0; i < 3; i++)
+        for (int j = 0; j < 3; j++)
+            R.m[i][j] = static_cast<float>(static_cast<double>(A.m[i][0]) * static_cast<double>(B.m[0][j]) +
+                                           static_cast<double>(A.m[i][1]) * static_cast<double>(B.m[1][j]) +
+                                           static_cast<double>(A.m[i][2]) * static_cast<double>(B.m[2][j]));
+    return R;
+}
+/* *************************************************************** */
+/// @brief Multiply a vector with a 3-by-3 matrix
+DEVICE inline void Mat33Mul(const mat33 mat, const float(&in)[2], float(&out)[2]) {
+    out[0] = static_cast<float>(static_cast<double>(in[0]) * static_cast<double>(mat.m[0][0]) +
+                                static_cast<double>(in[1]) * static_cast<double>(mat.m[0][1]) +
+                                static_cast<double>(mat.m[0][2]));
+    out[1] = static_cast<float>(static_cast<double>(in[0]) * static_cast<double>(mat.m[1][0]) +
+                                static_cast<double>(in[1]) * static_cast<double>(mat.m[1][1]) +
+                                static_cast<double>(mat.m[1][2]));
+}
+/* *************************************************************** */
+/// @brief Multiply a vector with a 3-by-3 matrix
+DEVICE inline void Mat33Mul(const mat44 mat, const float (&in)[2], float (&out)[2]) {
+    out[0] = static_cast<float>(static_cast<double>(in[0]) * static_cast<double>(mat.m[0][0]) +
+                                static_cast<double>(in[1]) * static_cast<double>(mat.m[0][1]) +
+                                static_cast<double>(mat.m[0][3]));
+    out[1] = static_cast<float>(static_cast<double>(in[0]) * static_cast<double>(mat.m[1][0]) +
+                                static_cast<double>(in[1]) * static_cast<double>(mat.m[1][1]) +
+                                static_cast<double>(mat.m[1][3]));
+}
+/* *************************************************************** */
+/// @brief Multiply a scalar with a 3-by-3 matrix multiplied by a vector
+template<bool is3d>
+DEVICE inline void Mat33Mul(const mat33 mat, const float (&in)[3], const float weight, float (&out)[3]) {
+    out[0] = weight * (mat.m[0][0] * in[0] + mat.m[1][0] * in[1] + mat.m[2][0] * in[2]);
+    out[1] = weight * (mat.m[0][1] * in[0] + mat.m[1][1] * in[1] + mat.m[2][1] * in[2]);
+    if constexpr (is3d)
+        out[2] = weight * (mat.m[0][2] * in[0] + mat.m[1][2] * in[1] + mat.m[2][2] * in[2]);
+}
+/* *************************************************************** */
+/// @brief Transpose a 3-by-3 matrix
+DEVICE inline mat33 Mat33Trans(const mat33 A) {
+    mat33 R;
+    for (int i = 0; i < 3; i++)
+        for (int j = 0; j < 3; j++)
+            R.m[j][i] = A.m[i][j];
+    return R;
+}
+/* *************************************************************** */
+/// @brief Diagonalize a 3-by-3 matrix
+void Mat33Diagonalize(const mat33 *A, mat33 *Q, mat33 *D);
+/* *************************************************************** */
+/// @brief Set up a 3-by-3 matrix with an identity
+DEVICE inline void Mat33Eye(mat33 *mat) {
+    mat->m[0][0] = 1.f;
+    mat->m[0][1] = mat->m[0][2] = 0.f;
+    mat->m[1][1] = 1.f;
+    mat->m[1][0] = mat->m[1][2] = 0.f;
+    mat->m[2][2] = 1.f;
+    mat->m[2][0] = mat->m[2][1] = 0.f;
+}
+/* *************************************************************** */
+DEVICE inline void Mat33ToNan(mat33 *A) {
+    for (int i = 0; i < 3; ++i)
+        for (int j = 0; j < 3; ++j)
+            A->m[i][j] = std::numeric_limits<float>::quiet_NaN();
+}
+/* *************************************************************** */
+/// @brief Transform a mat44 to a mat33 matrix
+DEVICE inline mat33 Mat44ToMat33(const mat44 *A) {
+    mat33 out;
+    out.m[0][0] = A->m[0][0];
+    out.m[0][1] = A->m[0][1];
+    out.m[0][2] = A->m[0][2];
+    out.m[1][0] = A->m[1][0];
+    out.m[1][1] = A->m[1][1];
+    out.m[1][2] = A->m[1][2];
+    out.m[2][0] = A->m[2][0];
+    out.m[2][1] = A->m[2][1];
+    out.m[2][2] = A->m[2][2];
+    return out;
+}
+/* *************************************************************** */
+template<class T>
+void HeapSort(T *array_tmp, int blockNum);
+void HeapSort(float *array_tmp, int *index_tmp, int blockNum);
+/* *************************************************************** */
+DEVICE inline bool operator==(const mat44 A, const mat44 B) {
+    for (char i = 0; i < 4; ++i)
+        for (char j = 0; j < 4; ++j)
+            if (A.m[i][j] != B.m[i][j])
+                return false;
+    return true;
+}
+/* *************************************************************** */
+DEVICE inline bool operator!=(const mat44 A, const mat44 B) {
+    return !(A == B);
+}
+/* *************************************************************** */
+/// @brief Multiply two 4-by-4 matrices
+DEVICE inline mat44 operator*(const mat44 A, const mat44 B) {
+    mat44 R;
+    for (int i = 0; i < 4; i++)
+        for (int j = 0; j < 4; j++)
+            R.m[i][j] = static_cast<float>(static_cast<double>(A.m[i][0]) * static_cast<double>(B.m[0][j]) +
+                                           static_cast<double>(A.m[i][1]) * static_cast<double>(B.m[1][j]) +
+                                           static_cast<double>(A.m[i][2]) * static_cast<double>(B.m[2][j]) +
+                                           static_cast<double>(A.m[i][3]) * static_cast<double>(B.m[3][j]));
+    return R;
+}
+/* *************************************************************** */
+/// @brief Multiply a 4-by-4 matrix with a scalar
+DEVICE inline mat44 operator*(const mat44 mat, const double scalar) {
+    mat44 out;
+    out.m[0][0] = mat.m[0][0] * scalar;
+    out.m[0][1] = mat.m[0][1] * scalar;
+    out.m[0][2] = mat.m[0][2] * scalar;
+    out.m[0][3] = mat.m[0][3] * scalar;
+    out.m[1][0] = mat.m[1][0] * scalar;
+    out.m[1][1] = mat.m[1][1] * scalar;
+    out.m[1][2] = mat.m[1][2] * scalar;
+    out.m[1][3] = mat.m[1][3] * scalar;
+    out.m[2][0] = mat.m[2][0] * scalar;
+    out.m[2][1] = mat.m[2][1] * scalar;
+    out.m[2][2] = mat.m[2][2] * scalar;
+    out.m[2][3] = mat.m[2][3] * scalar;
+    out.m[3][0] = mat.m[3][0] * scalar;
+    out.m[3][1] = mat.m[3][1] * scalar;
+    out.m[3][2] = mat.m[3][2] * scalar;
+    out.m[3][3] = mat.m[3][3] * scalar;
+    return out;
+}
+/* *************************************************************** */
+/// @brief Multiply a vector with a 4-by-4 matrix
+template<class T, bool is3d=true>
+DEVICE inline void Mat44Mul(const mat44 mat, const T(&in)[3], T(&out)[3]) {
+    out[0] = static_cast<T>(static_cast<double>(mat.m[0][0]) * static_cast<double>(in[0]) +
+                            static_cast<double>(mat.m[0][1]) * static_cast<double>(in[1]) +
+                            static_cast<double>(mat.m[0][2]) * static_cast<double>(in[2]) +
+                            static_cast<double>(mat.m[0][3]));
+    out[1] = static_cast<T>(static_cast<double>(mat.m[1][0]) * static_cast<double>(in[0]) +
+                            static_cast<double>(mat.m[1][1]) * static_cast<double>(in[1]) +
+                            static_cast<double>(mat.m[1][2]) * static_cast<double>(in[2]) +
+                            static_cast<double>(mat.m[1][3]));
+    if constexpr (is3d)
+        out[2] = static_cast<T>(static_cast<double>(mat.m[2][0]) * static_cast<double>(in[0]) +
+                                static_cast<double>(mat.m[2][1]) * static_cast<double>(in[1]) +
+                                static_cast<double>(mat.m[2][2]) * static_cast<double>(in[2]) +
+                                static_cast<double>(mat.m[2][3]));
+}
+/* *************************************************************** */
+/// @brief Add two 4-by-4 matrices
+DEVICE inline mat44 operator+(const mat44 A, const mat44 B) {
+    mat44 R;
+    for (int i = 0; i < 4; i++)
+        for (int j = 0; j < 4; j++)
+            R.m[i][j] = static_cast<float>(static_cast<double>(A.m[i][j]) + static_cast<double>(B.m[i][j]));
+    return R;
+}
+/* *************************************************************** */
+/// @brief Subtract two 4-by-4 matrices
+DEVICE inline mat44 operator-(const mat44 A, const mat44 B) {
+    mat44 R;
+    for (int i = 0; i < 4; i++)
+        for (int j = 0; j < 4; j++)
+            R.m[i][j] = static_cast<float>(static_cast<double>(A.m[i][j]) - static_cast<double>(B.m[i][j]));
+    return R;
+}
+/* *************************************************************** */
+/// @brief Set up a 4-by-4 matrix with an identity
+DEVICE inline void Mat44Eye(mat44 *mat) {
+    mat->m[0][0] = 1.f;
+    mat->m[0][1] = mat->m[0][2] = mat->m[0][3] = 0.f;
+    mat->m[1][1] = 1.f;
+    mat->m[1][0] = mat->m[1][2] = mat->m[1][3] = 0.f;
+    mat->m[2][2] = 1.f;
+    mat->m[2][0] = mat->m[2][1] = mat->m[2][3] = 0.f;
+    mat->m[3][3] = 1.f;
+    mat->m[3][0] = mat->m[3][1] = mat->m[3][2] = 0.f;
+}
+/* *************************************************************** */
+/// @brief Compute the determinant of a 4-by-4 matrix
+template<class T>
+T Mat44Det(const mat44 *A);
+/* *************************************************************** */
+/// @brief Display a mat44 matrix
+void Mat44Disp(const mat44 mat, const std::string& title);
+/* *************************************************************** */
+//is it square distance or just distance?
+DEVICE inline double SquareDistance2d(const float *first_point2D, const float *second_point2D) {
+    return sqrt(Square(first_point2D[0] - second_point2D[0]) +
+                Square(first_point2D[1] - second_point2D[1]));
+}
+/* *************************************************************** */
+//is it square distance or just distance?
+DEVICE inline double SquareDistance3d(const float *first_point3D, const float *second_point3D) {
+    return sqrt(Square(first_point3D[0] - second_point3D[0]) +
+                Square(first_point3D[1] - second_point3D[1]) +
+                Square(first_point3D[2] - second_point3D[2]));
+}
+/* *************************************************************** */
+template<class T>
+void Svd(T **in, const size_t m, const size_t n, T *w, T **v);
+/* *************************************************************** */
+template<class T>
+T Matrix2dDet(T **mat, const size_t m, const size_t n);
+/* *************************************************************** */
+/// @brief Compute the log of a 3-by-3 matrix
+void Mat33Expm(mat33 *tensorIn);
+/* *************************************************************** */
+/// @brief Compute the exp of a 4-by-4 matrix
+mat44 Mat44Expm(const mat44 *mat);
+/* *************************************************************** */
+/// @brief Compute the log of a 3-by-3 matrix
+void Mat33Logm(mat33 *tensorIn);
+/* *************************************************************** */
+/// @brief Compute the log of a 4-by-4 matrix
+mat44 Mat44Logm(const mat44 *mat);
+/* *************************************************************** */
+/// @brief Compute the average of two matrices using a log-euclidean framework
+mat44 Mat44Avg2(const mat44 *A, const mat44 *b);
+/* *************************************************************** */
+} // namespace NiftyReg
+/* *************************************************************** */
diff --git a/reg-lib/cpu/_reg_blockMatching.cpp b/reg-lib/cpu/_reg_blockMatching.cpp
index e91ef03a..fce081f5 100755
--- a/reg-lib/cpu/_reg_blockMatching.cpp
+++ b/reg-lib/cpu/_reg_blockMatching.cpp
@@ -185,7 +185,7 @@ void _reg_set_active_blocks(nifti_image *referenceImage, _reg_blockMatchingParam
    params->activeBlockNumber = params->activeBlockNumber < ((int)params->totalBlockNumber - unusableBlock) ? params->activeBlockNumber : (params->totalBlockNumber - unusableBlock);
    //params->activeBlockNumber = params->totalBlockNumber - unusableBlock;
 
-   reg_heapSort(varianceArray, indexArray, params->totalBlockNumber);
+   HeapSort(varianceArray, indexArray, params->totalBlockNumber);
    int *indexArrayPtr = &indexArray[params->totalBlockNumber - 1];
    int count = 0;
    for (int i = 0; i < params->activeBlockNumber; i++) {
@@ -432,13 +432,13 @@ void block_matching_method2D(nifti_image * reference, nifti_image * warped, _reg
             bestDisplacement[1] += referencePosition_temp[1];
             bestDisplacement[2] = 0.0f;
 
-            reg_mat44_mul(referenceMatrix_xyz, referencePosition_temp, tempPosition);
+            Mat44Mul(*referenceMatrix_xyz, referencePosition_temp, tempPosition);
             z = 2 * params->totalBlock[blockIndex];
 
             params->referencePosition[z] = tempPosition[0];
             params->referencePosition[z + 1] = tempPosition[1];
 
-            reg_mat44_mul(referenceMatrix_xyz, bestDisplacement, tempPosition);
+            Mat44Mul(*referenceMatrix_xyz, bestDisplacement, tempPosition);
 
             params->warpedPosition[z] = tempPosition[0];
             params->warpedPosition[z + 1] = tempPosition[1];
@@ -664,13 +664,13 @@ void block_matching_method3D(nifti_image * reference,
                bestDisplacement[1] += referencePosition_temp[1];
                bestDisplacement[2] += referencePosition_temp[2];
 
-               reg_mat44_mul(referenceMatrix_xyz, referencePosition_temp, tempPosition);
+               Mat44Mul(*referenceMatrix_xyz, referencePosition_temp, tempPosition);
                z = 3 * params->totalBlock[blockIndex];
                params->referencePosition[z] = tempPosition[0];
                params->referencePosition[z+1] = tempPosition[1];
                params->referencePosition[z+2] = tempPosition[2];
 
-               reg_mat44_mul(referenceMatrix_xyz, bestDisplacement, tempPosition);
+               Mat44Mul(*referenceMatrix_xyz, bestDisplacement, tempPosition);
                params->warpedPosition[z] = tempPosition[0];
                params->warpedPosition[z + 1] = tempPosition[1];
                params->warpedPosition[z + 2] = tempPosition[2];
@@ -757,7 +757,7 @@ void optimize(_reg_blockMatchingParam *params,
          //Can have undefined = NaN in the warped image now -
          //to not loose the correspondence - so check that:
          if(in[0] == in[0]){
-            reg_mat33_mul(transformation_matrix, in, out);
+            Mat33Mul(*transformation_matrix, in, out);
 
             referencePositionVect.push_back(params->referencePosition[index]);
             referencePositionVect.push_back(params->referencePosition[index+1]);
@@ -802,7 +802,7 @@ void optimize(_reg_blockMatchingParam *params,
          //Can have undefined = NaN in the warped image now -
          //to not loose the correspondence - so check that:
          if(in[0] == in[0]){
-            reg_mat44_mul(transformation_matrix, in, out);
+            Mat44Mul(*transformation_matrix, in, out);
 
             referencePositionVect.push_back(params->referencePosition[index]);
             referencePositionVect.push_back(params->referencePosition[index+1]);
diff --git a/reg-lib/cpu/_reg_blockMatching.h b/reg-lib/cpu/_reg_blockMatching.h
index f370df90..bf8095a0 100755
--- a/reg-lib/cpu/_reg_blockMatching.h
+++ b/reg-lib/cpu/_reg_blockMatching.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "_reg_maths.h"
+#include "Maths.hpp"
 
 #define TOLERANCE 0.001
 #define MAX_ITERATIONS 30
diff --git a/reg-lib/cpu/_reg_globalTrans.cpp b/reg-lib/cpu/_reg_globalTrans.cpp
index a2e8ef60..dcf59de7 100755
--- a/reg-lib/cpu/_reg_globalTrans.cpp
+++ b/reg-lib/cpu/_reg_globalTrans.cpp
@@ -11,8 +11,7 @@
  */
 
 #include "_reg_globalTrans.h"
-#include "_reg_maths.h"
-#include "_reg_maths_eigen.h"
+#include "Maths.hpp"
 
 /* *************************************************************** */
 /* *************************************************************** */
@@ -28,15 +27,13 @@ void reg_affine_deformationField2D(mat44 *affineTransformation,
 
    mat44 *referenceMatrix;
    if(deformationFieldImage->sform_code>0)
-   {
-      referenceMatrix=&(deformationFieldImage->sto_xyz);
-   }
-   else referenceMatrix=&(deformationFieldImage->qto_xyz);
+      referenceMatrix=&deformationFieldImage->sto_xyz;
+   else referenceMatrix=&deformationFieldImage->qto_xyz;
 
    mat44 transformationMatrix;
    if(composition)
       transformationMatrix = *affineTransformation;
-   else transformationMatrix = reg_mat44_mul(affineTransformation, referenceMatrix);
+   else transformationMatrix = *affineTransformation * *referenceMatrix;
 
    double voxel[3]={0,0,0}, position[3]={0,0,0};
    int x=0, y=0;
@@ -61,9 +58,9 @@ void reg_affine_deformationField2D(mat44 *affineTransformation,
             {
                voxel[0] = (double) deformationFieldPtrX[index];
                voxel[1] = (double) deformationFieldPtrY[index];
-               reg_mat44_mul(&transformationMatrix, voxel, position);
+               Mat44Mul(transformationMatrix, voxel, position);
             }
-            else reg_mat44_mul(&transformationMatrix, voxel, position);
+            else Mat44Mul(transformationMatrix, voxel, position);
 
             /* the deformation field (real coordinates) is stored */
             deformationFieldPtrX[index] = (FieldTYPE) position[0];
@@ -87,15 +84,13 @@ void reg_affine_deformationField3D(mat44 *affineTransformation,
 
    mat44 *referenceMatrix;
    if(deformationFieldImage->sform_code>0)
-   {
-      referenceMatrix=&(deformationFieldImage->sto_xyz);
-   }
-   else referenceMatrix=&(deformationFieldImage->qto_xyz);
+      referenceMatrix=&deformationFieldImage->sto_xyz;
+   else referenceMatrix=&deformationFieldImage->qto_xyz;
 
    mat44 transformationMatrix;
    if(composition)
       transformationMatrix = *affineTransformation;
-   else transformationMatrix = reg_mat44_mul(affineTransformation, referenceMatrix);
+   else transformationMatrix = *affineTransformation * *referenceMatrix;
 
    double voxel[3]={0,0,0}, position[3]={0,0,0};
    int x=0, y=0, z=0;
@@ -124,7 +119,7 @@ void reg_affine_deformationField3D(mat44 *affineTransformation,
                   voxel[1]= (double) deformationFieldPtrY[index];
                   voxel[2]= (double) deformationFieldPtrZ[index];
                }
-               reg_mat44_mul(&transformationMatrix, voxel, position);
+               Mat44Mul(transformationMatrix, voxel, position);
 
                /* the deformation field (real coordinates) is stored */
                deformationFieldPtrX[index] = (FieldTYPE) position[0];
@@ -207,9 +202,9 @@ void estimate_rigid_transformation2D(float** points1, float** points2, int num_p
    centroid_warpedFloat[0] = static_cast<float>(centroid_warped[0]);
    centroid_warpedFloat[1] = static_cast<float>(centroid_warped[1]);
 
-   float * w = reg_matrix1DAllocate<float>(2);
-   float **v = reg_matrix2DAllocate<float>(2, 2);
-   float **r = reg_matrix2DAllocate<float>(2, 2);
+   float * w = Matrix1dAlloc<float>(2);
+   float **v = Matrix2dAlloc<float>(2, 2);
+   float **r = Matrix2dAlloc<float>(2, 2);
 
    // Demean the input points
    for (int j = 0; j < num_points; ++j) {
@@ -220,24 +215,24 @@ void estimate_rigid_transformation2D(float** points1, float** points2, int num_p
       points2[j][1] = static_cast<float>(static_cast<double>(points2[j][1]) - static_cast<double>(centroid_warpedFloat[1]));
    }
 
-   float **p1t = reg_matrix2DTranspose<float>(points1, num_points, 2);
-   float **u = reg_matrix2DMultiply<float>(p1t,2, num_points, points2, num_points, 2, false);
+   float **p1t = Matrix2dTranspose<float>(points1, num_points, 2);
+   float **u = Matrix2dMultiply<float>(p1t,2, num_points, points2, num_points, 2, false);
 
-   svd(u, 2, 2, w, v);
+   Svd(u, 2, 2, w, v);
 
    // Calculate transpose
-   float **ut = reg_matrix2DTranspose<float>(u, 2, 2);
+   float **ut = Matrix2dTranspose<float>(u, 2, 2);
 
    // Calculate the rotation matrix
-   reg_matrix2DMultiply<float>(v, 2, 2, ut, 2, 2, r, false);
+   Matrix2dMultiply<float>(v, 2, 2, ut, 2, 2, r, false);
 
-   float det = reg_matrix2DDet<float>(r, 2, 2);
+   float det = Matrix2dDet<float>(r, 2, 2);
 
    // Take care of possible reflection
    if (det < 0) {
       v[0][1] = -v[0][1];
       v[1][1] = -v[1][1];
-      reg_matrix2DMultiply<float>(v, 2, 2, ut, 2, 2, r, false);
+      Matrix2dMultiply<float>(v, 2, 2, ut, 2, 2, r, false);
    }
 
    // Calculate the translation
@@ -271,12 +266,12 @@ void estimate_rigid_transformation2D(float** points1, float** points2, int num_p
    transformation->m[3][3] = 1.0f;
 
    // Do the deletion here
-   reg_matrix2DDeallocate(2, u);
-   reg_matrix1DDeallocate(w);
-   reg_matrix2DDeallocate(2, v);
-   reg_matrix2DDeallocate(2, ut);
-   reg_matrix2DDeallocate(2, r);
-   //    reg_matrix2DDeallocate(2, p1t);
+   Matrix2dDealloc(2, u);
+   Matrix1dDealloc(w);
+   Matrix2dDealloc(2, v);
+   Matrix2dDealloc(2, ut);
+   Matrix2dDealloc(2, r);
+   //    Matrix2dDealloc(2, p1t);
    for(size_t dance=0;dance<2;++dance) free(p1t[dance]); free(p1t);
 }
 /* *************************************************************** */
@@ -284,8 +279,8 @@ void estimate_rigid_transformation2D(std::vector<_reg_sorted_point2D> &points, m
 {
 
    unsigned num_points = points.size();
-   float** points1 = reg_matrix2DAllocate<float>(num_points, 2);
-   float** points2 = reg_matrix2DAllocate<float>(num_points, 2);
+   float** points1 = Matrix2dAlloc<float>(num_points, 2);
+   float** points2 = Matrix2dAlloc<float>(num_points, 2);
    for (unsigned i = 0; i < num_points; i++) {
       points1[i][0] = points[i].reference[0];
       points1[i][1] = points[i].reference[1];
@@ -294,8 +289,8 @@ void estimate_rigid_transformation2D(std::vector<_reg_sorted_point2D> &points, m
    }
    estimate_rigid_transformation2D(points1, points2, num_points, transformation);
    //FREE MEMORY
-   reg_matrix2DDeallocate(num_points, points1);
-   reg_matrix2DDeallocate(num_points, points2);
+   Matrix2dDealloc(num_points, points1);
+   Matrix2dDealloc(num_points, points2);
 }
 /* *************************************************************** */
 void estimate_rigid_transformation3D(float** points1, float** points2, int num_points, mat44 * transformation)
@@ -335,9 +330,9 @@ void estimate_rigid_transformation3D(float** points1, float** points2, int num_p
    centroid_warpedFloat[1] = static_cast<float>(centroid_warped[1]);
    centroid_warpedFloat[2] = static_cast<float>(centroid_warped[2]);
 
-   float * w = reg_matrix1DAllocate<float>(3);
-   float **v  = reg_matrix2DAllocate<float>(3, 3);
-   float **r  = reg_matrix2DAllocate<float>(3, 3);
+   float * w = Matrix1dAlloc<float>(3);
+   float **v  = Matrix2dAlloc<float>(3, 3);
+   float **r  = Matrix2dAlloc<float>(3, 3);
 
    // Demean the input points
    for (int j = 0; j < num_points; ++j) {
@@ -349,27 +344,27 @@ void estimate_rigid_transformation3D(float** points1, float** points2, int num_p
       points2[j][1] = static_cast<float>(static_cast<double>(points2[j][1]) - static_cast<double>(centroid_warpedFloat[1]));
       points2[j][2] = static_cast<float>(static_cast<double>(points2[j][2]) - static_cast<double>(centroid_warpedFloat[2]));
    }
-   //T** reg_matrix2DTranspose(T** mat, size_t arraySizeX, size_t arraySizeY);
-   //T** reg_matrix2DMultiply(T** mat1, size_t mat1X, size_t mat1Y, T** mat2, size_t mat2X, size_t mat2Y, bool transposeMat2);
-   float **p1t = reg_matrix2DTranspose<float>(points1, num_points, 3);
-   float **u = reg_matrix2DMultiply<float>(p1t,3, num_points, points2, num_points, 3, false);
+   //T** Matrix2dTranspose(T** mat, size_t arraySizeX, size_t arraySizeY);
+   //T** Matrix2dMultiply(T** mat1, size_t mat1X, size_t mat1Y, T** mat2, size_t mat2X, size_t mat2Y, bool transposeMat2);
+   float **p1t = Matrix2dTranspose<float>(points1, num_points, 3);
+   float **u = Matrix2dMultiply<float>(p1t,3, num_points, points2, num_points, 3, false);
 
-   svd(u, 3, 3, w, v);
+   Svd(u, 3, 3, w, v);
 
    // Calculate transpose
-   float **ut = reg_matrix2DTranspose<float>(u, 3, 3);
+   float **ut = Matrix2dTranspose<float>(u, 3, 3);
 
    // Calculate the rotation matrix
-   reg_matrix2DMultiply<float>(v, 3, 3, ut, 3, 3, r, false);
+   Matrix2dMultiply<float>(v, 3, 3, ut, 3, 3, r, false);
 
-   float det = reg_matrix2DDet<float>(r, 3, 3);
+   float det = Matrix2dDet<float>(r, 3, 3);
 
    // Take care of possible reflection
    if (det < 0) {
       v[0][2] = -v[0][2];
       v[1][2] = -v[1][2];
       v[2][2] = -v[2][2];
-      reg_matrix2DMultiply<float>(v, 3, 3, ut, 3, 3, r, false);
+      Matrix2dMultiply<float>(v, 3, 3, ut, 3, 3, r, false);
    }
 
    // Calculate the translation
@@ -407,19 +402,19 @@ void estimate_rigid_transformation3D(float** points1, float** points2, int num_p
    transformation->m[3][3] = 1.0f;
 
    // Do the deletion here
-   reg_matrix2DDeallocate(3, u);
-   reg_matrix1DDeallocate(w);
-   reg_matrix2DDeallocate(3, v);
-   reg_matrix2DDeallocate(3, ut);
-   reg_matrix2DDeallocate(3, r);
-   reg_matrix2DDeallocate(3, p1t);
+   Matrix2dDealloc(3, u);
+   Matrix1dDealloc(w);
+   Matrix2dDealloc(3, v);
+   Matrix2dDealloc(3, ut);
+   Matrix2dDealloc(3, r);
+   Matrix2dDealloc(3, p1t);
 }
 /* *************************************************************** */
 void estimate_rigid_transformation3D(std::vector<_reg_sorted_point3D> &points, mat44 * transformation)
 {
    unsigned num_points = points.size();
-   float** points1 = reg_matrix2DAllocate<float>(num_points, 3);
-   float** points2 = reg_matrix2DAllocate<float>(num_points, 3);
+   float** points1 = Matrix2dAlloc<float>(num_points, 3);
+   float** points2 = Matrix2dAlloc<float>(num_points, 3);
    for (unsigned i = 0; i < num_points; i++) {
       points1[i][0] = points[i].reference[0];
       points1[i][1] = points[i].reference[1];
@@ -430,8 +425,8 @@ void estimate_rigid_transformation3D(std::vector<_reg_sorted_point3D> &points, m
    }
    estimate_rigid_transformation3D(points1, points2, num_points, transformation);
    //FREE MEMORY
-   reg_matrix2DDeallocate(num_points, points1);
-   reg_matrix2DDeallocate(num_points, points2);
+   Matrix2dDealloc(num_points, points1);
+   Matrix2dDealloc(num_points, points2);
 }
 /* *************************************************************** */
 void estimate_affine_transformation2D(float** points1, float** points2, int num_points, mat44 * transformation)
@@ -439,7 +434,7 @@ void estimate_affine_transformation2D(float** points1, float** points2, int num_
    //We assume same number of points in both arrays
    int num_equations = num_points * 2;
    unsigned c = 0;
-   float** A = reg_matrix2DAllocate<float>(num_equations, 6);
+   float** A = Matrix2dAlloc<float>(num_equations, 6);
 
    for (int k = 0; k < num_points; ++k) {
       c = k * 2;
@@ -455,10 +450,10 @@ void estimate_affine_transformation2D(float** points1, float** points2, int num_
       A[c + 1][5] = 1.0f;
    }
 
-   float* w  = reg_matrix1DAllocate<float>(6);
-   float** v = reg_matrix2DAllocate<float>(6, 6);
+   float* w  = Matrix1dAlloc<float>(6);
+   float** v = Matrix2dAlloc<float>(6, 6);
 
-   svd(A, num_equations, 6, w, v);
+   Svd(A, num_equations, 6, w, v);
 
    for (unsigned k = 0; k < 6; ++k) {
       if (w[k] < 0.0001) {
@@ -479,19 +474,19 @@ void estimate_affine_transformation2D(float** points1, float** points2, int num_
       }
    }
 
-   float** r = reg_matrix2DAllocate<float>(6, num_equations);
-   reg_matrix2DMultiply<float>(v, 6, 6, A, num_equations, 6, r, true);
+   float** r = Matrix2dAlloc<float>(6, num_equations);
+   Matrix2dMultiply<float>(v, 6, 6, A, num_equations, 6, r, true);
    // Now r contains the pseudoinverse
    // Create vector b and then multiple r*b to get the affine paramsA
-   float* b = reg_matrix1DAllocate<float>(num_equations);
+   float* b = Matrix1dAlloc<float>(num_equations);
    for (int k = 0; k < num_points; ++k) {
       c = k * 2;
       b[c] = points2[k][0];
       b[c + 1] = points2[k][1];
    }
 
-   float* transform = reg_matrix1DAllocate<float>(6);
-   reg_matrix2DVectorMultiply<float>(r, 6, num_equations, b, transform);
+   float* transform = Matrix1dAlloc<float>(6);
+   Matrix2dVectorMultiply<float>(r, 6, num_equations, b, transform);
 
    transformation->m[0][0] = transform[0];
    transformation->m[0][1] = transform[1];
@@ -514,19 +509,19 @@ void estimate_affine_transformation2D(float** points1, float** points2, int num_
    transformation->m[3][3] = 1.0f;
 
    // Do the deletion here
-   reg_matrix1DDeallocate(transform);
-   reg_matrix1DDeallocate(b);
-   reg_matrix2DDeallocate(6, r);
-   reg_matrix2DDeallocate(6, v);
-   reg_matrix1DDeallocate(w);
-   reg_matrix2DDeallocate(num_equations, A);
+   Matrix1dDealloc(transform);
+   Matrix1dDealloc(b);
+   Matrix2dDealloc(6, r);
+   Matrix2dDealloc(6, v);
+   Matrix1dDealloc(w);
+   Matrix2dDealloc(num_equations, A);
 }
 /* *************************************************************** */
 void estimate_affine_transformation2D(std::vector<_reg_sorted_point2D> &points, mat44 * transformation)
 {
    unsigned num_points = points.size();
-   float** points1 = reg_matrix2DAllocate<float>(num_points, 2);
-   float** points2 = reg_matrix2DAllocate<float>(num_points, 2);
+   float** points1 = Matrix2dAlloc<float>(num_points, 2);
+   float** points2 = Matrix2dAlloc<float>(num_points, 2);
    for (unsigned i = 0; i < num_points; i++) {
       points1[i][0] = points[i].reference[0];
       points1[i][1] = points[i].reference[1];
@@ -535,8 +530,8 @@ void estimate_affine_transformation2D(std::vector<_reg_sorted_point2D> &points,
    }
    estimate_affine_transformation2D(points1, points2, num_points, transformation);
    //FREE MEMORY
-   reg_matrix2DDeallocate(num_points, points1);
-   reg_matrix2DDeallocate(num_points, points2);
+   Matrix2dDealloc(num_points, points1);
+   Matrix2dDealloc(num_points, points2);
 }
 /* *************************************************************** */
 // estimate an affine transformation using least square
@@ -548,7 +543,7 @@ void estimate_affine_transformation3D(float** points1, float** points2, int num_
    // we need at least 4 points. Assuming we have that here.
    int num_equations = num_points * 3;
    unsigned c = 0;
-   float** A = reg_matrix2DAllocate<float>(num_equations, 12);
+   float** A = Matrix2dAlloc<float>(num_equations, 12);
 
    for (int k = 0; k < num_points; ++k) {
       c = k * 3;
@@ -571,10 +566,10 @@ void estimate_affine_transformation3D(float** points1, float** points2, int num_
       A[c + 2][11] = 1.0f;
    }
 
-   float* w = reg_matrix1DAllocate<float>(12);
-   float** v = reg_matrix2DAllocate<float>(12, 12);
-   // Now we can compute our svd
-   svd(A, num_equations, 12, w, v);
+   float* w = Matrix1dAlloc<float>(12);
+   float** v = Matrix2dAlloc<float>(12, 12);
+   // Now we can compute our Svd
+   Svd(A, num_equations, 12, w, v);
 
    // First we make sure that the really small singular values
    // are set to 0. and compute the inverse by taking the reciprocal
@@ -600,11 +595,11 @@ void estimate_affine_transformation3D(float** points1, float** points2, int num_
 
    // Now multiply the matrices together
    // Pseudoinverse = v * w * A(transpose)
-   float** r = reg_matrix2DAllocate<float>(12, num_equations);
-   reg_matrix2DMultiply<float>(v, 12, 12, A, num_equations, 12, r, true);
+   float** r = Matrix2dAlloc<float>(12, num_equations);
+   Matrix2dMultiply<float>(v, 12, 12, A, num_equations, 12, r, true);
    // Now r contains the pseudoinverse
    // Create vector b and then multiple rb to get the affine paramsA
-   float* b = reg_matrix1DAllocate<float>(num_equations);
+   float* b = Matrix1dAlloc<float>(num_equations);
    for (int k = 0; k < num_points; ++k) {
       c = k * 3;
       b[c] = points2[k][0];
@@ -612,9 +607,9 @@ void estimate_affine_transformation3D(float** points1, float** points2, int num_
       b[c + 2] = points2[k][2];
    }
 
-   float * transform = reg_matrix1DAllocate<float>(12);
+   float * transform = Matrix1dAlloc<float>(12);
    //mul_matvec(r, 12, num_equations, b, transform);
-   reg_matrix2DVectorMultiply<float>(r, 12, num_equations, b, transform);
+   Matrix2dVectorMultiply<float>(r, 12, num_equations, b, transform);
 
    transformation->m[0][0] = transform[0];
    transformation->m[0][1] = transform[1];
@@ -637,20 +632,20 @@ void estimate_affine_transformation3D(float** points1, float** points2, int num_
    transformation->m[3][3] = 1.0f;
 
    // Do the deletion here
-   reg_matrix1DDeallocate(transform);
-   reg_matrix1DDeallocate(b);
-   reg_matrix2DDeallocate(12, r);
-   reg_matrix2DDeallocate(12, v);
-   reg_matrix1DDeallocate(w);
-   reg_matrix2DDeallocate(num_equations, A);
+   Matrix1dDealloc(transform);
+   Matrix1dDealloc(b);
+   Matrix2dDealloc(12, r);
+   Matrix2dDealloc(12, v);
+   Matrix1dDealloc(w);
+   Matrix2dDealloc(num_equations, A);
 }
 /* *************************************************************** */
 // estimate an affine transformation using least square
 void estimate_affine_transformation3D(std::vector<_reg_sorted_point3D> &points, mat44 * transformation)
 {
    unsigned num_points = points.size();
-   float** points1 = reg_matrix2DAllocate<float>(num_points, 3);
-   float** points2 = reg_matrix2DAllocate<float>(num_points, 3);
+   float** points1 = Matrix2dAlloc<float>(num_points, 3);
+   float** points2 = Matrix2dAlloc<float>(num_points, 3);
    for (unsigned i = 0; i < num_points; i++) {
       points1[i][0] = points[i].reference[0];
       points1[i][1] = points[i].reference[1];
@@ -661,8 +656,8 @@ void estimate_affine_transformation3D(std::vector<_reg_sorted_point3D> &points,
    }
    estimate_affine_transformation3D(points1, points2, num_points, transformation);
    //FREE MEMORY
-   reg_matrix2DDeallocate(num_points, points1);
-   reg_matrix2DDeallocate(num_points, points2);
+   Matrix2dDealloc(num_points, points1);
+   Matrix2dDealloc(num_points, points2);
 }
 /* *************************************************************** */
 ///LTS 2D
@@ -671,7 +666,7 @@ void optimize_2D(float* referencePosition, float* warpedPosition,
                  mat44 * final, bool affine) {
 
    // Set the current transformation to identity
-   reg_mat44_eye(final);
+   Mat44Eye(final);
 
    const unsigned num_points = activeBlockNumber;
    unsigned long num_equations = num_points * 2;
@@ -705,13 +700,11 @@ void optimize_2D(float* referencePosition, float* warpedPosition,
    {
       // Transform the points in the reference
       for (unsigned j = 0; j < num_points * 2; j += 2)
-      {
-         reg_mat33_mul(final, &referencePosition[j], &newWarpedPosition[j]);
-      }
+         Mat33Mul(*final, reinterpret_cast<float(&)[2]>(referencePosition[j]), reinterpret_cast<float(&)[2]>(newWarpedPosition[j]));
       queue = std::multimap<double, _reg_sorted_point2D>();
       for (unsigned j = 0; j < num_points * 2; j += 2)
       {
-         distance = get_square_distance2D(&newWarpedPosition[j], &warpedPosition[j]);
+         distance = SquareDistance2d(&newWarpedPosition[j], &warpedPosition[j]);
          queue.insert(std::pair<double, _reg_sorted_point2D>(distance,
                                                              _reg_sorted_point2D(&referencePosition[j], &warpedPosition[j], distance)));
       }
@@ -754,7 +747,7 @@ void optimize_3D(float *referencePosition, float *warpedPosition,
                  mat44 *final, bool affine) {
 
    // Set the current transformation to identity
-   reg_mat44_eye(final);
+   Mat44Eye(final);
 
    const unsigned num_points = activeBlockNumber;
    unsigned long num_equations = num_points * 3;
@@ -785,13 +778,12 @@ void optimize_3D(float *referencePosition, float *warpedPosition,
    for (int count = 0; count < max_iter; ++count)
    {
       // Transform the points in the reference
-      for (unsigned j = 0; j < num_points * 3; j+=3) {
-         reg_mat44_mul(final, &referencePosition[j], &newWarpedPosition[j]);
-      }
+      for (unsigned j = 0; j < num_points * 3; j+=3)
+         Mat44Mul(*final, reinterpret_cast<float(&)[3]>(referencePosition[j]), reinterpret_cast<float(&)[3]>(newWarpedPosition[j]));
       queue = std::multimap<double, _reg_sorted_point3D>();
       for (unsigned j = 0; j < num_points * 3; j+= 3)
       {
-         distance = get_square_distance3D(&newWarpedPosition[j], &warpedPosition[j]);
+         distance = SquareDistance3d(&newWarpedPosition[j], &warpedPosition[j]);
          queue.insert(std::pair<double,
                       _reg_sorted_point3D>(distance,
                                            _reg_sorted_point3D(&referencePosition[j],
@@ -823,6 +815,6 @@ void optimize_3D(float *referencePosition, float *warpedPosition,
          estimate_rigid_transformation3D(top_points, final);
       }
    }
-   delete [] newWarpedPosition;
+   delete[] newWarpedPosition;
 }
 /* *************************************************************** */
diff --git a/reg-lib/cpu/_reg_localTrans.cpp b/reg-lib/cpu/_reg_localTrans.cpp
index c3e17149..d070bee1 100755
--- a/reg-lib/cpu/_reg_localTrans.cpp
+++ b/reg-lib/cpu/_reg_localTrans.cpp
@@ -11,7 +11,6 @@
  */
 
 #include "_reg_localTrans.h"
-#include "_reg_maths_eigen.h"
 
 // Due to SSE usage creates incorrect test results
 #if defined(BUILD_TESTS) && !defined(NDEBUG)
@@ -82,7 +81,7 @@ void reg_createControlPointGrid(NiftiImage& controlPointGridImage,
     originIndex[1] = -1.0f;
     originIndex[2] = 0.0f;
     if (referenceImage->nz > 1) originIndex[2] = -1.0f;
-    reg_mat44_mul(&controlPointGridImage->qto_xyz, originIndex, originReal);
+    Mat44Mul(controlPointGridImage->qto_xyz, originIndex, originReal);
     controlPointGridImage->qto_xyz.m[0][3] = controlPointGridImage->qoffset_x = originReal[0];
     controlPointGridImage->qto_xyz.m[1][3] = controlPointGridImage->qoffset_y = originReal[1];
     controlPointGridImage->qto_xyz.m[2][3] = controlPointGridImage->qoffset_z = originReal[2];
@@ -114,7 +113,7 @@ void reg_createControlPointGrid(NiftiImage& controlPointGridImage,
         controlPointGridImage->sto_xyz.m[3][3] = referenceImage->sto_xyz.m[3][3];
 
         // Origin is shifted from 1 control point in the sform
-        reg_mat44_mul(&controlPointGridImage->sto_xyz, originIndex, originReal);
+        Mat44Mul(controlPointGridImage->sto_xyz, originIndex, originReal);
         controlPointGridImage->sto_xyz.m[0][3] = originReal[0];
         controlPointGridImage->sto_xyz.m[1][3] = originReal[1];
         controlPointGridImage->sto_xyz.m[2][3] = originReal[2];
@@ -152,24 +151,24 @@ void reg_createSymmetricControlPointGrids(NiftiImage& forwardGridImage,
     mat44 halfForwardAffine, halfBackwardAffine;
     if (forwardAffineTrans != nullptr) {
         // Compute half of the affine transformation - ref to flo
-        halfForwardAffine = reg_mat44_logm(forwardAffineTrans);
-        halfForwardAffine = reg_mat44_mul(&halfForwardAffine, .5f);
-        halfForwardAffine = reg_mat44_expm(&halfForwardAffine);
+        halfForwardAffine = Mat44Logm(forwardAffineTrans);
+        halfForwardAffine = halfForwardAffine * 0.5f;
+        halfForwardAffine = Mat44Expm(&halfForwardAffine);
         // Compute half of the affine transformation - flo to ref
         // Note that this is done twice for symmetry consideration
         halfBackwardAffine = nifti_mat44_inverse(*forwardAffineTrans);
-        halfBackwardAffine = reg_mat44_logm(&halfBackwardAffine);
-        halfBackwardAffine = reg_mat44_mul(&halfBackwardAffine, .5f);
-        halfBackwardAffine = reg_mat44_expm(&halfBackwardAffine);
+        halfBackwardAffine = Mat44Logm(&halfBackwardAffine);
+        halfBackwardAffine = halfBackwardAffine * 0.5f;
+        halfBackwardAffine = Mat44Expm(&halfBackwardAffine);
         NR_WARN("Note that the symmetry of the registration is affected by the input affine transformation");
     } else {
-        reg_mat44_eye(&halfForwardAffine);
-        reg_mat44_eye(&halfBackwardAffine);
+        Mat44Eye(&halfForwardAffine);
+        Mat44Eye(&halfBackwardAffine);
     }
 
     // Update the reference and floating transformation to propagate to a mid space
-    referenceImageSpace = reg_mat44_mul(&halfForwardAffine, &referenceImageSpace);
-    floatingImageSpace = reg_mat44_mul(&halfBackwardAffine, &floatingImageSpace);
+    referenceImageSpace = halfForwardAffine * referenceImageSpace;
+    floatingImageSpace = halfBackwardAffine * floatingImageSpace;
 
     // Define the largest field of view in the mid space
     float minPosition[3] = { 0, 0, 0 }, maxPosition[3] = { 0, 0, 0 };
@@ -197,11 +196,11 @@ void reg_createSymmetricControlPointGrids(NiftiImage& forwardGridImage,
         };
         float out[3];
         for (int c = 0; c < 8; ++c) {
-            reg_mat44_mul(&referenceImageSpace, referenceImageCorners[c], out);
+            Mat44Mul(referenceImageSpace, referenceImageCorners[c], out);
             referenceImageCorners[c][0] = out[0];
             referenceImageCorners[c][1] = out[1];
             referenceImageCorners[c][2] = out[2];
-            reg_mat44_mul(&floatingImageSpace, floatingImageCorners[c], out);
+            Mat44Mul(floatingImageSpace, floatingImageCorners[c], out);
             floatingImageCorners[c][0] = out[0];
             floatingImageCorners[c][1] = out[1];
             floatingImageCorners[c][2] = out[2];
@@ -299,10 +298,10 @@ void reg_createSymmetricControlPointGrids(NiftiImage& forwardGridImage,
     // Set the control point grid image orientation
     forwardGridImage->qform_code = backwardGridImage->qform_code = 0;
     forwardGridImage->sform_code = backwardGridImage->sform_code = 1;
-    reg_mat44_eye(&forwardGridImage->sto_xyz);
-    reg_mat44_eye(&backwardGridImage->sto_xyz);
-    reg_mat44_eye(&forwardGridImage->sto_ijk);
-    reg_mat44_eye(&backwardGridImage->sto_ijk);
+    Mat44Eye(&forwardGridImage->sto_xyz);
+    Mat44Eye(&backwardGridImage->sto_xyz);
+    Mat44Eye(&forwardGridImage->sto_ijk);
+    Mat44Eye(&backwardGridImage->sto_ijk);
     for (unsigned i = 0; i < 3; ++i) {
         if (referenceImage->nz > 1 || i < 2) {
             forwardGridImage->sto_xyz.m[i][i] = backwardGridImage->sto_xyz.m[i][i] = spacing[i];
@@ -320,7 +319,7 @@ void reg_createSymmetricControlPointGrids(NiftiImage& forwardGridImage,
     forwardGridImage->intent_p1 = backwardGridImage->intent_p1 = CUB_SPLINE_GRID;
     // Set the affine matrices
     mat44 identity;
-    reg_mat44_eye(&identity);
+    Mat44Eye(&identity);
     if (forwardGridImage->ext_list != nullptr)
         free(forwardGridImage->ext_list);
     if (backwardGridImage->ext_list != nullptr)
@@ -1570,28 +1569,28 @@ void reg_voxelCentricToNodeCentric(nifti_image *nodeImage,
         if (nodeImage->ext_list[0].edata != nullptr) {
             mat44 temp = *(reinterpret_cast<mat44*>(nodeImage->ext_list[0].edata));
             temp = nifti_mat44_inverse(temp);
-            transformation = reg_mat44_mul(&temp, &transformation);
+            transformation = temp * transformation;
         }
     }
     // millimetre to voxel in the reference image
     if (voxelImage->sform_code > 0)
-        transformation = reg_mat44_mul(&voxelImage->sto_ijk, &transformation);
-    else transformation = reg_mat44_mul(&voxelImage->qto_ijk, &transformation);
+        transformation = voxelImage->sto_ijk * transformation;
+    else transformation = voxelImage->qto_ijk * transformation;
 
     // The information has to be reoriented
     mat33 reorientation;
     // Voxel to millimetre contains the orientation of the image that is used
     // to compute the spatial gradient (floating image)
     if (voxelToMillimetre != nullptr) {
-        reorientation = reg_mat44_to_mat33(voxelToMillimetre);
+        reorientation = Mat44ToMat33(voxelToMillimetre);
         if (nodeImage->num_ext > 0) {
             if (nodeImage->ext_list[0].edata != nullptr) {
-                mat33 temp = reg_mat44_to_mat33(reinterpret_cast<mat44*>(nodeImage->ext_list[0].edata));
+                mat33 temp = Mat44ToMat33(reinterpret_cast<mat44*>(nodeImage->ext_list[0].edata));
                 temp = nifti_mat33_inverse(temp);
                 reorientation = nifti_mat33_mul(temp, reorientation);
             }
         }
-    } else reg_mat33_eye(&reorientation);
+    } else Mat33Eye(&reorientation);
     // The information has to be weighted
     float ratio[3] = { nodeImage->dx, nodeImage->dy, nodeImage->dz };
     for (int i = 0; i < (nodeImage->nz > 1 ? 3 : 2); ++i) {
@@ -1611,7 +1610,7 @@ void reg_voxelCentricToNodeCentric(nifti_image *nodeImage,
             nodeCoord[1] = static_cast<float>(y);
             for (int x = 0; x < nodeImage->nx; x++) {
                 nodeCoord[0] = static_cast<float>(x);
-                reg_mat44_mul(&transformation, nodeCoord, voxelCoord);
+                Mat44Mul(transformation, nodeCoord, voxelCoord);
                 // linear interpolation is performed
                 DataType basisX[2], basisY[2], basisZ[2] = { 0, 0 };
                 int pre[3] = {
@@ -2173,7 +2172,7 @@ void reg_spline_refineControlPointGrid(nifti_image *controlPointGrid,
         originIndex[1] = -1.0f;
         originIndex[2] = 0.0f;
         if (referenceImage->nz > 1) originIndex[2] = -1.0f;
-        reg_mat44_mul(&(controlPointGrid->qto_xyz), originIndex, originReal);
+        Mat44Mul(controlPointGrid->qto_xyz, originIndex, originReal);
         if (controlPointGrid->qform_code == 0 && controlPointGrid->sform_code == 0)
             controlPointGrid->qform_code = 1;
         controlPointGrid->qto_xyz.m[0][3] = controlPointGrid->qoffset_x = originReal[0];
@@ -2211,7 +2210,7 @@ void reg_spline_refineControlPointGrid(nifti_image *controlPointGrid,
             float originIndex[3];
             originIndex[0] = originIndex[1] = originIndex[2] = -1;
             if (referenceImage->nz <= 1) originIndex[2] = 0;
-            reg_mat44_mul(&(controlPointGrid->sto_xyz), originIndex, originReal);
+            Mat44Mul(controlPointGrid->sto_xyz, originIndex, originReal);
             controlPointGrid->sto_xyz.m[0][3] = originReal[0];
             controlPointGrid->sto_xyz.m[1][3] = originReal[1];
             controlPointGrid->sto_xyz.m[2][3] = originReal[2];
@@ -2228,7 +2227,7 @@ void reg_spline_refineControlPointGrid(nifti_image *controlPointGrid,
         // The origin is shifted by one node when compared to the previous origin
         float nodeCoord[3] = { 1, 1, 1 };
         float newOrigin[3];
-        reg_mat44_mul(&controlPointGrid->sto_xyz, nodeCoord, newOrigin);
+        Mat44Mul(controlPointGrid->sto_xyz, nodeCoord, newOrigin);
         controlPointGrid->sto_xyz.m[0][3] = newOrigin[0];
         controlPointGrid->sto_xyz.m[1][3] = newOrigin[1];
         if (controlPointGrid->nz > 1)
@@ -2398,7 +2397,6 @@ void reg_defField_compose3D(const nifti_image *deformationField,
                 df_real2Voxel.m[2][1] * realDef[1] +
                 df_real2Voxel.m[2][2] * realDef[2] +
                 df_real2Voxel.m[2][3];
-            //reg_mat44_mul(df_real2Voxel, realDef, voxel);
 
             // Linear interpolation to compute the new deformation
             pre[0] = Floor(voxel[0]);
@@ -2501,7 +2499,7 @@ inline static int FastWarp(double x, double y, double z, nifti_image *deformatio
     FieldTYPE *wpz;
     int   xw, yw, zw, dxw, dyw, dxyw, dxyzw;
     double wxf, wyf, wzf, wyzf;
-    double world[4], position[4];
+    double world[3], position[3];
 
     FieldTYPE *warpdata = static_cast<FieldTYPE*>(deformationField->data);
 
@@ -2527,8 +2525,7 @@ inline static int FastWarp(double x, double y, double z, nifti_image *deformatio
     world[0] = x;
     world[1] = y;
     world[2] = z;
-    world[3] = 1;
-    reg_mat44_mul(deformationFieldIJKMatrix, world, position);
+    Mat44Mul(*deformationFieldIJKMatrix, world, position);
     x = position[0];
     y = position[1];
     z = position[2];
@@ -2947,13 +2944,12 @@ void reg_defFieldInvert3D(nifti_image *inputDeformationField,
     if (inputDeformationField->sform_code > 0)
         InXYZMatrix = &inputDeformationField->sto_xyz;
     else InXYZMatrix = &inputDeformationField->qto_xyz;
-    float center[4], center2[4];
-    double centerout[4], delta[4];
+    float center[3], center2[3];
+    double centerout[3], delta[3];
     center[0] = static_cast<float>(inputDeformationField->nx / 2);
     center[1] = static_cast<float>(inputDeformationField->ny / 2);
     center[2] = static_cast<float>(inputDeformationField->nz / 2);
-    center[3] = 1;
-    reg_mat44_mul(InXYZMatrix, center, center2);
+    Mat44Mul(*InXYZMatrix, center, center2);
     FastWarp<float>(center2[0], center2[1], center2[2], inputDeformationField, &centerout[0], &centerout[1], &centerout[2]);
     delta[0] = center2[0] - centerout[0];
     delta[1] = center2[1] - centerout[1];
@@ -2962,7 +2958,7 @@ void reg_defFieldInvert3D(nifti_image *inputDeformationField,
 
 
     int i, x, y, z;
-    double position[4], pars[4], arrayy[4][3];
+    double position[3], pars[3], arrayy[4][3];
     struct ddata dat;
     DataType *outData;
 #ifdef _OPENMP
@@ -2986,8 +2982,7 @@ void reg_defFieldInvert3D(nifti_image *inputDeformationField,
                 position[0] = x;
                 position[1] = y;
                 position[2] = z;
-                position[3] = 1;
-                reg_mat44_mul(OutXYZMatrix, position, pars);
+                Mat44Mul(*OutXYZMatrix, position, pars);
                 dat.gx = pars[0];
                 dat.gy = pars[1];
                 dat.gz = pars[2];
diff --git a/reg-lib/cpu/_reg_localTrans_jac.cpp b/reg-lib/cpu/_reg_localTrans_jac.cpp
index 75c0b6ee..303057cb 100755
--- a/reg-lib/cpu/_reg_localTrans_jac.cpp
+++ b/reg-lib/cpu/_reg_localTrans_jac.cpp
@@ -74,8 +74,8 @@ void reg_linear_spline_jacobian3D(nifti_image *splineControlPoint,
    // Define a matrix to reorient the Jacobian matrices and normalise them by the grid spacing
    mat33 reorientation,jacobianMatrix;
    if(splineControlPoint->sform_code>0)
-      reorientation = reg_mat44_to_mat33(&splineControlPoint->sto_ijk);
-   else reorientation = reg_mat44_to_mat33(&splineControlPoint->qto_ijk);
+      reorientation = Mat44ToMat33(&splineControlPoint->sto_ijk);
+   else reorientation = Mat44ToMat33(&splineControlPoint->qto_ijk);
 
    // Useful variables
    int x, y, z;
@@ -143,13 +143,11 @@ void reg_linear_spline_jacobian3D(nifti_image *splineControlPoint,
          else transformation=referenceImage->qto_xyz;
          // affine: mm to mm
          if(splineControlPoint->num_ext>0)
-            transformation=reg_mat44_mul(
-                     reinterpret_cast<mat44 *>(splineControlPoint->ext_list[0].edata),
-                  &transformation);
+            transformation=reinterpret_cast<mat44&>(*splineControlPoint->ext_list[0].edata) * transformation;
          // grid: mm to voxel
          if(splineControlPoint->sform_code>0)
-            transformation=reg_mat44_mul(&(splineControlPoint->sto_ijk), &transformation);
-         else transformation=reg_mat44_mul(&(splineControlPoint->qto_ijk), &transformation);
+            transformation=splineControlPoint->sto_ijk * transformation;
+         else transformation=splineControlPoint->qto_ijk * transformation;
 
          float imageCoord[3], gridCoord[3];
          for(z=0; z<referenceImage->nz; z++)
@@ -163,7 +161,7 @@ void reg_linear_spline_jacobian3D(nifti_image *splineControlPoint,
                {
                   imageCoord[0]=x;
                   // Compute the position in the grid
-                  reg_mat44_mul(&transformation,imageCoord,gridCoord);
+                  Mat44Mul(transformation,imageCoord,gridCoord);
                   // Compute the anterior node coord
                   pre[0]=Floor(gridCoord[0]);
                   pre[1]=Floor(gridCoord[1]);
@@ -265,8 +263,8 @@ void reg_cubic_spline_jacobian2D(nifti_image *splineControlPoint,
    // Define a matrice to reorient the Jacobian matrices and normalise them by the grid spacing
    mat33 reorientation,jacobianMatrix;
    if(splineControlPoint->sform_code>0)
-      reorientation = reg_mat44_to_mat33(&splineControlPoint->sto_ijk);
-   else reorientation = reg_mat44_to_mat33(&splineControlPoint->qto_ijk);
+      reorientation = Mat44ToMat33(&splineControlPoint->sto_ijk);
+   else reorientation = Mat44ToMat33(&splineControlPoint->qto_ijk);
 
    // Useful variables
    int x, y, incr0;
@@ -361,13 +359,11 @@ void reg_cubic_spline_jacobian2D(nifti_image *splineControlPoint,
          else transformation=referenceImage->qto_xyz;
          // affine: mm to mm
          if(splineControlPoint->num_ext>0)
-            transformation=reg_mat44_mul(
-                     reinterpret_cast<mat44 *>(splineControlPoint->ext_list[0].edata),
-                  &transformation);
+            transformation=reinterpret_cast<mat44&>(*splineControlPoint->ext_list[0].edata) * transformation;
          // grid: mm to voxel
          if(splineControlPoint->sform_code>0)
-            transformation=reg_mat44_mul(&(splineControlPoint->sto_ijk), &transformation);
-         else transformation=reg_mat44_mul(&(splineControlPoint->qto_ijk), &transformation);
+            transformation=splineControlPoint->sto_ijk * transformation;
+         else transformation=splineControlPoint->qto_ijk * transformation;
 
          float imageCoord[3], gridCoord[3], basis;
          imageCoord[2]=0;
@@ -380,7 +376,7 @@ void reg_cubic_spline_jacobian2D(nifti_image *splineControlPoint,
             {
                imageCoord[0]=x;
                // Compute the position in the grid
-               reg_mat44_mul(&transformation,imageCoord,gridCoord);
+               Mat44Mul(transformation,imageCoord,gridCoord);
                // Compute the anterior node coord
                pre[0]=Floor(gridCoord[0]);
                pre[1]=Floor(gridCoord[1]);
@@ -539,8 +535,8 @@ void reg_cubic_spline_jacobian3D(nifti_image *splineControlPoint,
    // Define a matrice to reorient the Jacobian matrices and normalise them by the grid spacing
    mat33 reorientation,jacobianMatrix;
    if(splineControlPoint->sform_code>0)
-      reorientation = reg_mat44_to_mat33(&splineControlPoint->sto_ijk);
-   else reorientation = reg_mat44_to_mat33(&splineControlPoint->qto_ijk);
+      reorientation = Mat44ToMat33(&splineControlPoint->sto_ijk);
+   else reorientation = Mat44ToMat33(&splineControlPoint->qto_ijk);
 
    // Useful variables
    int x, y, z, incr0;
@@ -771,13 +767,11 @@ void reg_cubic_spline_jacobian3D(nifti_image *splineControlPoint,
          else transformation=referenceImage->qto_xyz;
          // affine: mm to mm
          if(splineControlPoint->num_ext>0)
-            transformation=reg_mat44_mul(
-                     reinterpret_cast<mat44 *>(splineControlPoint->ext_list[0].edata),
-                  &transformation);
+            transformation=reinterpret_cast<mat44&>(*splineControlPoint->ext_list[0].edata) * transformation;
          // grid: mm to voxel
          if(splineControlPoint->sform_code>0)
-            transformation=reg_mat44_mul(&(splineControlPoint->sto_ijk), &transformation);
-         else transformation=reg_mat44_mul(&(splineControlPoint->qto_ijk), &transformation);
+            transformation=splineControlPoint->sto_ijk * transformation;
+         else transformation=splineControlPoint->qto_ijk * transformation;
 
          float imageCoord[3], gridCoord[3], basis;
          for(z=0; z<referenceImage->nz; z++)
@@ -792,7 +786,7 @@ void reg_cubic_spline_jacobian3D(nifti_image *splineControlPoint,
                {
                   imageCoord[0]=x;
                   // Compute the position in the grid
-                  reg_mat44_mul(&transformation,imageCoord,gridCoord);
+                  Mat44Mul(transformation,imageCoord,gridCoord);
                   // Compute the anterior node coord
                   pre[0]=Floor(gridCoord[0]);
                   pre[1]=Floor(gridCoord[1]);
@@ -1342,8 +1336,8 @@ void reg_spline_jacobianDetGradient2D(nifti_image *splineControlPoint,
    // Matrices to be used to convert the gradient from voxel to mm
    mat33 jacobianMatrix, reorientation;
    if(splineControlPoint->sform_code>0)
-      reorientation = reg_mat44_to_mat33(&splineControlPoint->sto_xyz);
-   else reorientation = reg_mat44_to_mat33(&splineControlPoint->qto_xyz);
+      reorientation = Mat44ToMat33(&splineControlPoint->sto_xyz);
+   else reorientation = Mat44ToMat33(&splineControlPoint->qto_xyz);
 
    // Ratio to be used for normalisation
    size_t jacobianNumber;
@@ -1580,8 +1574,8 @@ void reg_spline_jacobianDetGradient3D(nifti_image *splineControlPoint,
    // Matrices to be used to convert the gradient from voxel to mm
    mat33 jacobianMatrix, reorientation;
    if(splineControlPoint->sform_code>0)
-      reorientation = reg_mat44_to_mat33(&splineControlPoint->sto_xyz);
-   else reorientation = reg_mat44_to_mat33(&splineControlPoint->qto_xyz);
+      reorientation = Mat44ToMat33(&splineControlPoint->sto_xyz);
+   else reorientation = Mat44ToMat33(&splineControlPoint->qto_xyz);
 
    // Ratio to be used for normalisation
    size_t jacobianNumber;
@@ -1946,8 +1940,8 @@ double reg_spline_correctFolding2D(nifti_image *splineControlPoint,
 
    mat33 jacobianMatrix, reorientation;
    if(splineControlPoint->sform_code>0)
-      reorientation = reg_mat44_to_mat33(&splineControlPoint->sto_xyz);
-   else reorientation = reg_mat44_to_mat33(&splineControlPoint->qto_xyz);
+      reorientation = Mat44ToMat33(&splineControlPoint->sto_xyz);
+   else reorientation = Mat44ToMat33(&splineControlPoint->qto_xyz);
 
    const size_t nodeNumber = NiftiImage::calcVoxelNumber(splineControlPoint, 3);
    DataType *controlPointPtrX = static_cast<DataType *>(splineControlPoint->data);
@@ -2195,8 +2189,8 @@ double reg_spline_correctFolding3D(nifti_image *splineControlPoint,
 
    mat33 jacobianMatrix, reorientation;
    if(splineControlPoint->sform_code>0)
-      reorientation = reg_mat44_to_mat33(&splineControlPoint->sto_xyz);
-   else reorientation = reg_mat44_to_mat33(&splineControlPoint->qto_xyz);
+      reorientation = Mat44ToMat33(&splineControlPoint->sto_xyz);
+   else reorientation = Mat44ToMat33(&splineControlPoint->qto_xyz);
 
    const size_t nodeNumber = NiftiImage::calcVoxelNumber(splineControlPoint, 3);
    DataType *controlPointPtrX = static_cast<DataType *>(splineControlPoint->data);
@@ -2629,13 +2623,13 @@ void reg_defField_getJacobianMap2D(nifti_image *deformationField,
    if(deformationField->sform_code>0)
    {
       reg_getRealImageSpacing(deformationField,spacing);
-      reorientation=nifti_mat33_inverse(nifti_mat33_polar(reg_mat44_to_mat33(&deformationField->sto_xyz)));
+      reorientation=nifti_mat33_inverse(nifti_mat33_polar(Mat44ToMat33(&deformationField->sto_xyz)));
    }
    else
    {
       spacing[0]=deformationField->dx;
       spacing[1]=deformationField->dy;
-      reorientation=nifti_mat33_inverse(nifti_mat33_polar(reg_mat44_to_mat33(&deformationField->qto_xyz)));
+      reorientation=nifti_mat33_inverse(nifti_mat33_polar(Mat44ToMat33(&deformationField->qto_xyz)));
    }
 
    DataType *deformationPtrX = static_cast<DataType *>(deformationField->data);
@@ -2738,14 +2732,14 @@ void reg_defField_getJacobianMap3D(nifti_image *deformationField,
    if(deformationField->sform_code>0)
    {
       reg_getRealImageSpacing(deformationField,spacing);
-      reorientation=nifti_mat33_inverse(nifti_mat33_polar(reg_mat44_to_mat33(&deformationField->sto_xyz)));
+      reorientation=nifti_mat33_inverse(nifti_mat33_polar(Mat44ToMat33(&deformationField->sto_xyz)));
    }
    else
    {
       spacing[0]=deformationField->dx;
       spacing[1]=deformationField->dy;
       spacing[2]=deformationField->dz;
-      reorientation=nifti_mat33_inverse(nifti_mat33_polar(reg_mat44_to_mat33(&deformationField->qto_xyz)));
+      reorientation=nifti_mat33_inverse(nifti_mat33_polar(Mat44ToMat33(&deformationField->qto_xyz)));
    }
 
    DataType *deformationPtrX = static_cast<DataType *>(deformationField->data);
@@ -2943,11 +2937,11 @@ void reg_defField_GetJacobianMatFromFlowField_core(mat33* jacobianMatrices,
 
    // The Jacobian matrices are initialised with identity or the initial affine
    mat33 affineMatrix;
-   reg_mat33_eye(&affineMatrix);
+   Mat33Eye(&affineMatrix);
    if(flowFieldImage->num_ext>0)
    {
       if(flowFieldImage->ext_list[0].edata!=nullptr)
-         affineMatrix = reg_mat44_to_mat33(reinterpret_cast<mat44 *>(flowFieldImage->ext_list[0].edata));
+         affineMatrix = Mat44ToMat33(reinterpret_cast<mat44 *>(flowFieldImage->ext_list[0].edata));
       else NR_FATAL_ERROR("The affine matrix is expected to be stored in the flow field");
    }
    const size_t voxelNumber = NiftiImage::calcVoxelNumber(flowFieldImage, 3);
@@ -2982,7 +2976,7 @@ void reg_defField_GetJacobianMatFromFlowField_core(mat33* jacobianMatrices,
    if(flowFieldImage->num_ext>1)
    {
       if(flowFieldImage->ext_list[1].edata!=nullptr)
-         affineMatrix = reg_mat44_to_mat33(reinterpret_cast<mat44 *>(flowFieldImage->ext_list[1].edata));
+         affineMatrix = Mat44ToMat33(reinterpret_cast<mat44 *>(flowFieldImage->ext_list[1].edata));
       else NR_FATAL_ERROR("The affine matrix is expected to be stored in the flow field");
       for(size_t i=0; i<voxelNumber; ++i)
          jacobianMatrices[i]=nifti_mat33_mul(affineMatrix,jacobianMatrices[i]);
diff --git a/reg-lib/cpu/_reg_localTrans_regul.cpp b/reg-lib/cpu/_reg_localTrans_regul.cpp
index 722add4e..907b3f3b 100755
--- a/reg-lib/cpu/_reg_localTrans_regul.cpp
+++ b/reg-lib/cpu/_reg_localTrans_regul.cpp
@@ -506,8 +506,8 @@ double reg_spline_approxLinearEnergyValue2D(const nifti_image *splineControlPoin
     // Matrix to use to convert the gradient from mm to voxel
     mat33 reorientation;
     if (splineControlPoint->sform_code > 0)
-        reorientation = reg_mat44_to_mat33(&splineControlPoint->sto_ijk);
-    else reorientation = reg_mat44_to_mat33(&splineControlPoint->qto_ijk);
+        reorientation = Mat44ToMat33(&splineControlPoint->sto_ijk);
+    else reorientation = Mat44ToMat33(&splineControlPoint->qto_ijk);
 
 #ifdef _OPENMP
 #pragma omp parallel for default(none) \
@@ -583,8 +583,8 @@ double reg_spline_approxLinearEnergyValue3D(const nifti_image *splineControlPoin
     // Matrix to use to convert the gradient from mm to voxel
     mat33 reorientation;
     if (splineControlPoint->sform_code > 0)
-        reorientation = reg_mat44_to_mat33(&splineControlPoint->sto_ijk);
-    else reorientation = reg_mat44_to_mat33(&splineControlPoint->qto_ijk);
+        reorientation = Mat44ToMat33(&splineControlPoint->sto_ijk);
+    else reorientation = Mat44ToMat33(&splineControlPoint->qto_ijk);
 
 #ifdef _OPENMP
 #pragma omp parallel for default(none) \
@@ -688,7 +688,7 @@ void reg_spline_approxLinearEnergyGradient2D(const nifti_image *splineControlPoi
     set_first_order_basis_values(basisX, basisY);
 
     // Matrix to use to convert the gradient from mm to voxel
-    const mat33 reorientation = reg_mat44_to_mat33(splineControlPoint->sform_code > 0 ? &splineControlPoint->sto_ijk : &splineControlPoint->qto_ijk);
+    const mat33 reorientation = Mat44ToMat33(splineControlPoint->sform_code > 0 ? &splineControlPoint->sto_ijk : &splineControlPoint->qto_ijk);
     const mat33 invReorientation = nifti_mat33_inverse(reorientation);
 
     const DataType approxRatio = weight / static_cast<DataType>(nodeNumber);
@@ -756,7 +756,7 @@ void reg_spline_approxLinearEnergyGradient3D(const nifti_image *splineControlPoi
     set_first_order_basis_values(basisX, basisY, basisZ);
 
     // Matrix to use to convert the gradient from mm to voxel
-    const mat33 reorientation = reg_mat44_to_mat33(splineControlPoint->sform_code > 0 ? &splineControlPoint->sto_ijk : &splineControlPoint->qto_ijk);
+    const mat33 reorientation = Mat44ToMat33(splineControlPoint->sform_code > 0 ? &splineControlPoint->sto_ijk : &splineControlPoint->qto_ijk);
     const mat33 invReorientation = nifti_mat33_inverse(reorientation);
 
     const DataType approxRatio = weight / static_cast<DataType>(nodeNumber);
@@ -863,9 +863,9 @@ double reg_spline_getLandmarkDistance_core(const nifti_image *controlPointImage,
     const size_t controlPointNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3);
     double constraintValue = 0;
     size_t l, index;
-    float refPosition[4];
-    float defPosition[4];
-    float floPosition[4];
+    float refPosition[3];
+    float defPosition[3];
+    float floPosition[3];
     int previous[3], a, b, c;
     DataType basisX[4], basisY[4], basisZ[4], basis;
     const mat44 *gridRealToVox = &(controlPointImage->qto_ijk);
@@ -888,9 +888,8 @@ double reg_spline_getLandmarkDistance_core(const nifti_image *controlPointImage,
             refPosition[2] = landmarkReference[l * imageDim + 2];
             floPosition[2] = landmarkFloating[l * imageDim + 2];
         } else refPosition[2] = floPosition[2] = 0;
-        refPosition[3] = floPosition[3] = 1;
         // Convert the reference position to voxel in the control point grid space
-        reg_mat44_mul(gridRealToVox, refPosition, defPosition);
+        Mat44Mul(*gridRealToVox, refPosition, defPosition);
 
         // Extract the corresponding nodes
         previous[0] = Floor(defPosition[0]) - 1;
@@ -1003,7 +1002,7 @@ void reg_spline_getLandmarkDistanceGradient_core(const nifti_image *controlPoint
             floPosition[2] = landmarkFloating[l * imageDim + 2];
         } else refPosition[2] = floPosition[2] = 0;
         // Convert the reference position to voxel in the control point grid space
-        reg_mat44_mul(gridRealToVox, refPosition, defPosition);
+        Mat44Mul(*gridRealToVox, refPosition, defPosition);
         if (imageDim == 2) defPosition[2] = 0;
         // Extract the corresponding nodes
         previous[0] = Floor(defPosition[0]) - 1;
diff --git a/reg-lib/cpu/_reg_maths.h b/reg-lib/cpu/_reg_maths.h
deleted file mode 100644
index 42c0cddd..00000000
--- a/reg-lib/cpu/_reg_maths.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/**
- * @file _reg_maths.h
- * @brief Library that contains small math routines
- * @author Marc Modat
- * @date 25/03/2009
- *
- *  Created by Marc Modat on 25/03/2009.
- *  Copyright (c) 2009-2018, University College London
- *  Copyright (c) 2018, NiftyReg Developers.
- *  All rights reserved.
- *  See the LICENSE.txt file in the nifty_reg root folder
- *
- */
-
-#pragma once
-
-#include "RNifti.h"
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#if USE_SSE
-#include <emmintrin.h>
-#include <xmmintrin.h>
-#ifdef __SSE3__
-#include <pmmintrin.h>
-#endif
-#endif
-
-#define _USE_MATH_DEFINES
-#include <math.h>
-
-#ifdef __CUDACC__
-#define DEVICE  __host__ __device__
-#else
-#define DEVICE
-#endif
-
-typedef enum {
-    DEF_FIELD,
-    DISP_FIELD,
-    CUB_SPLINE_GRID,
-    DEF_VEL_FIELD,
-    DISP_VEL_FIELD,
-    SPLINE_VEL_GRID,
-    LIN_SPLINE_GRID
-} NREG_TRANS_TYPE;
-
-/* *************************************************************** */
-namespace NiftyReg {
-/* *************************************************************** */
-// The functions in the standard library are slower; so, these are implemented
-template<typename T>
-DEVICE inline T Square(const T& x) {
-    return x * x;
-}
-template<typename T>
-DEVICE inline T Cube(const T& x) {
-    return x * x * x;
-}
-template<typename T>
-DEVICE inline int Floor(const T& x) {
-    const int i = static_cast<int>(x);
-    return i - (x < i);
-}
-template<typename T>
-DEVICE inline int Ceil(const T& x) {
-    const int i = static_cast<int>(x);
-    return i + (x > i);
-}
-template<typename T>
-DEVICE inline int Round(const T& x) {
-    return static_cast<int>(x + (x >= 0 ? 0.5 : -0.5));
-}
-/* *************************************************************** */
-} // namespace NiftyReg
-/* *************************************************************** */
-template<class T>
-T* reg_matrix1DAllocate(size_t arraySize);
-/* *************************************************************** */
-template<class T>
-void reg_matrix1DDeallocate(T* mat);
-/* *************************************************************** */
-template<class T>
-T** reg_matrix2DAllocate(size_t arraySizeX, size_t arraySizeY);
-/* *************************************************************** */
-template<class T>
-void reg_matrix2DDeallocate(size_t arraySizeX, T** mat);
-/* *************************************************************** */
-template<class T>
-T** reg_matrix2DTranspose(T** mat, size_t arraySizeX, size_t arraySizeY);
-/* *************************************************************** */
-template<class T>
-T** reg_matrix2DMultiply(T** mat1, size_t mat1X, size_t mat1Y, T** mat2, size_t mat2X, size_t mat2Y, bool transposeMat2);
-template<class T>
-void reg_matrix2DMultiply(T** mat1, size_t mat1X, size_t mat1Y, T** mat2, size_t mat2X, size_t mat2Y, T** res, bool transposeMat2);
-/* *************************************************************** */
-template<class T>
-T* reg_matrix2DVectorMultiply(T** mat, size_t m, size_t n, T* vect);
-template<class T>
-void reg_matrix2DVectorMultiply(T** mat, size_t m, size_t n, T* vect, T* res);
-/* *************************************************************** */
-/** @brief Add two 3-by-3 matrices
-*/
-mat33 reg_mat33_add(mat33 const* A, mat33 const* B);
-mat33 operator+(mat33 A, mat33 B);
-/* *************************************************************** */
-/** @brief Multiply two 3-by-3 matrices
-*/
-mat33 reg_mat33_mul(mat33 const* A,
-    mat33 const* B);
-mat33 operator*(mat33 A,
-    mat33 B);
-/* *************************************************************** */
-//The mat33 represent a 3x3 matrix
-void reg_mat33_mul(mat44 const* mat, float const* in, float *out);
-void reg_mat33_mul(mat33 const* mat, float const* in, float *out);
-/* *************************************************************** */
-/** @brief Subtract two 3-by-3 matrices
-*/
-mat33 reg_mat33_minus(mat33 const* A, mat33 const* B);
-mat33 operator-(mat33 A, mat33 B);
-/* *************************************************************** */
-/** @brief Transpose a 3-by-3 matrix
-*/
-mat33 reg_mat33_trans(mat33 A);
-/* *************************************************************** */
-/** @brief Diagonalize a 3-by-3 matrix
-*/
-void reg_mat33_diagonalize(mat33 const* A, mat33 * Q, mat33 * D);
-/* *************************************************************** */
-/** @brief Set up a 3-by-3 matrix with an identity
-*/
-void reg_mat33_eye(mat33 *mat);
-/* *************************************************************** */
-/** @brief Compute the determinant of a 3-by-3 matrix
-*/
-void reg_mat33_to_nan(mat33 *A);
-/* *************************************************************** */
-/** @brief Transform a mat44 to a mat33 matrix
-*/
-mat33 reg_mat44_to_mat33(mat44 const* A);
-void reg_heapSort(float *array_tmp, int *index_tmp, int blockNum);
-/* *************************************************************** */
-template <class T>
-void reg_heapSort(T *array_tmp,int blockNum);
-/* *************************************************************** */
-bool operator==(mat44 A,mat44 B);
-/* *************************************************************** */
-bool operator!=(mat44 A,mat44 B);
-/* *************************************************************** */
-/** @brief Multiply two 4-by-4 matrices
- */
-mat44 reg_mat44_mul(mat44 const* A,
-                    mat44 const* B);
-mat44 operator*(mat44 A,
-                mat44 B);
-/* *************************************************************** */
-/** @brief Multiply a vector with a 4-by-4 matrix
- */
-void reg_mat44_mul(mat44 const* mat,
-                   float const* in,
-                   float *out);
-
-void reg_mat44_mul(mat44 const* mat,
-                   double const* in,
-                   double *out);
-/* *************************************************************** */
-/** @brief Multiply a 4-by-4 matrix with a scalar
- */
-mat44 reg_mat44_mul(mat44 const* mat,
-                    double scalar);
-/* *************************************************************** */
-/** @brief Add two 4-by-4 matrices
- */
-mat44 reg_mat44_add(mat44 const* A, mat44 const* B);
-mat44 operator+(mat44 A,mat44 B);
-/* *************************************************************** */
-/** @brief Subtract two 4-by-4 matrices
- */
-mat44 reg_mat44_minus(mat44 const* A, mat44 const* B);
-mat44 operator-(mat44 A,mat44 B);
-/* *************************************************************** */
-/** @brief Set up a 4-by-4 matrix with an identity
- */
-void reg_mat44_eye(mat44 *mat);
-/* *************************************************************** */
-/** @brief Compute the determinant of a 4-by-4 matrix
- */
-template<class T> T reg_mat44_det(mat44 const* A);
-/* *************************************************************** */
-/** @brief Display a mat44 matrix
- */
-void reg_mat44_disp(const mat44& mat, const std::string& title);
-/* *************************************************************** */
-double get_square_distance3D(float * first_point3D, float * second_point3D);
-/* *************************************************************** */
-double get_square_distance2D(float * first_point2D, float * second_point2D);
-/* *************************************************************** */
diff --git a/reg-lib/cpu/_reg_maths_eigen.cpp b/reg-lib/cpu/_reg_maths_eigen.cpp
deleted file mode 100644
index 444a1721..00000000
--- a/reg-lib/cpu/_reg_maths_eigen.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-#define USE_EIGEN
-
-#include "_reg_maths_eigen.h"
-#include "_reg_maths.h"
-#include "Debug.hpp"
-
-// Eigen headers are in there because of the nvcc preprocessing step
-#include "Eigen/Core"
-#include "Eigen/SVD"
-#include "unsupported/Eigen/MatrixFunctions"
-
-//_reg_maths_eigen.cpp
-/* *************************************************************** */
-/** @brief SVD
-* @param in input matrix to decompose - in place
-* @param size_m row
-* @param size_n colomn
-* @param w diagonal term
-* @param v rotation part
-*/
-template<class T>
-void svd(T **in, size_t size_m, size_t size_n, T * w, T **v) {
-   if (size_m == 0 || size_n == 0)
-      NR_FATAL_ERROR("The specified matrix is empty");
-
-#ifdef _WIN32
-   long sm, sn, sn2;
-   long size__m = (long)size_m, size__n = (long)size_n;
-#else
-   size_t sm, sn, sn2;
-   size_t size__m = size_m, size__n = size_n;
-#endif
-   Eigen::MatrixXd m(size_m, size_n);
-
-   //Convert to Eigen matrix
-#ifdef _OPENMP
-#pragma omp parallel for default(none) \
-   shared(in,m, size__m, size__n) \
-   private(sn)
-#endif
-   for (sm = 0; sm < size__m; sm++)
-   {
-      for (sn = 0; sn < size__n; sn++)
-      {
-         m(sm, sn) = static_cast<double>(in[sm][sn]);
-      }
-   }
-
-   Eigen::JacobiSVD<Eigen::MatrixXd> svd(m, Eigen::ComputeThinU | Eigen::ComputeThinV);
-
-#ifdef _OPENMP
-#pragma omp parallel for default(none) \
-   shared(in,svd,v,w, size__n,size__m) \
-   private(sn2, sm)
-#endif
-   for (sn = 0; sn < size__n; sn++) {
-      w[sn] = static_cast<T>(svd.singularValues()(sn));
-      for (sn2 = 0; sn2 < size__n; sn2++) {
-         v[sn2][sn] = static_cast<T>(svd.matrixV()(sn2, sn));
-      }
-      for (sm = 0; sm < size__m; sm++) {
-         in[sm][sn] = static_cast<T>(svd.matrixU()(sm, sn));
-      }
-   }
-}
-template void svd<float>(float **in, size_t m, size_t n, float * w, float **v);
-template void svd<double>(double **in, size_t m, size_t n, double * w, double **v);
-/* *************************************************************** */
-template<class T>
-T reg_matrix2DDet(T** mat, size_t m, size_t n) {
-   if (m != n)
-      NR_FATAL_ERROR("The matrix have to be square: [" + std::to_string(m) + " " + std::to_string(n) + "]");
-
-   double res;
-   if (m == 2) {
-      res = static_cast<double>(mat[0][0]) * static_cast<double>(mat[1][1]) - static_cast<double>(mat[1][0]) * static_cast<double>(mat[0][1]);
-   }
-   else if (m == 3) {
-      res = (static_cast<double>(mat[0][0]) * (static_cast<double>(mat[1][1]) * static_cast<double>(mat[2][2]) - static_cast<double>(mat[1][2]) * static_cast<double>(mat[2][1]))) -
-            (static_cast<double>(mat[0][1]) * (static_cast<double>(mat[1][0]) * static_cast<double>(mat[2][2]) - static_cast<double>(mat[1][2]) * static_cast<double>(mat[2][0]))) +
-            (static_cast<double>(mat[0][2]) * (static_cast<double>(mat[1][0]) * static_cast<double>(mat[2][1]) - static_cast<double>(mat[1][1]) * static_cast<double>(mat[2][0])));
-   }
-   else {
-      // Convert to Eigen format
-      Eigen::MatrixXd eigenRes(m, n);
-      for (size_t i = 0; i < m; i++) {
-         for (size_t j = 0; j < n; j++) {
-            eigenRes(i, j) = static_cast<double>(mat[i][j]);
-         }
-      }
-      res = eigenRes.determinant();
-   }
-   return static_cast<T>(res);
-}
-template float reg_matrix2DDet<float>(float** mat, size_t m, size_t n);
-template double reg_matrix2DDet<double>(double** mat, size_t m, size_t n);
-/* *************************************************************** */
-void reg_mat33_expm(mat33 *in_tensor)
-{
-   int sm, sn;
-   Eigen::Matrix3d tensor;
-
-   // Convert to Eigen format
-   for (sm = 0; sm < 3; sm++){
-      for (sn = 0; sn < 3; sn++){
-         float val=in_tensor->m[sm][sn];
-         if(val!=val) return;
-         tensor(sm, sn) = static_cast<double>(val);
-      }
-   }
-
-   // Compute exp(E)
-   tensor = tensor.exp();
-
-   // Convert the result to mat33 format
-   for (sm = 0; sm < 3; sm++)
-      for (sn = 0; sn < 3; sn++)
-         in_tensor->m[sm][sn] = static_cast<float>(tensor(sm, sn));
-}
-/* *************************************************************** */
-mat44 reg_mat44_expm(mat44 const* mat)
-{
-   mat44 X;
-   Eigen::Matrix4d m;
-   for (size_t i = 0; i < 4; ++i) {
-      for (size_t j = 0; j < 4; ++j) {
-         m(i, j) = static_cast<double>(mat->m[i][j]);
-      }
-   }
-   m = m.exp();
-   //
-   for (size_t i = 0; i < 4; ++i)
-      for (size_t j = 0; j < 4; ++j)
-         X.m[i][j] = static_cast<float>(m(i, j));
-
-   return X;
-}
-/* *************************************************************** */
-void reg_mat33_logm(mat33 *in_tensor)
-{
-   int sm, sn;
-   Eigen::Matrix3d tensor;
-
-   // Convert to Eigen format
-   bool all_zeros = true;
-   double det = 0;
-   for (sm = 0; sm < 3; sm++){
-      for (sn = 0; sn < 3; sn++){
-         float val=in_tensor->m[sm][sn];
-         if(val!=0.f) all_zeros=false;
-         if(val!=val) return;
-         tensor(sm, sn) = static_cast<double>(val);
-      }
-   }
-   // Actually R case requires invertible and no negative real ev,
-   // but the only observed case so far was non-invertible.
-   // determinant is not a perfect check for invertibility and
-   // identity with zero not great either, but the alternative
-   // is a general eigensolver and the logarithm function should
-   // suceed unless convergence just isn't happening.
-   det = tensor.determinant();
-   if(all_zeros || det == 0){
-      reg_mat33_to_nan(in_tensor);
-      return;
-   }
-
-   // Compute the actual matrix log
-   tensor = tensor.log();
-
-   // Convert the result to mat33 format
-   for (sm = 0; sm < 3; sm++)
-      for (sn = 0; sn < 3; sn++)
-         in_tensor->m[sm][sn] = static_cast<float>(tensor(sm, sn));
-}
-/* *************************************************************** */
-mat44 reg_mat44_logm(mat44 const* mat)
-{
-   mat44 X;
-   Eigen::Matrix4d m;
-   for (size_t i = 0; i < 4; ++i) {
-      for (size_t j = 0; j < 4; ++j) {
-         m(i, j) = static_cast<double>(mat->m[i][j]);
-      }
-   }
-   m = m.log();
-   for (size_t i = 0; i < 4; ++i)
-      for (size_t j = 0; j < 4; ++j)
-         X.m[i][j] = static_cast<float>(m(i, j));
-   return X;
-}
-/* *************************************************************** */
-mat44 reg_mat44_avg2(mat44 const* A, mat44 const* B)
-{
-   mat44 out;
-   mat44 logA = reg_mat44_logm(A);
-   mat44 logB = reg_mat44_logm(B);
-   for (int i = 0; i < 4; ++i) {
-      logA.m[3][i] = 0.f;
-      logB.m[3][i] = 0.f;
-   }
-   logA = reg_mat44_add(&logA, &logB);
-   out = reg_mat44_mul(&logA, 0.5);
-   return reg_mat44_expm(&out);
-
-}
diff --git a/reg-lib/cpu/_reg_maths_eigen.h b/reg-lib/cpu/_reg_maths_eigen.h
deleted file mode 100644
index 20867b69..00000000
--- a/reg-lib/cpu/_reg_maths_eigen.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-
-#include "RNifti.h"
-
-/* *************************************************************** */
-/* Functions calling the Eigen library                             */
-/* See http://eigen.tuxfamily.org/index.php?title=Main_Page        */
-/* *************************************************************** */
-
-/* *************************************************************** */
-template <class T>
-void svd(T **in, size_t m, size_t n, T * w, T **v);
-/* *************************************************************** */
-template<class T>
-T reg_matrix2DDet(T** mat, size_t m, size_t n);
-/* *************************************************************** */
-/** @brief Compute the log of a 3-by-3 matrix
-*/
-void reg_mat33_expm(mat33 *in_tensor);
-/* *************************************************************** */
-/** @brief Compute the exp of a 4-by-4 matrix
-*/
-mat44 reg_mat44_expm(const mat44 *mat);
-/* *************************************************************** */
-/** @brief Compute the log of a 3-by-3 matrix
-*/
-void reg_mat33_logm(mat33 *in_tensor);
-/* *************************************************************** */
-/** @brief Compute the log of a 4-by-4 matrix
-*/
-mat44 reg_mat44_logm(const mat44 *mat);
-/* *************************************************************** */
-/** @brief Compute the average of two matrices using a log-euclidean
-* framework
-*/
-mat44 reg_mat44_avg2(mat44 const* A, mat44 const* b);
diff --git a/reg-lib/cpu/_reg_resampling.cpp b/reg-lib/cpu/_reg_resampling.cpp
index 0d9d1785..6fe684c5 100755
--- a/reg-lib/cpu/_reg_resampling.cpp
+++ b/reg-lib/cpu/_reg_resampling.cpp
@@ -11,8 +11,7 @@
  */
 
 #include "_reg_resampling.h"
-#include "_reg_maths.h"
-#include "_reg_maths_eigen.h"
+#include "Maths.hpp"
 #include "_reg_tools.h"
 
 #define SINC_KERNEL_RADIUS 3
@@ -165,7 +164,7 @@ void reg_dti_resampling_preprocessing(nifti_image *floatingImage,
             diffTensor[tid].m[2][2] = static_cast<float>(floatingIntensityZZ[floatingIndex]);
 
             // Compute the log of the diffusion tensor.
-            reg_mat33_logm(&diffTensor[tid]);
+            Mat33Logm(&diffTensor[tid]);
 
             // Write this out as a new image
             floatingIntensityXX[floatingIndex] = static_cast<DataType>(diffTensor[tid].m[0][0]);
@@ -257,10 +256,10 @@ void reg_dti_resampling_postprocessing(nifti_image *inputImage,
                     inputTensor[tid].m[2][2] = static_cast<float>(inputIntensityZZ[warpedIndex]);
                     // Exponentiate the warped tensor
                     if (warpedImage == nullptr) {
-                        reg_mat33_expm(&inputTensor[tid]);
+                        Mat33Expm(&inputTensor[tid]);
                         testSum = 0;
                     } else {
-                        reg_mat33_eye(&warpedTensor[tid]);
+                        Mat33Eye(&warpedTensor[tid]);
                         warpedTensor[tid].m[0][0] = static_cast<float>(warpedXX[warpedIndex]);
                         warpedTensor[tid].m[0][1] = static_cast<float>(warpedXY[warpedIndex]);
                         warpedTensor[tid].m[1][0] = warpedTensor[tid].m[0][1];
@@ -396,7 +395,7 @@ void ResampleImage3D(const nifti_image *floatingImage,
                 world[2] = static_cast<float>(deformationFieldPtrZ[index]);
 
                 // real -> voxel; floating space
-                reg_mat44_mul(floatingIJKMatrix, world, position);
+                Mat44Mul(*floatingIJKMatrix, world, position);
 
                 previous[0] = Floor(position[0]);
                 previous[1] = Floor(position[1]);
@@ -576,7 +575,7 @@ void ResampleImage2D(const nifti_image *floatingImage,
                 world[2] = 0;
 
                 // real -> voxel; floating space
-                reg_mat44_mul(floatingIJKMatrix, world, position);
+                Mat44Mul(*floatingIJKMatrix, world, position);
 
                 previous[0] = Floor(position[0]);
                 previous[1] = Floor(position[1]);
@@ -921,7 +920,7 @@ void ResampleImage3D_PSF_Sinc(const nifti_image *floatingImage,
                                     psfWorld[2] /= resamplingWeightSum;
 
                                     // real -> voxel; floating space
-                                    reg_mat44_mul(floatingIJKMatrix, psfWorld, position);
+                                    Mat44Mul(*floatingIJKMatrix, psfWorld, position);
 
                                     previous[0] = Floor(position[0]);
                                     previous[1] = Floor(position[1]);
@@ -1128,11 +1127,11 @@ void ResampleImage3D_PSF(const nifti_image *floatingImage,
                     // T=P+A*S*At
                     A = nifti_mat33_inverse(jacMat[index]);
 
-                    ASAt = A * S * reg_mat33_trans(A);
+                    ASAt = A * S * Mat33Trans(A);
 
                     TmS = T - ASAt;
 
-                    reg_mat33_diagonalize(&TmS, &TmS_EigVec, &TmS_EigVal);
+                    Mat33Diagonalize(&TmS, &TmS_EigVec, &TmS_EigVal);
 
                     // If eigen values are less than 0, set them to 0.
                     // Also, invert the eigenvalues to estimate the inverse.
@@ -1148,7 +1147,7 @@ void ResampleImage3D_PSF(const nifti_image *floatingImage,
                         }
                     }
 
-                    TmS_EigVec_trans = reg_mat33_trans(TmS_EigVec);
+                    TmS_EigVec_trans = Mat33Trans(TmS_EigVec);
                     P = TmS_EigVec * TmS_EigVal * TmS_EigVec_trans;
                     invP = TmS_EigVec * TmS_EigVal_inv * TmS_EigVec_trans;
                     currentDeterminant = TmS_EigVal.m[0][0] * TmS_EigVal.m[1][1] * TmS_EigVal.m[2][2];
@@ -1157,16 +1156,16 @@ void ResampleImage3D_PSF(const nifti_image *floatingImage,
 
                     A = nifti_mat33_inverse(jacMat[index]);
 
-                    ASAt = A * S * reg_mat33_trans(A);
+                    ASAt = A * S * Mat33Trans(A);
 
                     mat33 S_EigVec, S_EigVal;
 
                     //                % rotate S
                     //                [ZS, DS] = eig(S);
-                    reg_mat33_diagonalize(&ASAt, &S_EigVec, &S_EigVal);
+                    Mat33Diagonalize(&ASAt, &S_EigVec, &S_EigVal);
 
                     //                T1 = ZS'*T*ZS;
-                    mat33 T1 = reg_mat33_trans(S_EigVec) * T * S_EigVec;
+                    mat33 T1 = Mat33Trans(S_EigVec) * T * S_EigVec;
 
                     //                % Volume-preserving scale of S to make it isotropic
                     //                detS = prod(diag(DS));
@@ -1191,12 +1190,12 @@ void ResampleImage3D_PSF(const nifti_image *floatingImage,
                     }
 
                     //                T2 = LambdaN*T1*LambdaN';
-                    mat33 T2 = LambdaN * T1 * reg_mat33_trans(LambdaN);
+                    mat33 T2 = LambdaN * T1 * Mat33Trans(LambdaN);
 
                     //                % Rotate to make thing axis-aligned
                     //                [ZT2, DT2] = eig(T2);
                     mat33 T2_EigVec, T2_EigVal;
-                    reg_mat33_diagonalize(&T2, &T2_EigVec, &T2_EigVal);
+                    Mat33Diagonalize(&T2, &T2_EigVec, &T2_EigVal);
 
                     //                % Optimal solution in the transformed axis-aligned space
                     //                DP2 = diag(max(sqrt(detS),diag(DT2)));
@@ -1213,11 +1212,11 @@ void ResampleImage3D_PSF(const nifti_image *floatingImage,
 
                     //                % Roll back the transforms
                     //                Q = ZS*invLambdaN*ZT2*DQ2*ZT2'*invLambdaN*ZS'
-                    mat33 Q = S_EigVec * invLambdaN * T2_EigVec * DP2 * reg_mat33_trans(T2_EigVec) * invLambdaN * reg_mat33_trans(S_EigVec);
+                    mat33 Q = S_EigVec * invLambdaN * T2_EigVec * DP2 * Mat33Trans(T2_EigVec) * invLambdaN * Mat33Trans(S_EigVec);
                     //                P=Q-S
                     TmS = Q - S;
                     invP = nifti_mat33_inverse(TmS);
-                    reg_mat33_diagonalize(&TmS, &TmS_EigVec, &TmS_EigVal);
+                    Mat33Diagonalize(&TmS, &TmS_EigVec, &TmS_EigVal);
 
                     currentDeterminant = TmS_EigVal.m[0][0] * TmS_EigVal.m[1][1] * TmS_EigVal.m[2][2];
                     currentDeterminant = currentDeterminant < 0.000001f ? 0.000001f : currentDeterminant;
@@ -1322,7 +1321,7 @@ void ResampleImage3D_PSF(const nifti_image *floatingImage,
                                         psfWorld[2] /= resamplingWeightSum;
 
                                         // real -> voxel; floating space
-                                        reg_mat44_mul(floatingIJKMatrix, psfWorld, position);
+                                        Mat44Mul(*floatingIJKMatrix, psfWorld, position);
 
                                         previous[0] = Floor(position[0]);
                                         previous[1] = Floor(position[1]);
@@ -1531,7 +1530,7 @@ void reg_bilinearResampleGradient(const nifti_image *floatingImage,
     }
 
     // Reorientation matrix is assessed in order to remove the rigid component
-    mat33 reorient = nifti_mat33_inverse(nifti_mat33_polar(reg_mat44_to_mat33(&deformationField->sto_xyz)));
+    mat33 reorient = nifti_mat33_inverse(nifti_mat33_polar(Mat44ToMat33(&deformationField->sto_xyz)));
 
     // Some useful variables
     mat33 jacMat;
@@ -1701,7 +1700,7 @@ void reg_trilinearResampleGradient(const nifti_image *floatingImage,
     }
 
     // Reorientation matrix is assessed in order to remove the rigid component
-    mat33 reorient = nifti_mat33_inverse(nifti_mat33_polar(reg_mat44_to_mat33(&deformationField->sto_xyz)));
+    mat33 reorient = nifti_mat33_inverse(nifti_mat33_polar(Mat44ToMat33(&deformationField->sto_xyz)));
 
     // Some useful variables
     mat33 jacMat;
@@ -1980,7 +1979,7 @@ void TrilinearImageGradient(const nifti_image *floatingImage,
             world[2] = (FieldType)deformationFieldPtrZ[index];
 
             /* real -> voxel; floating space */
-            reg_mat44_mul(floatingIJKMatrix, world, position);
+            Mat44Mul(*floatingIJKMatrix, world, position);
 
             previous[0] = Floor(position[0]);
             previous[1] = Floor(position[1]);
@@ -2257,7 +2256,7 @@ void CubicSplineImageGradient3D(const nifti_image *floatingImage,
             world[2] = (FieldType)deformationFieldPtrZ[index];
 
             /* real -> voxel; floating space */
-            reg_mat44_mul(floatingIJKMatrix, world, position);
+            Mat44Mul(*floatingIJKMatrix, world, position);
 
             previous[0] = Floor(position[0]);
             previous[1] = Floor(position[1]);
diff --git a/reg-lib/cpu/_reg_ssd.cpp b/reg-lib/cpu/_reg_ssd.cpp
index b000fbd4..2a4bddfb 100755
--- a/reg-lib/cpu/_reg_ssd.cpp
+++ b/reg-lib/cpu/_reg_ssd.cpp
@@ -349,7 +349,7 @@ void GetDiscretisedValueSSD_core3D(nifti_image *controlPointGridImage,
     mat44 *image_mm2vox = &refImage->qto_ijk;
     if (refImage->sform_code > 0)
         image_mm2vox = &refImage->sto_ijk;
-    mat44 grid2img_vox = reg_mat44_mul(image_mm2vox, grid_vox2mm);
+    mat44 grid2img_vox = *image_mm2vox * *grid_vox2mm;
 
     // Compute the block size
     const int blockSize[3] = {
@@ -404,7 +404,7 @@ void GetDiscretisedValueSSD_core3D(nifti_image *controlPointGridImage,
                     controlPointGridImage->nx * cpy + cpx;
 
                 // Compute the corresponding image voxel position
-                reg_mat44_mul(&grid2img_vox, gridVox, imageVox);
+                Mat44Mul(grid2img_vox, gridVox, imageVox);
                 imageVox[0] = static_cast<float>(Round(imageVox[0]));
                 imageVox[1] = static_cast<float>(Round(imageVox[1]));
                 imageVox[2] = static_cast<float>(Round(imageVox[2]));
diff --git a/reg-lib/cpu/_reg_tools.cpp b/reg-lib/cpu/_reg_tools.cpp
index 017d6029..21aa5869 100755
--- a/reg-lib/cpu/_reg_tools.cpp
+++ b/reg-lib/cpu/_reg_tools.cpp
@@ -253,22 +253,22 @@ void reg_tools_removeSCLInfo(nifti_image *image) {
 void reg_getRealImageSpacing(nifti_image *image, float *spacingValues) {
     float indexVoxel1[3] = { 0, 0, 0 };
     float indexVoxel2[3], realVoxel1[3], realVoxel2[3];
-    reg_mat44_mul(&(image->sto_xyz), indexVoxel1, realVoxel1);
+    Mat44Mul(image->sto_xyz, indexVoxel1, realVoxel1);
 
     indexVoxel2[1] = indexVoxel2[2] = 0;
     indexVoxel2[0] = 1;
-    reg_mat44_mul(&(image->sto_xyz), indexVoxel2, realVoxel2);
+    Mat44Mul(image->sto_xyz, indexVoxel2, realVoxel2);
     spacingValues[0] = sqrtf(Square(realVoxel1[0] - realVoxel2[0]) + Square(realVoxel1[1] - realVoxel2[1]) + Square(realVoxel1[2] - realVoxel2[2]));
 
     indexVoxel2[0] = indexVoxel2[2] = 0;
     indexVoxel2[1] = 1;
-    reg_mat44_mul(&(image->sto_xyz), indexVoxel2, realVoxel2);
+    Mat44Mul(image->sto_xyz, indexVoxel2, realVoxel2);
     spacingValues[1] = sqrtf(Square(realVoxel1[0] - realVoxel2[0]) + Square(realVoxel1[1] - realVoxel2[1]) + Square(realVoxel1[2] - realVoxel2[2]));
 
     if (image->nz > 1) {
         indexVoxel2[0] = indexVoxel2[1] = 0;
         indexVoxel2[2] = 1;
-        reg_mat44_mul(&(image->sto_xyz), indexVoxel2, realVoxel2);
+        Mat44Mul(image->sto_xyz, indexVoxel2, realVoxel2);
         spacingValues[2] = sqrtf(Square(realVoxel1[0] - realVoxel2[0]) + Square(realVoxel1[1] - realVoxel2[1]) + Square(realVoxel1[2] - realVoxel2[2]));
     }
 }
diff --git a/reg-lib/cpu/_reg_tools.h b/reg-lib/cpu/_reg_tools.h
index 650e6c71..f0475e5d 100755
--- a/reg-lib/cpu/_reg_tools.h
+++ b/reg-lib/cpu/_reg_tools.h
@@ -20,7 +20,7 @@
 #include <cmath>
 #include <algorithm>
 #include <functional>
-#include "_reg_maths.h"
+#include "Maths.hpp"
 #include "Debug.hpp"
 
 using namespace NiftyReg;
diff --git a/reg-lib/cuda/CudaCommon.hpp b/reg-lib/cuda/CudaCommon.hpp
index b5872e56..9bd528c3 100644
--- a/reg-lib/cuda/CudaCommon.hpp
+++ b/reg-lib/cuda/CudaCommon.hpp
@@ -118,5 +118,21 @@ UniqueTextureObjectPtr CreateTextureObject(const DataType *devPtr,
                                            const cudaChannelFormatKind channelFormat,
                                            const unsigned channelCount);
 /* *************************************************************** */
+template<bool is3d>
+__device__ __inline__ int3 IndexToDims(const int index, const int3 dims) {
+    int quot = 0, rem;
+    if constexpr (is3d)
+        Divide(index, dims.x * dims.y, quot, rem);
+    else rem = index;
+    const int z = quot;
+    Divide(rem, dims.x, quot, rem);
+    const int y = quot, x = rem;
+    return { x, y, z };
+}
+/* *************************************************************** */
+__device__ __inline__ int3 IndexToDims(const int index, const int3 dims) {
+    return dims.z > 1 ? IndexToDims<true>(index, dims) : IndexToDims<false>(index, dims);
+}
+/* *************************************************************** */
 } // namespace NiftyReg::Cuda
 /* *************************************************************** */
diff --git a/reg-lib/cuda/CudaGlobalTransformation.cu b/reg-lib/cuda/CudaGlobalTransformation.cu
index a5c0b82f..076f91b2 100644
--- a/reg-lib/cuda/CudaGlobalTransformation.cu
+++ b/reg-lib/cuda/CudaGlobalTransformation.cu
@@ -11,7 +11,6 @@
  */
 
 #include "CudaGlobalTransformation.hpp"
-#include "_reg_common_cuda_kernels.cu"
 
 /* *************************************************************** */
 template<bool is3d, bool compose>
@@ -21,7 +20,7 @@ void GetAffineDeformationField(const mat44 *affineMatrix,
     const size_t voxelNumber = NiftiImage::calcVoxelNumber(deformationField, is3d ? 3 : 2);
     const int3 imageDims = make_int3(deformationField->nx, deformationField->ny, deformationField->nz);
     const mat44 *targetMatrix = deformationField->sform_code > 0 ? &deformationField->sto_xyz : &deformationField->qto_xyz;
-    const mat44 transMatrix = compose ? *affineMatrix : reg_mat44_mul(affineMatrix, targetMatrix);
+    const mat44 transMatrix = compose ? *affineMatrix : *affineMatrix * *targetMatrix;
     Cuda::UniqueTextureObjectPtr deformationFieldTexturePtr; cudaTextureObject_t deformationFieldTexture = 0;
     if constexpr (compose) {
         deformationFieldTexturePtr = Cuda::CreateTextureObject(deformationFieldCuda, voxelNumber, cudaChannelFormatKindFloat, 4);
@@ -36,7 +35,7 @@ void GetAffineDeformationField(const mat44 *affineMatrix,
             float4 defVal = tex1Dfetch<float4>(deformationFieldTexture, index);
             voxel[0] = defVal.x; voxel[1] = defVal.y; voxel[2] = defVal.z;
         } else {
-            auto dims = reg_indexToDims_cuda<is3d>(index, imageDims);
+            auto dims = Cuda::IndexToDims<is3d>(index, imageDims);
             voxel[0] = static_cast<float>(dims.x);
             voxel[1] = static_cast<float>(dims.y);
             voxel[2] = static_cast<float>(dims.z);
@@ -44,7 +43,7 @@ void GetAffineDeformationField(const mat44 *affineMatrix,
 
         // The transformation is applied
         float position[3];
-        reg_mat44_mul_cuda<is3d>(transMatrix, voxel, position);
+        Mat44Mul<float, is3d>(transMatrix, voxel, position);
 
         // The deformation field (real coordinates) is stored
         deformationFieldCuda[index] = make_float4(position[0], position[1], position[2], 0);
diff --git a/reg-lib/cuda/CudaLocalTransformation.cu b/reg-lib/cuda/CudaLocalTransformation.cu
index 2c98a8ca..ce733da6 100644
--- a/reg-lib/cuda/CudaLocalTransformation.cu
+++ b/reg-lib/cuda/CudaLocalTransformation.cu
@@ -84,7 +84,7 @@ __device__ SecondDerivative<is3d> GetApproxSecondDerivative(const int index,
                                                             cudaTextureObject_t controlPointTexture,
                                                             const int3 controlPointImageDim,
                                                             const Basis2nd<is3d> basis) {
-    const auto [x, y, z] = reg_indexToDims_cuda<is3d>(index, controlPointImageDim);
+    const auto [x, y, z] = IndexToDims<is3d>(index, controlPointImageDim);
     if (!isGradient && (x < 1 || x >= controlPointImageDim.x - 1 ||
                         y < 1 || y >= controlPointImageDim.y - 1 ||
                         (is3d && (z < 1 || z >= controlPointImageDim.z - 1)))) return {};
@@ -206,7 +206,7 @@ void ApproxBendingEnergyGradient(nifti_image *controlPointImage,
     const float approxRatio = bendingEnergyWeight / (float)controlPointNumber;
     thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), controlPointNumber,
                        [controlPointImageDim, basis, secondDerivativesTexture, transGradientCuda, approxRatio]__device__(const int index) {
-        const auto [x, y, z] = reg_indexToDims_cuda<is3d>(index, controlPointImageDim);
+        const auto [x, y, z] = IndexToDims<is3d>(index, controlPointImageDim);
         typename SecondDerivative<is3d>::Type gradientValue{};
         if constexpr (is3d) {
             for (int c = z - 1, basInd = 0; c < z + 2; c++) {
@@ -270,7 +270,7 @@ void ComputeApproxJacobianValues(const nifti_image *controlPointImage,
     auto controlPointTexture = Cuda::CreateTextureObject(controlPointImageCuda, controlPointNumber, cudaChannelFormatKindFloat, 4);
 
     // Need to reorient the Jacobian matrix using the header information - real to voxel conversion
-    const mat33 reorientation = reg_mat44_to_mat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_xyz : &controlPointImage->qto_xyz);
+    const mat33 reorientation = Mat44ToMat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_xyz : &controlPointImage->qto_xyz);
 
     // The Jacobian matrix is computed for every control point
     if (controlPointImage->nz > 1) {
@@ -306,7 +306,7 @@ void ComputeJacobianValues(const nifti_image *controlPointImage,
     auto controlPointTexture = Cuda::CreateTextureObject(controlPointImageCuda, controlPointNumber, cudaChannelFormatKindFloat, 4);
 
     // Need to reorient the Jacobian matrix using the header information - real to voxel conversion
-    const mat33 reorientation = reg_mat44_to_mat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_xyz : &controlPointImage->qto_xyz);
+    const mat33 reorientation = Mat44ToMat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_xyz : &controlPointImage->qto_xyz);
 
     // The Jacobian matrix is computed for every voxel
     if (controlPointImage->nz > 1) {
@@ -398,7 +398,7 @@ void GetJacobianPenaltyTermGradient(const nifti_image *referenceImage,
     }
 
     // Need to disorient the Jacobian matrix using the header information - voxel to real conversion
-    const mat33 reorientation = reg_mat44_to_mat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_ijk : &controlPointImage->qto_ijk);
+    const mat33 reorientation = Mat44ToMat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_ijk : &controlPointImage->qto_ijk);
 
     const size_t controlPointNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3);
     const int3 controlPointImageDim = make_int3(controlPointImage->nx, controlPointImage->ny, controlPointImage->nz);
@@ -511,7 +511,7 @@ double CorrectFolding(const nifti_image *referenceImage,
     }
 
     // Need to disorient the Jacobian matrix using the header information - voxel to real conversion
-    const mat33 reorientation = reg_mat44_to_mat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_ijk : &controlPointImage->qto_ijk);
+    const mat33 reorientation = Mat44ToMat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_ijk : &controlPointImage->qto_ijk);
 
     const size_t controlPointNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3);
     const int3 controlPointImageDim = make_int3(controlPointImage->nx, controlPointImage->ny, controlPointImage->nz);
@@ -555,7 +555,7 @@ void GetDeformationFromDisplacement(nifti_image *image, float4 *imageCuda) {
     const int3 imageDim{ image->nx, image->ny, image->nz };
 
     thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), voxelNumber, [=]__device__(const int index) {
-        const auto [x, y, z] = reg_indexToDims_cuda<is3d>(index, imageDim);
+        const auto [x, y, z] = IndexToDims<is3d>(index, imageDim);
 
         const float4 initialPosition{
             float(x) * affineMatrix.m[0][0] + float(y) * affineMatrix.m[0][1] + (is3d ? float(z) * affineMatrix.m[0][2] : 0.f) + affineMatrix.m[0][3],
@@ -866,7 +866,7 @@ double ApproxLinearEnergy(const nifti_image *controlPointGrid,
     const size_t voxelNumber = NiftiImage::calcVoxelNumber(controlPointGrid, 3);
 
     // Matrix to use to convert the gradient from mm to voxel
-    const mat33 reorientation = reg_mat44_to_mat33(controlPointGrid->sform_code > 0 ? &controlPointGrid->sto_ijk : &controlPointGrid->qto_ijk);
+    const mat33 reorientation = Mat44ToMat33(controlPointGrid->sform_code > 0 ? &controlPointGrid->sto_ijk : &controlPointGrid->qto_ijk);
 
     // Store the basis values since they are constant as the value is approximated at the control point positions only
     Basis1st<is3d> basis;
@@ -903,7 +903,7 @@ void ApproxLinearEnergyGradient(const nifti_image *controlPointGrid,
     const float approxRatio = weight / static_cast<float>(voxelNumber);
 
     // Matrix to use to convert the gradient from mm to voxel
-    const mat33 reorientation = reg_mat44_to_mat33(controlPointGrid->sform_code > 0 ? &controlPointGrid->sto_ijk : &controlPointGrid->qto_ijk);
+    const mat33 reorientation = Mat44ToMat33(controlPointGrid->sform_code > 0 ? &controlPointGrid->sto_ijk : &controlPointGrid->qto_ijk);
     const mat33 invReorientation = nifti_mat33_inverse(reorientation);
 
     // Store the basis values since they are constant as the value is approximated at the control point positions only
@@ -932,7 +932,7 @@ void ApproxLinearEnergyGradient(const nifti_image *controlPointGrid,
     thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), voxelNumber, [
         transGradCuda, dispMatricesTexture, cppDims, approxRatio, basis, invReorientation
     ]__device__(const int index) {
-        const auto [x, y, z] = reg_indexToDims_cuda<is3d>(index, cppDims);
+        const auto [x, y, z] = IndexToDims<is3d>(index, cppDims);
         auto gradVal = transGradCuda[index];
 
         if constexpr (is3d) {
diff --git a/reg-lib/cuda/CudaLocalTransformationKernels.cu b/reg-lib/cuda/CudaLocalTransformationKernels.cu
index b7639f76..ebb95539 100644
--- a/reg-lib/cuda/CudaLocalTransformationKernels.cu
+++ b/reg-lib/cuda/CudaLocalTransformationKernels.cu
@@ -10,11 +10,112 @@
  *
  */
 
-#include "_reg_common_cuda_kernels.cu"
+#include "CudaCommon.hpp"
 
 /* *************************************************************** */
 namespace NiftyReg::Cuda {
 /* *************************************************************** */
+__device__ __inline__ mat33 Mat33Inverse(const mat33 r) {
+    /*  INPUT MATRIX:  */
+    const double r11 = r.m[0][0]; const double r12 = r.m[0][1]; const double r13 = r.m[0][2];  /* [ r11 r12 r13 ] */
+    const double r21 = r.m[1][0]; const double r22 = r.m[1][1]; const double r23 = r.m[1][2];  /* [ r21 r22 r23 ] */
+    const double r31 = r.m[2][0]; const double r32 = r.m[2][1]; const double r33 = r.m[2][2];  /* [ r31 r32 r33 ] */
+
+    double deti = (r11 * r22 * r33 - r11 * r32 * r23 - r21 * r12 * r33 +
+                   r21 * r32 * r13 + r31 * r12 * r23 - r31 * r22 * r13);
+
+    if (deti != 0.0) deti = 1.0 / deti;
+
+    mat33 q;
+    q.m[0][0] = float(deti * (r22 * r33 - r32 * r23));
+    q.m[0][1] = float(deti * (-r12 * r33 + r32 * r13));
+    q.m[0][2] = float(deti * (r12 * r23 - r22 * r13));
+
+    q.m[1][0] = float(deti * (-r21 * r33 + r31 * r23));
+    q.m[1][1] = float(deti * (r11 * r33 - r31 * r13));
+    q.m[1][2] = float(deti * (-r11 * r23 + r21 * r13));
+
+    q.m[2][0] = float(deti * (r21 * r32 - r31 * r22));
+    q.m[2][1] = float(deti * (-r11 * r32 + r31 * r12));
+    q.m[2][2] = float(deti * (r11 * r22 - r21 * r12));
+
+    return q;
+}
+/* *************************************************************** */
+__device__ __inline__ float Mat33Determ(const mat33 r) {
+    /*  INPUT MATRIX:  */
+    const double r11 = r.m[0][0]; const double r12 = r.m[0][1]; const double r13 = r.m[0][2];  /* [ r11 r12 r13 ] */
+    const double r21 = r.m[1][0]; const double r22 = r.m[1][1]; const double r23 = r.m[1][2];  /* [ r21 r22 r23 ] */
+    const double r31 = r.m[2][0]; const double r32 = r.m[2][1]; const double r33 = r.m[2][2];  /* [ r31 r32 r33 ] */
+
+    return float(r11 * r22 * r33 - r11 * r32 * r23 - r21 * r12 * r33 +
+                 r21 * r32 * r13 + r31 * r12 * r23 - r31 * r22 * r13);
+}
+/* *************************************************************** */
+__device__ __inline__ float Mat33RowNorm(const mat33 a) {
+    float r1 = fabs(a.m[0][0]) + fabs(a.m[0][1]) + fabs(a.m[0][2]);
+    const float r2 = fabs(a.m[1][0]) + fabs(a.m[1][1]) + fabs(a.m[1][2]);
+    const float r3 = fabs(a.m[2][0]) + fabs(a.m[2][1]) + fabs(a.m[2][2]);
+    if (r1 < r2) r1 = r2;
+    if (r1 < r3) r1 = r3;
+    return r1;
+}
+/* *************************************************************** */
+__device__ __inline__ float Mat33ColNorm(const mat33 a) {
+    float r1 = fabs(a.m[0][0]) + fabs(a.m[1][0]) + fabs(a.m[2][0]);
+    const float r2 = fabs(a.m[0][1]) + fabs(a.m[1][1]) + fabs(a.m[2][1]);
+    const float r3 = fabs(a.m[0][2]) + fabs(a.m[1][2]) + fabs(a.m[2][2]);
+    if (r1 < r2) r1 = r2;
+    if (r1 < r3) r1 = r3;
+    return r1;
+}
+/* *************************************************************** */
+__device__ __inline__ mat33 Mat33Polar(mat33 x) {
+    // Force matrix to be nonsingular
+    float gam = Mat33Determ(x);
+    while (gam == 0.0) {        // Perturb matrix
+        gam = 0.00001f * (0.001f + Mat33RowNorm(x));
+        x.m[0][0] += gam; x.m[1][1] += gam; x.m[2][2] += gam;
+        gam = Mat33Determ(x);
+    }
+
+    mat33 z;
+    float gmi, dif = 1.0f;
+    int k = 0;
+    while (1) {
+        const mat33 y = Mat33Inverse(x);
+        if (dif > 0.3) {     // Far from convergence
+            const float alp = sqrt(Mat33RowNorm(x) * Mat33ColNorm(x));
+            const float bet = sqrt(Mat33RowNorm(y) * Mat33ColNorm(y));
+            gam = sqrt(bet / alp);
+            gmi = 1.f / gam;
+        } else {
+            gam = gmi = 1.0f;  // Close to convergence
+        }
+        z.m[0][0] = 0.5f * (gam * x.m[0][0] + gmi * y.m[0][0]);
+        z.m[0][1] = 0.5f * (gam * x.m[0][1] + gmi * y.m[1][0]);
+        z.m[0][2] = 0.5f * (gam * x.m[0][2] + gmi * y.m[2][0]);
+        z.m[1][0] = 0.5f * (gam * x.m[1][0] + gmi * y.m[0][1]);
+        z.m[1][1] = 0.5f * (gam * x.m[1][1] + gmi * y.m[1][1]);
+        z.m[1][2] = 0.5f * (gam * x.m[1][2] + gmi * y.m[2][1]);
+        z.m[2][0] = 0.5f * (gam * x.m[2][0] + gmi * y.m[0][2]);
+        z.m[2][1] = 0.5f * (gam * x.m[2][1] + gmi * y.m[1][2]);
+        z.m[2][2] = 0.5f * (gam * x.m[2][2] + gmi * y.m[2][2]);
+
+        dif = (fabs(z.m[0][0] - x.m[0][0]) + fabs(z.m[0][1] - x.m[0][1]) +
+               fabs(z.m[0][2] - x.m[0][2]) + fabs(z.m[1][0] - x.m[1][0]) +
+               fabs(z.m[1][1] - x.m[1][1]) + fabs(z.m[1][2] - x.m[1][2]) +
+               fabs(z.m[2][0] - x.m[2][0]) + fabs(z.m[2][1] - x.m[2][1]) +
+               fabs(z.m[2][2] - x.m[2][2]));
+
+        k = k + 1;
+        if (k > 100 || dif < 3.e-6) break;  // Convergence or exhaustion
+        x = z;
+    }
+
+    return z;
+}
+/* *************************************************************** */
 template<bool bspline>
 __device__ __inline__ void GetBasisSplineValues(const float basis, float *values) {
     const float ff = Square(basis);
@@ -206,7 +307,7 @@ __device__ void GetDeformationField3d(float4 *deformationField,
         nodePre = { Floor(xVoxel), Floor(yVoxel), Floor(zVoxel) };
         basis = { xVoxel - float(nodePre.x--), yVoxel - float(nodePre.y--), zVoxel - float(nodePre.z--) };
     } else { // starting deformation field is blank - !composition
-        const auto [x, y, z] = reg_indexToDims_cuda<true>(index, referenceImageDim);
+        const auto [x, y, z] = IndexToDims<true>(index, referenceImageDim);
         // The "nearest previous" node is determined [0,0,0]
         const float xVoxel = float(x) / controlPointVoxelSpacing.x;
         const float yVoxel = float(y) / controlPointVoxelSpacing.y;
@@ -269,7 +370,7 @@ __device__ void GetDeformationField2d(float4 *deformationField,
         nodePre = { Floor(xVoxel), Floor(yVoxel) };
         basis = { xVoxel - float(nodePre.x--), yVoxel - float(nodePre.y--) };
     } else { // starting deformation field is blank - !composition
-        const auto [x, y, z] = reg_indexToDims_cuda<false>(index, referenceImageDim);
+        const auto [x, y, z] = IndexToDims<false>(index, referenceImageDim);
         // The "nearest previous" node is determined [0,0,0]
         const float xVoxel = float(x) / controlPointVoxelSpacing.x;
         const float yVoxel = float(y) / controlPointVoxelSpacing.y;
@@ -312,7 +413,7 @@ __global__ void GetApproxJacobianValues2d(float *jacobianMatrices,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < controlPointNumber) {
         int quot, rem;
-        reg_div_cuda(tid, controlPointImageDim.x, quot, rem);
+        Divide(tid, controlPointImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         if (0 < x && x < controlPointImageDim.x - 1 && 0 < y && y < controlPointImageDim.y - 1) {
@@ -374,9 +475,9 @@ __global__ void GetApproxJacobianValues3d(float *jacobianMatrices,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < controlPointNumber) {
         int quot, rem;
-        reg_div_cuda(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
+        Divide(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
         const int z = quot;
-        reg_div_cuda(rem, controlPointImageDim.x, quot, rem);
+        Divide(rem, controlPointImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         if (0 < x && x < controlPointImageDim.x - 1 && 0 < y && y < controlPointImageDim.y - 1 && 0 < z && z < controlPointImageDim.z - 1) {
@@ -459,7 +560,7 @@ __global__ void GetJacobianValues2d(float *jacobianMatrices,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < voxelNumber) {
         int quot, rem;
-        reg_div_cuda(tid, referenceImageDim.x, quot, rem);
+        Divide(tid, referenceImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         // the "nearest previous" node is determined [0,0,0]
@@ -528,9 +629,9 @@ __global__ void GetJacobianValues3d(float *jacobianMatrices,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < voxelNumber) {
         int quot, rem;
-        reg_div_cuda(tid, referenceImageDim.x * referenceImageDim.y, quot, rem);
+        Divide(tid, referenceImageDim.x * referenceImageDim.y, quot, rem);
         const int z = quot;
-        reg_div_cuda(rem, referenceImageDim.x, quot, rem);
+        Divide(rem, referenceImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         // the "nearest previous" node is determined [0,0,0]
@@ -677,7 +778,7 @@ __global__ void ComputeApproxJacGradient2d(float4 *gradient,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < controlPointNumber) {
         int quot, rem;
-        reg_div_cuda(tid, controlPointImageDim.x, quot, rem);
+        Divide(tid, controlPointImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         float2 jacobianGradient{};
@@ -729,9 +830,9 @@ __global__ void ComputeApproxJacGradient3d(float4 *gradient,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < controlPointNumber) {
         int quot, rem;
-        reg_div_cuda(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
+        Divide(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
         const int z = quot;
-        reg_div_cuda(rem, controlPointImageDim.x, quot, rem);
+        Divide(rem, controlPointImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         float3 jacobianGradient{};
@@ -787,7 +888,7 @@ __global__ void ComputeJacGradient2d(float4 *gradient,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < controlPointNumber) {
         int quot, rem;
-        reg_div_cuda(tid, controlPointImageDim.x, quot, rem);
+        Divide(tid, controlPointImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         float2 jacobianGradient{};
@@ -842,9 +943,9 @@ __global__ void ComputeJacGradient3d(float4 *gradient,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < controlPointNumber) {
         int quot, rem;
-        reg_div_cuda(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
+        Divide(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
         const int z = quot;
-        reg_div_cuda(rem, controlPointImageDim.x, quot, rem);
+        Divide(rem, controlPointImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         float3 jacobianGradient{};
@@ -917,9 +1018,9 @@ __global__ void ApproxCorrectFolding3d(float4 *controlPointGrid,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < controlPointNumber) {
         int quot, rem;
-        reg_div_cuda(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
+        Divide(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
         const int z = quot;
-        reg_div_cuda(rem, controlPointImageDim.x, quot, rem);
+        Divide(rem, controlPointImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         float3 foldingCorrection{};
@@ -988,9 +1089,9 @@ __global__ void CorrectFolding3d(float4 *controlPointGrid,
     const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (tid < controlPointNumber) {
         int quot, rem;
-        reg_div_cuda(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
+        Divide(tid, controlPointImageDim.x * controlPointImageDim.y, quot, rem);
         const int z = quot;
-        reg_div_cuda(rem, controlPointImageDim.x, quot, rem);
+        Divide(rem, controlPointImageDim.x, quot, rem);
         const int y = quot, x = rem;
 
         float3 foldingCorrection{};
@@ -1144,7 +1245,7 @@ __device__ static mat33 CreateDisplacementMatrix(const int index,
                                                  const int3& cppDims,
                                                  const Basis1st<is3d>& basis,
                                                  const mat33& reorientation) {
-    const auto [x, y, z] = reg_indexToDims_cuda<is3d>(index, cppDims);
+    const auto [x, y, z] = IndexToDims<is3d>(index, cppDims);
     if (x < 1 || x >= cppDims.x - 1 || y < 1 || y >= cppDims.y - 1 ||
         (is3d && (z < 1 || z >= cppDims.z - 1))) return {};
 
@@ -1189,10 +1290,10 @@ __device__ static mat33 CreateDisplacementMatrix(const int index,
         }
     }
     // Convert from mm to voxel
-    matrix = reg_mat33_mul_cuda(reorientation, matrix);
+    matrix = reorientation * matrix;
     // Removing the rotation component
-    const mat33 r = reg_mat33_inverse_cuda(reg_mat33_polar_cuda(matrix));
-    matrix = reg_mat33_mul_cuda(r, matrix);
+    const mat33 r = Mat33Inverse(Mat33Polar(matrix));
+    matrix = r * matrix;
     // Convert to displacement
     matrix.m[0][0]--; matrix.m[1][1]--;
     if constexpr (is3d) matrix.m[2][2]--;
diff --git a/reg-lib/cuda/CudaOptimiser.cu b/reg-lib/cuda/CudaOptimiser.cu
index fb662d23..7a4e13a7 100644
--- a/reg-lib/cuda/CudaOptimiser.cu
+++ b/reg-lib/cuda/CudaOptimiser.cu
@@ -1,5 +1,4 @@
 #include "CudaOptimiser.hpp"
-#include "_reg_common_cuda_kernels.cu"
 #include <curand_kernel.h>
 
 /* *************************************************************** */
diff --git a/reg-lib/cuda/CudaResampling.cu b/reg-lib/cuda/CudaResampling.cu
index 5c21bee8..58c33998 100644
--- a/reg-lib/cuda/CudaResampling.cu
+++ b/reg-lib/cuda/CudaResampling.cu
@@ -11,7 +11,6 @@
  */
 
 #include "CudaResampling.hpp"
-#include "_reg_common_cuda_kernels.cu"
 
 /* *************************************************************** */
 namespace NiftyReg::Cuda {
@@ -237,19 +236,19 @@ template<bool is3d>
 static float3 GetRealImageSpacing(const nifti_image *image) {
     float3 spacing{};
     float indexVoxel1[3]{}, indexVoxel2[3], realVoxel1[3], realVoxel2[3];
-    reg_mat44_mul(&image->sto_xyz, indexVoxel1, realVoxel1);
+    Mat44Mul(image->sto_xyz, indexVoxel1, realVoxel1);
 
     indexVoxel2[1] = indexVoxel2[2] = 0; indexVoxel2[0] = 1;
-    reg_mat44_mul(&image->sto_xyz, indexVoxel2, realVoxel2);
+    Mat44Mul(image->sto_xyz, indexVoxel2, realVoxel2);
     spacing.x = sqrtf(Square(realVoxel1[0] - realVoxel2[0]) + Square(realVoxel1[1] - realVoxel2[1]) + Square(realVoxel1[2] - realVoxel2[2]));
 
     indexVoxel2[0] = indexVoxel2[2] = 0; indexVoxel2[1] = 1;
-    reg_mat44_mul(&image->sto_xyz, indexVoxel2, realVoxel2);
+    Mat44Mul(image->sto_xyz, indexVoxel2, realVoxel2);
     spacing.y = sqrtf(Square(realVoxel1[0] - realVoxel2[0]) + Square(realVoxel1[1] - realVoxel2[1]) + Square(realVoxel1[2] - realVoxel2[2]));
 
     if constexpr (is3d) {
         indexVoxel2[0] = indexVoxel2[1] = 0; indexVoxel2[2] = 1;
-        reg_mat44_mul(&image->sto_xyz, indexVoxel2, realVoxel2);
+        Mat44Mul(image->sto_xyz, indexVoxel2, realVoxel2);
         spacing.z = sqrtf(Square(realVoxel1[0] - realVoxel2[0]) + Square(realVoxel1[1] - realVoxel2[1]) + Square(realVoxel1[2] - realVoxel2[2]));
     }
 
@@ -290,7 +289,7 @@ void ResampleGradient(const nifti_image *floatingImage,
                                                              make_float3(warpedImage->dx, warpedImage->dy, warpedImage->dz);
 
     // Reorientation matrix is assessed in order to remove the rigid component
-    const mat33 reorient = nifti_mat33_inverse(nifti_mat33_polar(reg_mat44_to_mat33(&deformationField->sto_xyz)));
+    const mat33 reorient = nifti_mat33_inverse(nifti_mat33_polar(Mat44ToMat33(&deformationField->sto_xyz)));
 
     thrust::for_each_n(thrust::device, maskCuda, activeVoxelNumber, [
         warpedImageCuda, floatingTexture, deformationFieldTexture, floatingMatrix, floatingDims, defFieldDims, realSpacing, reorient, paddingValue
@@ -344,7 +343,7 @@ void ResampleGradient(const nifti_image *floatingImage,
         // Compute the Jacobian matrix
         constexpr float basis[] = { 1.f, 0.f };
         constexpr float deriv[] = { -1.f, 1.f };
-        auto [x, y, z] = reg_indexToDims_cuda<is3d>(index, defFieldDims);
+        auto [x, y, z] = IndexToDims<is3d>(index, defFieldDims);
         mat33 jacMat{};
         for (char c = 0; c < (is3d ? 2 : 1); c++) {
             if constexpr (is3d) {
@@ -407,7 +406,7 @@ void ResampleGradient(const nifti_image *floatingImage,
             }
         }
         // reorient and scale the Jacobian matrix
-        jacMat = reg_mat33_mul_cuda(reorient, jacMat);
+        jacMat = reorient * jacMat;
         jacMat.m[0][0] /= realSpacing.x;
         jacMat.m[0][1] /= realSpacing.y;
         jacMat.m[1][0] /= realSpacing.x;
diff --git a/reg-lib/cuda/CudaTools.cu b/reg-lib/cuda/CudaTools.cu
index 91455a2c..6a3f39dd 100644
--- a/reg-lib/cuda/CudaTools.cu
+++ b/reg-lib/cuda/CudaTools.cu
@@ -41,23 +41,23 @@ void VoxelCentricToNodeCentric(const nifti_image *nodeImage,
     if (nodeImage->num_ext > 0 && nodeImage->ext_list[0].edata) {
         mat44 temp = *(reinterpret_cast<mat44*>(nodeImage->ext_list[0].edata));
         temp = nifti_mat44_inverse(temp);
-        transformation = reg_mat44_mul(&temp, &transformation);
+        transformation = temp * transformation;
     }
     // Millimetre to voxel in the reference image
-    transformation = reg_mat44_mul(voxelImage->sform_code > 0 ? &voxelImage->sto_ijk : &voxelImage->qto_ijk, &transformation);
+    transformation = (voxelImage->sform_code > 0 ? voxelImage->sto_ijk : voxelImage->qto_ijk) * transformation;
 
     // The information has to be reoriented
     // Voxel to millimetre contains the orientation of the image that is used
     // to compute the spatial gradient (floating image)
     mat33 reorientation;
     if (voxelToMillimetre) {
-        reorientation = reg_mat44_to_mat33(voxelToMillimetre);
+        reorientation = Mat44ToMat33(voxelToMillimetre);
         if (nodeImage->num_ext > 0 && nodeImage->ext_list[0].edata) {
-            mat33 temp = reg_mat44_to_mat33(reinterpret_cast<mat44*>(nodeImage->ext_list[0].edata));
+            mat33 temp = Mat44ToMat33(reinterpret_cast<mat44*>(nodeImage->ext_list[0].edata));
             temp = nifti_mat33_inverse(temp);
             reorientation = nifti_mat33_mul(temp, reorientation);
         }
-    } else reg_mat33_eye(&reorientation);
+    } else Mat33Eye(&reorientation);
     // The information has to be weighted
     float ratio[3] = { nodeImage->dx, nodeImage->dy, nodeImage->dz };
     for (int i = 0; i < (is3d ? 3 : 2); ++i) {
diff --git a/reg-lib/cuda/CudaToolsKernels.cu b/reg-lib/cuda/CudaToolsKernels.cu
index 361bbdac..f502ac4f 100644
--- a/reg-lib/cuda/CudaToolsKernels.cu
+++ b/reg-lib/cuda/CudaToolsKernels.cu
@@ -8,7 +8,7 @@
  *  See the LICENSE.txt file in the nifty_reg root folder
  */
 
-#include "_reg_common_cuda_kernels.cu"
+#include "CudaCommon.hpp"
 
 /* *************************************************************** */
 namespace NiftyReg::Cuda {
@@ -23,10 +23,10 @@ __device__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda,
                                                 const mat33 reorientation,
                                                 const int index) {
     // Calculate the node coordinates
-    const auto [x, y, z] = reg_indexToDims_cuda<is3d>(index, nodeImageDims);
+    const auto [x, y, z] = IndexToDims<is3d>(index, nodeImageDims);
     // Transform into voxel coordinates
     float voxelCoord[3], nodeCoord[3] = { static_cast<float>(x), static_cast<float>(y), static_cast<float>(z) };
-    reg_mat44_mul_cuda<is3d>(transformation, nodeCoord, voxelCoord);
+    Mat44Mul<float, is3d>(transformation, nodeCoord, voxelCoord);
 
     // Linear interpolation
     float basisX[2], basisY[2], basisZ[2], interpolatedValue[3]{};
@@ -64,7 +64,7 @@ __device__ void VoxelCentricToNodeCentricKernel(float4 *nodeImageCuda,
     }
 
     float reorientedValue[3];
-    reg_mat33_mul_cuda<is3d>(reorientation, interpolatedValue, weight, reorientedValue);
+    Mat33Mul<is3d>(reorientation, interpolatedValue, weight, reorientedValue);
     nodeImageCuda[index] = { reorientedValue[0], reorientedValue[1], reorientedValue[2], 0 };
 }
 /* *************************************************************** */
diff --git a/reg-lib/cuda/_reg_common_cuda_kernels.cu b/reg-lib/cuda/_reg_common_cuda_kernels.cu
deleted file mode 100644
index 4206931d..00000000
--- a/reg-lib/cuda/_reg_common_cuda_kernels.cu
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- *  Copyright (c) 2009-2018, University College London
- *  Copyright (c) 2018, NiftyReg Developers.
- *  All rights reserved.
- *  See the LICENSE.txt file in the nifty_reg root folder
- */
-
-#pragma once
-
-/* *************************************************************** */
-template<bool is3d>
-__device__ __inline__ void reg_mat33_mul_cuda(const mat33 mat, const float (&in)[3], const double weight, float (&out)[3]) {
-    out[0] = weight * (mat.m[0][0] * in[0] + mat.m[1][0] * in[1] + mat.m[2][0] * in[2]);
-    out[1] = weight * (mat.m[0][1] * in[0] + mat.m[1][1] * in[1] + mat.m[2][1] * in[2]);
-    if constexpr (is3d)
-        out[2] = weight * (mat.m[0][2] * in[0] + mat.m[1][2] * in[1] + mat.m[2][2] * in[2]);
-}
-/* *************************************************************** */
-template<bool is3d>
-__device__ __inline__ void reg_mat44_mul_cuda(const mat44 mat, const float (&in)[3], float (&out)[3]) {
-    out[0] = double(mat.m[0][0]) * double(in[0]) + double(mat.m[0][1]) * double(in[1]) + double(mat.m[0][2]) * double(in[2]) + double(mat.m[0][3]);
-    out[1] = double(mat.m[1][0]) * double(in[0]) + double(mat.m[1][1]) * double(in[1]) + double(mat.m[1][2]) * double(in[2]) + double(mat.m[1][3]);
-    if constexpr (is3d)
-        out[2] = double(mat.m[2][0]) * double(in[0]) + double(mat.m[2][1]) * double(in[1]) + double(mat.m[2][2]) * double(in[2]) + double(mat.m[2][3]);
-}
-/* *************************************************************** */
-__device__ __inline__ mat33 reg_mat33_mul_cuda(const mat33 a, const mat33 b) {
-    mat33 c;
-    for (int i = 0; i < 3; i++)
-        for (int j = 0; j < 3; j++)
-            c.m[i][j] = a.m[i][0] * b.m[0][j] + a.m[i][1] * b.m[1][j] + a.m[i][2] * b.m[2][j];
-    return c;
-}
-/* *************************************************************** */
-__device__ __inline__ mat33 reg_mat33_inverse_cuda(const mat33 r) {
-    /*  INPUT MATRIX:  */
-    const double r11 = r.m[0][0]; const double r12 = r.m[0][1]; const double r13 = r.m[0][2];  /* [ r11 r12 r13 ] */
-    const double r21 = r.m[1][0]; const double r22 = r.m[1][1]; const double r23 = r.m[1][2];  /* [ r21 r22 r23 ] */
-    const double r31 = r.m[2][0]; const double r32 = r.m[2][1]; const double r33 = r.m[2][2];  /* [ r31 r32 r33 ] */
-
-    double deti = (r11 * r22 * r33 - r11 * r32 * r23 - r21 * r12 * r33 +
-                   r21 * r32 * r13 + r31 * r12 * r23 - r31 * r22 * r13);
-
-    if (deti != 0.0) deti = 1.0 / deti;
-
-    mat33 q;
-    q.m[0][0] = float(deti * (r22 * r33 - r32 * r23));
-    q.m[0][1] = float(deti * (-r12 * r33 + r32 * r13));
-    q.m[0][2] = float(deti * (r12 * r23 - r22 * r13));
-
-    q.m[1][0] = float(deti * (-r21 * r33 + r31 * r23));
-    q.m[1][1] = float(deti * (r11 * r33 - r31 * r13));
-    q.m[1][2] = float(deti * (-r11 * r23 + r21 * r13));
-
-    q.m[2][0] = float(deti * (r21 * r32 - r31 * r22));
-    q.m[2][1] = float(deti * (-r11 * r32 + r31 * r12));
-    q.m[2][2] = float(deti * (r11 * r22 - r21 * r12));
-
-    return q;
-}
-/* *************************************************************** */
-__device__ __inline__ float reg_mat33_determ_cuda(const mat33 r) {
-    /*  INPUT MATRIX:  */
-    const double r11 = r.m[0][0]; const double r12 = r.m[0][1]; const double r13 = r.m[0][2];  /* [ r11 r12 r13 ] */
-    const double r21 = r.m[1][0]; const double r22 = r.m[1][1]; const double r23 = r.m[1][2];  /* [ r21 r22 r23 ] */
-    const double r31 = r.m[2][0]; const double r32 = r.m[2][1]; const double r33 = r.m[2][2];  /* [ r31 r32 r33 ] */
-
-    return float(r11 * r22 * r33 - r11 * r32 * r23 - r21 * r12 * r33 +
-                 r21 * r32 * r13 + r31 * r12 * r23 - r31 * r22 * r13);
-}
-/* *************************************************************** */
-__device__ __inline__ float reg_mat33_rownorm_cuda(const mat33 a) {
-    float r1 = fabs(a.m[0][0]) + fabs(a.m[0][1]) + fabs(a.m[0][2]);
-    const float r2 = fabs(a.m[1][0]) + fabs(a.m[1][1]) + fabs(a.m[1][2]);
-    const float r3 = fabs(a.m[2][0]) + fabs(a.m[2][1]) + fabs(a.m[2][2]);
-    if (r1 < r2) r1 = r2;
-    if (r1 < r3) r1 = r3;
-    return r1;
-}
-/* *************************************************************** */
-__device__ __inline__ float reg_mat33_colnorm_cuda(const mat33 a) {
-    float r1 = fabs(a.m[0][0]) + fabs(a.m[1][0]) + fabs(a.m[2][0]);
-    const float r2 = fabs(a.m[0][1]) + fabs(a.m[1][1]) + fabs(a.m[2][1]);
-    const float r3 = fabs(a.m[0][2]) + fabs(a.m[1][2]) + fabs(a.m[2][2]);
-    if (r1 < r2) r1 = r2;
-    if (r1 < r3) r1 = r3;
-    return r1;
-}
-/* *************************************************************** */
-__device__ __inline__ mat33 reg_mat33_polar_cuda(mat33 x) {
-    // Force matrix to be nonsingular
-    float gam = reg_mat33_determ_cuda(x);
-    while (gam == 0.0) {        // Perturb matrix
-        gam = 0.00001f * (0.001f + reg_mat33_rownorm_cuda(x));
-        x.m[0][0] += gam; x.m[1][1] += gam; x.m[2][2] += gam;
-        gam = reg_mat33_determ_cuda(x);
-    }
-
-    mat33 z;
-    float gmi, dif = 1.0f;
-    int k = 0;
-    while (1) {
-        const mat33 y = reg_mat33_inverse_cuda(x);
-        if (dif > 0.3) {     // Far from convergence
-            const float alp = sqrt(reg_mat33_rownorm_cuda(x) * reg_mat33_colnorm_cuda(x));
-            const float bet = sqrt(reg_mat33_rownorm_cuda(y) * reg_mat33_colnorm_cuda(y));
-            gam = sqrt(bet / alp);
-            gmi = 1.f / gam;
-        } else {
-            gam = gmi = 1.0f;  // Close to convergence
-        }
-        z.m[0][0] = 0.5f * (gam * x.m[0][0] + gmi * y.m[0][0]);
-        z.m[0][1] = 0.5f * (gam * x.m[0][1] + gmi * y.m[1][0]);
-        z.m[0][2] = 0.5f * (gam * x.m[0][2] + gmi * y.m[2][0]);
-        z.m[1][0] = 0.5f * (gam * x.m[1][0] + gmi * y.m[0][1]);
-        z.m[1][1] = 0.5f * (gam * x.m[1][1] + gmi * y.m[1][1]);
-        z.m[1][2] = 0.5f * (gam * x.m[1][2] + gmi * y.m[2][1]);
-        z.m[2][0] = 0.5f * (gam * x.m[2][0] + gmi * y.m[0][2]);
-        z.m[2][1] = 0.5f * (gam * x.m[2][1] + gmi * y.m[1][2]);
-        z.m[2][2] = 0.5f * (gam * x.m[2][2] + gmi * y.m[2][2]);
-
-        dif = (fabs(z.m[0][0] - x.m[0][0]) + fabs(z.m[0][1] - x.m[0][1]) +
-               fabs(z.m[0][2] - x.m[0][2]) + fabs(z.m[1][0] - x.m[1][0]) +
-               fabs(z.m[1][1] - x.m[1][1]) + fabs(z.m[1][2] - x.m[1][2]) +
-               fabs(z.m[2][0] - x.m[2][0]) + fabs(z.m[2][1] - x.m[2][1]) +
-               fabs(z.m[2][2] - x.m[2][2]));
-
-        k = k + 1;
-        if (k > 100 || dif < 3.e-6) break;  // Convergence or exhaustion
-        x = z;
-    }
-
-    return z;
-}
-/* *************************************************************** */
-__device__ __inline__ void reg_div_cuda(const int num, const int denom, int& quot, int& rem) {
-    // This will be optimised by the compiler into a single div instruction
-    quot = num / denom;
-    rem = num % denom;
-}
-/* *************************************************************** */
-template<bool is3d>
-__device__ __inline__ int3 reg_indexToDims_cuda(const int index, const int3 dims) {
-    int quot = 0, rem;
-    if constexpr (is3d)
-        reg_div_cuda(index, dims.x * dims.y, quot, rem);
-    else rem = index;
-    const int z = quot;
-    reg_div_cuda(rem, dims.x, quot, rem);
-    const int y = quot, x = rem;
-    return { x, y, z };
-}
-/* *************************************************************** */
-__device__ __inline__ int3 reg_indexToDims_cuda(const int index, const int3 dims) {
-    return dims.z > 1 ? reg_indexToDims_cuda<true>(index, dims) : reg_indexToDims_cuda<false>(index, dims);
-}
-/* *************************************************************** */
diff --git a/reg-lib/cuda/_reg_nmi_gpu.cu b/reg-lib/cuda/_reg_nmi_gpu.cu
index b117e568..d97b569f 100755
--- a/reg-lib/cuda/_reg_nmi_gpu.cu
+++ b/reg-lib/cuda/_reg_nmi_gpu.cu
@@ -11,7 +11,6 @@
  */
 
 #include "_reg_nmi_gpu.h"
-#include "_reg_common_cuda_kernels.cu"
 
 /* *************************************************************** */
 reg_nmi_gpu::reg_nmi_gpu(): reg_nmi::reg_nmi() {
diff --git a/reg-lib/cuda/affineDeformationKernel.cu b/reg-lib/cuda/affineDeformationKernel.cu
index a9ec43a1..e9bfa38d 100644
--- a/reg-lib/cuda/affineDeformationKernel.cu
+++ b/reg-lib/cuda/affineDeformationKernel.cu
@@ -61,7 +61,7 @@ void launchAffine(mat44 *affineTransformation,
 
    float* trans = (float *)malloc(16 * sizeof(float));
    const mat44 *targetMatrix = (deformationField->sform_code > 0) ? &(deformationField->sto_xyz) : &(deformationField->qto_xyz);
-   mat44 transformationMatrix = compose ? *affineTransformation : reg_mat44_mul(affineTransformation, targetMatrix);
+   mat44 transformationMatrix = compose ? *affineTransformation : *affineTransformation * *targetMatrix;
    mat44ToCptr(transformationMatrix, trans);
    NR_CUDA_SAFE_CALL(cudaMemcpy(trans_d, trans, 16 * sizeof(float), cudaMemcpyHostToDevice));
    free(trans);
diff --git a/reg-lib/cuda/blockMatchingKernel.cu b/reg-lib/cuda/blockMatchingKernel.cu
index f70f277f..55781c4f 100644
--- a/reg-lib/cuda/blockMatchingKernel.cu
+++ b/reg-lib/cuda/blockMatchingKernel.cu
@@ -15,7 +15,7 @@
 #include "_reg_tools.h"
 
 #include <vector>
-#include "_reg_maths.h"
+#include "Maths.hpp"
 
 ////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////
diff --git a/reg-lib/cuda/resampleKernel.cu b/reg-lib/cuda/resampleKernel.cu
index cfbe514f..50bcb91c 100644
--- a/reg-lib/cuda/resampleKernel.cu
+++ b/reg-lib/cuda/resampleKernel.cu
@@ -3,7 +3,7 @@
 #include <cuda_runtime.h>
 #include <cuda.h>
 #include"_reg_resampling.h"
-#include"_reg_maths.h"
+#include"Maths.hpp"
 #include "resampleKernel.h"
 #include "CudaCommon.hpp"
 #include"_reg_tools.h"
@@ -59,18 +59,6 @@ __device__ __inline__ void interpolantCubicSpline(FieldTYPE ratio, FieldTYPE *ba
     basis[3] = (FieldTYPE) ((ratio - (double)1.0) * FF / (double)2.0);
 }
 /* *************************************************************** */
-__device__ __inline__
-void reg_mat44_eye(float *mat) {
-	mat[0 * 4 + 0] = 1.f;
-	mat[0 * 4 + 1] = mat[0 * 4 + 2] = mat[0 * 4 + 3] = 0.f;
-	mat[1 * 4 + 1] = 1.f;
-	mat[1 * 4 + 0] = mat[1 * 4 + 2] = mat[1 * 4 + 3] = 0.f;
-	mat[2 * 4 + 2] = 1.f;
-	mat[2 * 4 + 0] = mat[2 * 4 + 1] = mat[2 * 4 + 3] = 0.f;
-	mat[3 * 4 + 3] = 1.f;
-	mat[3 * 4 + 0] = mat[3 * 4 + 1] = mat[3 * 4 + 2] = 0.f;
-}
-/* *************************************************************** */
 __inline__ __device__ void interpWindowedSincKernel(double relative, double *basis)
 {
 	if (relative < 0)
diff --git a/reg-test/reg_test_affineDeformationField.cpp b/reg-test/reg_test_affineDeformationField.cpp
index 858b541b..dc551cf8 100644
--- a/reg-test/reg_test_affineDeformationField.cpp
+++ b/reg-test/reg_test_affineDeformationField.cpp
@@ -39,7 +39,7 @@ class AffineDeformationFieldTest {
 
         // Identity use case - 2D
         mat44 identity;
-        reg_mat44_eye(&identity);
+        Mat44Eye(&identity);
         // Test order [0,0] [1,0] [0,1] [1,1]
         vector<float3> identityResult2d{ { 0, 0, 0 }, { 1, 0, 0 }, { 0, 1, 0 }, { 1, 1, 0 } };
         testData.emplace_back(TestData(
@@ -63,7 +63,7 @@ class AffineDeformationFieldTest {
 
         // Translation - 2D
         mat44 translation;
-        reg_mat44_eye(&translation);
+        Mat44Eye(&translation);
         translation.m[0][3] = -0.5;
         translation.m[1][3] = 1.5;
         translation.m[2][3] = 0.75;
@@ -109,7 +109,7 @@ class AffineDeformationFieldTest {
         // Full affine - 2D
         // Test order [0,0] [1,0] [0,1] [1,1]
         mat44 affine;
-        reg_mat44_eye(&affine);
+        Mat44Eye(&affine);
         affine.m[0][3] = -0.5;
         affine.m[1][3] = 1.5;
         affine.m[2][3] = 0.75;
diff --git a/reg-test/reg_test_be.cpp b/reg-test/reg_test_be.cpp
index cdc57493..7e8e8611 100644
--- a/reg-test/reg_test_be.cpp
+++ b/reg-test/reg_test_be.cpp
@@ -75,8 +75,8 @@ class BendingEnergyTest {
 
         // Set some scaling transformation in the transformations
         mat44 affine2d, affine3d;
-        reg_mat44_eye(&affine2d);
-        reg_mat44_eye(&affine3d);
+        Mat44Eye(&affine2d);
+        Mat44Eye(&affine3d);
         affine3d.m[0][0] = affine2d.m[0][0] = 0.8f;
         affine3d.m[1][1] = affine2d.m[1][1] = 1.2f;
         affine3d.m[2][2] = 1.1f;
diff --git a/reg-test/reg_test_blockMatching.cpp b/reg-test/reg_test_blockMatching.cpp
index aa66259a..2243ec2d 100644
--- a/reg-test/reg_test_blockMatching.cpp
+++ b/reg-test/reg_test_blockMatching.cpp
@@ -46,7 +46,7 @@ class BMTest {
 
         // Create a translation matrix to apply OFFSET voxels along each axis
         mat44 translationMatrix;
-        reg_mat44_eye(&translationMatrix);
+        Mat44Eye(&translationMatrix);
         translationMatrix.m[0][3] = -OFFSET;
         translationMatrix.m[1][3] = -OFFSET;
         translationMatrix.m[2][3] = -OFFSET;
diff --git a/reg-test/reg_test_regr_lts.cpp b/reg-test/reg_test_regr_lts.cpp
index 0cc60f7f..681a8ffc 100644
--- a/reg-test/reg_test_regr_lts.cpp
+++ b/reg-test/reg_test_regr_lts.cpp
@@ -79,8 +79,8 @@ class LtsTest {
             auto&& [testName, reference, floating, ttype, inlier] = data;
 
             // Create identity transformations
-            unique_ptr<mat44> matCpu{ new mat44 }; reg_mat44_eye(matCpu.get());
-            unique_ptr<mat44> matCuda{ new mat44 }; reg_mat44_eye(matCuda.get());
+            unique_ptr<mat44> matCpu{ new mat44 }; Mat44Eye(matCpu.get());
+            unique_ptr<mat44> matCuda{ new mat44 }; Mat44Eye(matCuda.get());
 
             // Create images
             NiftiImage referenceCpu(reference), referenceCuda(reference);
diff --git a/reg-test/reg_test_voxelCentricToNodeCentric.cpp b/reg-test/reg_test_voxelCentricToNodeCentric.cpp
index 551fe96d..aa42def3 100644
--- a/reg-test/reg_test_voxelCentricToNodeCentric.cpp
+++ b/reg-test/reg_test_voxelCentricToNodeCentric.cpp
@@ -154,19 +154,19 @@ class VoxelCentricToNodeCentricTest {
         if (nodeGrad->num_ext > 0 && nodeGrad->ext_list[0].edata) {
             mat44 temp = *(reinterpret_cast<mat44*>(nodeGrad->ext_list[0].edata));
             temp = nifti_mat44_inverse(temp);
-            transformation = reg_mat44_mul(&temp, &transformation);
+            transformation = temp * transformation;
         }
         // Millimetre to voxel in the reference image
         if (voxelGrad->sform_code > 0)
-            transformation = reg_mat44_mul(&voxelGrad->sto_ijk, &transformation);
-        else transformation = reg_mat44_mul(&voxelGrad->qto_ijk, &transformation);
+            transformation = voxelGrad->sto_ijk * transformation;
+        else transformation = voxelGrad->qto_ijk * transformation;
 
         // The information has to be reoriented
         // Voxel to millimetre contains the orientation of the image that is used
         // to compute the spatial gradient (floating image)
-        mat33 reorientation = reg_mat44_to_mat33(voxelToMillimetre);
+        mat33 reorientation = Mat44ToMat33(voxelToMillimetre);
         if (nodeGrad->num_ext > 0 && nodeGrad->ext_list[0].edata) {
-            mat33 temp = reg_mat44_to_mat33(reinterpret_cast<mat44*>(nodeGrad->ext_list[0].edata));
+            mat33 temp = Mat44ToMat33(reinterpret_cast<mat44*>(nodeGrad->ext_list[0].edata));
             temp = nifti_mat33_inverse(temp);
             reorientation = nifti_mat33_mul(temp, reorientation);
         }
@@ -189,7 +189,7 @@ class VoxelCentricToNodeCentricTest {
                 nodeCoord[1] = static_cast<float>(y);
                 for (int x = 0; x < nodeGrad->nx; x++) {
                     nodeCoord[0] = static_cast<float>(x);
-                    reg_mat44_mul(&transformation, nodeCoord, voxelCoord);
+                    Mat44Mul(transformation, nodeCoord, voxelCoord);
                     // Linear interpolation
                     DataType basisX[2], basisY[2], basisZ[2];
                     const int pre[3] = { Floor(voxelCoord[0]), Floor(voxelCoord[1]), Floor(voxelCoord[2]) };