Skip to content

Commit

Permalink
Make CudaCompute::NormaliseGradient() on a par with CPU #92
Browse files Browse the repository at this point in the history
  • Loading branch information
onurulgen committed Oct 31, 2023
1 parent 3db10fa commit 5eb3163
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 92 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
354
355
8 changes: 5 additions & 3 deletions reg-lib/cuda/CudaCompute.cu
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ void CudaCompute::UpdateControlPointPosition(float *currentDof,
void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int activeTimepoint) {
// TODO Fix reg_getImageGradient_gpu to accept activeTimepoint
CudaDefContent& con = dynamic_cast<CudaDefContent&>(this->con);
reg_getImageGradient_gpu(con.DefContent::GetFloating(),
reg_getImageGradient_gpu(con.Content::GetFloating(),
con.GetFloatingCuda(),
con.GetDeformationFieldCuda(),
con.GetWarpedGradientCuda(),
Expand All @@ -139,8 +139,10 @@ double CudaCompute::GetMaximalLength(bool optimiseX, bool optimiseY, bool optimi
void CudaCompute::NormaliseGradient(double maxGradLength, bool optimiseX, bool optimiseY, bool optimiseZ) {
if (maxGradLength == 0 || (!optimiseX && !optimiseY && !optimiseZ)) return;
CudaF3dContent& con = dynamic_cast<CudaF3dContent&>(this->con);
const size_t voxelsPerVolume = NiftiImage::calcVoxelNumber(con.F3dContent::GetTransformationGradient(), 3);
Cuda::NormaliseGradient(con.GetTransformationGradientCuda(), voxelsPerVolume, static_cast<float>(maxGradLength), optimiseX, optimiseY, optimiseZ);
nifti_image *transGrad = con.F3dContent::GetTransformationGradient();
const size_t voxelsPerVolume = NiftiImage::calcVoxelNumber(transGrad, 3);
if (transGrad->nz <= 1) optimiseZ = false;
Cuda::NormaliseGradient(con.GetTransformationGradientCuda(), voxelsPerVolume, maxGradLength, optimiseX, optimiseY, optimiseZ);
}
/* *************************************************************** */
void CudaCompute::SmoothGradient(float sigma) {
Expand Down
80 changes: 48 additions & 32 deletions reg-lib/cuda/CudaNormaliseGradient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@ __global__ static void GetMaximalLengthKernel(float *dists,
const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (tid < nVoxels) {
float4 gradValue = tex1Dfetch<float4>(imageTexture, tid);
dists[tid] = sqrtf((optimiseX ? gradValue.x * gradValue.x : 0) +
(optimiseY ? gradValue.y * gradValue.y : 0) +
(optimiseZ ? gradValue.z * gradValue.z : 0));
dists[tid] = sqrtf((optimiseX ? Square(gradValue.x) : 0) +
(optimiseY ? Square(gradValue.y) : 0) +
(optimiseZ ? Square(gradValue.z) : 0));
}
}
/* *************************************************************** */
float NiftyReg::Cuda::GetMaximalLength(const float4 *imageCuda,
const size_t& nVoxels,
const bool& optimiseX,
const bool& optimiseY,
const bool& optimiseZ) {
const size_t nVoxels,
const bool optimiseX,
const bool optimiseY,
const bool optimiseZ) {
// Create a texture object for the imageCuda
auto imageTexture = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear,
nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4);
Expand All @@ -42,33 +42,49 @@ float NiftyReg::Cuda::GetMaximalLength(const float4 *imageCuda,
return maxDistance;
}
/* *************************************************************** */
__global__ static void NormaliseGradientKernel(float4 *imageCuda,
const unsigned nVoxels,
const float maxGradLenInv,
const bool optimiseX,
const bool optimiseY,
const bool optimiseZ) {
const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (tid < nVoxels) {
float4 grad = imageCuda[tid];
imageCuda[tid] = make_float4(optimiseX ? grad.x * maxGradLenInv : 0,
optimiseY ? grad.y * maxGradLenInv : 0,
optimiseZ ? grad.z * maxGradLenInv : 0,
grad.w);
}
template<bool optimiseX, bool optimiseY, bool optimiseZ>
void NormaliseGradient(float4 *imageCuda, const size_t nVoxels, const double maxGradLengthInv) {
auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear,
nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4);
auto imageTexture = *imageTexturePtr;
thrust::for_each_n(thrust::device, thrust::make_counting_iterator<unsigned>(0), nVoxels, [=]__device__(const unsigned index) {
const float4& val = tex1Dfetch<float4>(imageTexture, index);
imageCuda[index] = make_float4(optimiseX ? val.x * maxGradLengthInv : 0,
optimiseY ? val.y * maxGradLengthInv : 0,
optimiseZ ? val.z * maxGradLengthInv : 0,
val.w);
});
}
/* *************************************************************** */
template<bool optimiseX, bool optimiseY>
static inline void NormaliseGradient(float4 *imageCuda,
const size_t nVoxels,
const double maxGradLengthInv,
const bool optimiseZ) {
auto normaliseGradient = NormaliseGradient<optimiseX, optimiseY, true>;
if (!optimiseZ) normaliseGradient = NormaliseGradient<optimiseX, optimiseY, false>;
normaliseGradient(imageCuda, nVoxels, maxGradLengthInv);
}
/* *************************************************************** */
template<bool optimiseX>
static inline void NormaliseGradient(float4 *imageCuda,
const size_t nVoxels,
const double maxGradLengthInv,
const bool optimiseY,
const bool optimiseZ) {
auto normaliseGradient = NormaliseGradient<optimiseX, true>;
if (!optimiseY) normaliseGradient = NormaliseGradient<optimiseX, false>;
normaliseGradient(imageCuda, nVoxels, maxGradLengthInv, optimiseZ);
}
/* *************************************************************** */
void NiftyReg::Cuda::NormaliseGradient(float4 *imageCuda,
const size_t& nVoxels,
const float& maxGradLength,
const bool& optimiseX,
const bool& optimiseY,
const bool& optimiseZ) {
const unsigned threads = CudaContext::GetBlockSize()->Arithmetic;
const unsigned blocks = static_cast<unsigned>(Ceil(sqrtf(static_cast<float>(nVoxels) / static_cast<float>(threads))));
const dim3 blockDims(threads, 1, 1);
const dim3 gridDims(blocks, blocks, 1);
NormaliseGradientKernel<<<gridDims, blockDims>>>(imageCuda, static_cast<unsigned>(nVoxels), 1 / maxGradLength, optimiseX, optimiseY, optimiseZ);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
const size_t nVoxels,
const double maxGradLength,
const bool optimiseX,
const bool optimiseY,
const bool optimiseZ) {
auto normaliseGradient = ::NormaliseGradient<true>;
if (!optimiseX) normaliseGradient = ::NormaliseGradient<false>;
normaliseGradient(imageCuda, nVoxels, 1.0 / maxGradLength, optimiseY, optimiseZ);
}
/* *************************************************************** */
18 changes: 9 additions & 9 deletions reg-lib/cuda/CudaNormaliseGradient.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ namespace NiftyReg::Cuda {
* @return The maximal value of the gradient image
*/
float GetMaximalLength(const float4 *imageCuda,
const size_t& nVoxels,
const bool& optimiseX,
const bool& optimiseY,
const bool& optimiseZ);
const size_t nVoxels,
const bool optimiseX,
const bool optimiseY,
const bool optimiseZ);
/* *************************************************************** */
/**
* @brief Normalise the gradient image
Expand All @@ -29,10 +29,10 @@ float GetMaximalLength(const float4 *imageCuda,
* @param optimiseZ Flag to indicate if the z component of the gradient is optimised
*/
void NormaliseGradient(float4 *imageCuda,
const size_t& nVoxels,
const float& maxGradLength,
const bool& optimiseX,
const bool& optimiseY,
const bool& optimiseZ);
const size_t nVoxels,
const double maxGradLength,
const bool optimiseX,
const bool optimiseY,
const bool optimiseZ);
/* *************************************************************** */
} // namespace NiftyReg::Cuda
93 changes: 49 additions & 44 deletions reg-test/reg_test_normaliseGradient.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
class NormaliseGradientTest {
protected:
using TestData = std::tuple<std::string, NiftiImage, NiftiImage, NiftiImage>;
using TestCase = std::tuple<shared_ptr<Platform>, unique_ptr<F3dContent>, TestData, bool, bool, bool>;
using TestCase = std::tuple<std::string, double, double, NiftiImage, NiftiImage>;

inline static vector<TestCase> testCases;

Expand All @@ -26,7 +26,7 @@ class NormaliseGradientTest {

// Create a random number generator
std::mt19937 gen(0);
std::uniform_real_distribution<float> distr(0, 1);
std::uniform_real_distribution<float> distr(0, 100);

// Create a reference 2D image
vector<NiftiImage::dim_t> dimFlo{ 4, 4 };
Expand Down Expand Up @@ -92,11 +92,31 @@ class NormaliseGradientTest {
for (int optimiseY = 0; optimiseY < 2; optimiseY++) {
for (int optimiseZ = 0; optimiseZ < 2; optimiseZ++) {
// Make a copy of the test data
auto td = testData;
auto&& [testName, reference, controlPointGrid, testGrad] = td;
// Add content
auto [testName, reference, controlPointGrid, expTransGrad] = testData;
testName += " " + platform->GetName() + " " + (optimiseX ? "X" : "noX") + " " + (optimiseY ? "Y" : "noY") + " " + (optimiseZ ? "Z" : "noZ");
// Create the content
unique_ptr<F3dContent> content{ contentCreator->Create(reference, reference, controlPointGrid) };
testCases.push_back({ platform, std::move(content), std::move(td), optimiseX, optimiseY, optimiseZ });

// Set the transformation gradient image to host the computation
NiftiImage transGrad = content->GetTransformationGradient();
transGrad.copyData(expTransGrad);
transGrad.disown();
content->UpdateTransformationGradient();

// Calculate the maximal length
unique_ptr<Compute> compute{ platform->CreateCompute(*content) };
const double maxLength = compute->GetMaximalLength(optimiseX, optimiseY, optimiseZ);
const double expMaxLength = GetMaximalLength<float>(expTransGrad, optimiseX, optimiseY, optimiseZ);

// Normalise the gradient
compute->NormaliseGradient(expMaxLength, optimiseX, optimiseY, optimiseZ);
NormaliseGradient<float>(expTransGrad, expMaxLength, optimiseX, optimiseY, optimiseZ);

// Get the results
transGrad = NiftiImage(content->GetTransformationGradient(), NiftiImage::Copy::Image);

// Save for testing
testCases.push_back({ testName, maxLength, expMaxLength, std::move(transGrad), std::move(expTransGrad) });
}
}
}
Expand All @@ -105,7 +125,7 @@ class NormaliseGradientTest {
}

template<typename T>
T GetMaximalLength(const nifti_image* transformationGradient, const bool& optimiseX, const bool& optimiseY, const bool& optimiseZ) {
T GetMaximalLength(const nifti_image* transformationGradient, const bool optimiseX, const bool optimiseY, const bool optimiseZ) {
if (!optimiseX && !optimiseY && !optimiseZ) return 0;
const size_t nVoxelsPerVolume = NiftiImage::calcVoxelNumber(transformationGradient, 3);
const T *ptrX = static_cast<T*>(transformationGradient->data);
Expand Down Expand Up @@ -139,34 +159,34 @@ class NormaliseGradientTest {
}

template<typename T>
void NormaliseGradient(nifti_image* transformationGradient, const T& maxGradLength, const bool& optimiseX, const bool& optimiseY, const bool& optimiseZ) {
void NormaliseGradient(nifti_image *transformationGradient, const double maxGradLength, const bool optimiseX, const bool optimiseY, const bool optimiseZ) {
if (maxGradLength == 0 || (!optimiseX && !optimiseY && !optimiseZ)) return;
const size_t nVoxelsPerVolume = NiftiImage::calcVoxelNumber(transformationGradient, 3);
T *ptrX = static_cast<T*>(transformationGradient->data);
T *ptrY = &ptrX[nVoxelsPerVolume];
T *ptrZ = &ptrY[nVoxelsPerVolume];
if (transformationGradient->nz > 1) {
for (size_t i = 0; i < nVoxelsPerVolume; ++i) {
T valX = 0, valY = 0, valZ = 0;
double valX = 0, valY = 0, valZ = 0;
if (optimiseX)
valX = ptrX[i];
if (optimiseY)
valY = ptrY[i];
if (optimiseZ)
valZ = ptrZ[i];
ptrX[i] = valX / maxGradLength;
ptrY[i] = valY / maxGradLength;
ptrZ[i] = valZ / maxGradLength;
ptrX[i] = static_cast<T>(valX / maxGradLength);
ptrY[i] = static_cast<T>(valY / maxGradLength);
ptrZ[i] = static_cast<T>(valZ / maxGradLength);
}
} else {
for (size_t i = 0; i < nVoxelsPerVolume; ++i) {
T valX = 0, valY = 0;
double valX = 0, valY = 0;
if (optimiseX)
valX = ptrX[i];
if (optimiseY)
valY = ptrY[i];
ptrX[i] = valX / maxGradLength;
ptrY[i] = valY / maxGradLength;
ptrX[i] = static_cast<T>(valX / maxGradLength);
ptrY[i] = static_cast<T>(valY / maxGradLength);
}
}
}
Expand All @@ -176,48 +196,33 @@ TEST_CASE_METHOD(NormaliseGradientTest, "Normalise gradient", "[NormaliseGradien
// Loop over all generated test cases
for (auto&& testCase : testCases) {
// Retrieve test information
auto&& [platform, content, testData, optimiseX, optimiseY, optimiseZ] = testCase;
auto&& [testName, reference, controlPointGrid, testGrad] = testData;
const std::string sectionName = testName + " " + platform->GetName() + " " + (optimiseX ? "X" : "noX") + " " + (optimiseY ? "Y" : "noY") + " " + (optimiseZ ? "Z" : "noZ");
auto&& [sectionName, maxLength, expMaxLength, transGrad, expTransGrad] = testCase;

SECTION(sectionName) {
NR_COUT << "\n**************** Section " << sectionName << " ****************" << std::endl;

// Increase the precision for the output
NR_COUT << std::fixed << std::setprecision(10);

// Set the transformation gradient image to host the computation
NiftiImage transGrad = content->GetTransformationGradient();
transGrad.copyData(testGrad);
transGrad.disown();
content->UpdateTransformationGradient();

// Calculate the maximal length
unique_ptr<Compute> compute{ platform->CreateCompute(*content) };
const auto maxLength = static_cast<float>(compute->GetMaximalLength(optimiseX, optimiseY, optimiseZ));
const auto testLength = GetMaximalLength<float>(testGrad, optimiseX, optimiseY, optimiseZ);
// Check the results
REQUIRE(fabs(maxLength - testLength) < EPS);

// Normalise the gradient
compute->NormaliseGradient(maxLength, optimiseX, optimiseY, optimiseZ);
NormaliseGradient<float>(testGrad, testLength, optimiseX, optimiseY, optimiseZ);
NR_COUT << "Maximal Length=" << maxLength << " | Expected=" << expMaxLength << std::endl;
REQUIRE(fabs(maxLength - expMaxLength) == 0);

// Check the results
transGrad = content->GetTransformationGradient();
const auto transGradPtr = transGrad.data();
const auto testGradPtr = testGrad.data();
transGrad.disown();
for (size_t i = 0; i < testGrad.nVoxels(); ++i) {
const auto expTransGradPtr = expTransGrad.data();
for (size_t i = 0; i < expTransGrad.nVoxels(); ++i) {
const float transGradVal = transGradPtr[i];
const float testGradVal = testGradPtr[i];
const float diff = abs(transGradVal - testGradVal);
if (diff > EPS)
NR_COUT << i << " " << transGradVal << " " << testGradVal << std::endl;
REQUIRE(diff < EPS);
const float expTransGradVal = expTransGradPtr[i];
const float diff = abs(transGradVal - expTransGradVal);
if (diff > 0) {
NR_COUT << "[i]=" << i;
NR_COUT << " | diff=" << diff;
NR_COUT << " | Result=" << transGradVal;
NR_COUT << " | Expected=" << expTransGradVal << std::endl;
}
REQUIRE(diff == 0);
}
// Ensure the termination of content before CudaContext
content.reset();
}
}
}
6 changes: 3 additions & 3 deletions reg-test/reg_test_regr_getDeformationField.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class GetDeformationFieldTest {
testName += " "s + platform->GetName() + " Composition="s + std::to_string(composition) + " Bspline="s + std::to_string(bspline);
unique_ptr<F3dContent> content{ contentCreator->Create(reference, reference, controlPointGrid) };
unique_ptr<Compute> compute{ platform->CreateCompute(*content) };
NiftiImage expDefField(content->GetDeformationField(), NiftiImage::Copy::Image);
NiftiImage expDefField(content->Content::GetDeformationField(), NiftiImage::Copy::Image);
// Compute the deformation field
compute->GetDeformationField(composition, bspline);
NiftiImage defField(content->GetDeformationField(), NiftiImage::Copy::Image);
Expand Down Expand Up @@ -556,10 +556,10 @@ TEST_CASE_METHOD(GetDeformationFieldTest, "Regression Deformation Field from B-s

// Check the results
const auto defFieldPtr = defField.data();
const auto defFieldExpPtr = expDefField.data();
const auto expDefFieldPtr = expDefField.data();
for (auto i = 0; i < expDefField.nVoxels(); i++) {
const float defFieldVal = defFieldPtr[i];
const float expDefFieldVal = defFieldExpPtr[i];
const float expDefFieldVal = expDefFieldPtr[i];
const float diff = abs(defFieldVal - expDefFieldVal);
if (diff > 0) {
NR_COUT << "[i]=" << i;
Expand Down

0 comments on commit 5eb3163

Please sign in to comment.