You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hello, I modified the roi_pool_kernel.cu and made it accept 3D input, but I'm not very familiar with the cuda code, can anyone helps me to check if I made some mistakes? Very appreciate!
My input shape is (B, C, H, W, D), roi shape is (num_of_roi, 7) (the order of roi shape is (label, min_width, min_depth, min_height, max_width, max_depth, max_height))
And the output I expect is: (num_of_roi, C, pool_size, pool_size, pool_size)
(I don't know why GitHub break my code to multiple parts, sorry for the inconvenient)
template global void RoIPoolForward(
const int nthreads,
const T* input,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int depth,
const int pooled_height,
const int pooled_width,
const int pooled_depth,
const T* rois,
T* output,
int* argmax_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int pd = (index / (pooled_widthpooled_height)) % pooled_depth;
int c = (index / (pooled_width * pooled_height pooled_depth)) % channels;
int n = index / (pooled_width * pooled_height* pooled_depth * channels);
const T* offset_rois = rois + n * 7;
int roi_batch_ind = offset_rois[0];
int roi_start_w = round(offset_rois[1] * (284/62));//spatial_scale); //for spatial need to change
int roi_start_h = round(offset_rois[3] * (266/60));//spatial_scale); //different side need different number
int roi_start_d = round(offset_rois[2] * (316/124));//spatial_scale);
int roi_end_w = round(offset_rois[4] * (284/62));//spatial_scale);
int roi_end_h = round(offset_rois[6] * (266/60));//spatial_scale);
int roi_end_d = round(offset_rois[5] * (316/124));//spatial_scale);
// Force malformed ROIs to be 1x1
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
int roi_depth = max(roi_end_d - roi_start_d + 1, 1);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T bin_size_d = static_cast<T>(roi_depth) / static_cast<T>(pooled_depth);
int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
int dstart = static_cast<int>(floor(static_cast<T>(pd) * bin_size_d));
int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
int dend = static_cast<int>(ceil(static_cast<T>(pd + 1) * bin_size_d));
// Add roi offsets and clip to input boundaries
hstart = min(max(hstart + roi_start_h, 0), height);
hend = min(max(hend + roi_start_h, 0), height);
wstart = min(max(wstart + roi_start_w, 0), width);
wend = min(max(wend + roi_start_w, 0), width);
dstart = min(max(dstart + roi_start_d, 0), depth);
dend = min(max(dend + roi_start_d, 0), depth);
bool is_empty = (hend <= hstart) || (wend <= wstart) || (dend <= dstart);
// Define an empty pooling region to be zero
T maxval = is_empty ? 0 : -FLT_MAX;
// If nothing is pooled, argmax = -1 causes nothing to be backprop'd
int maxidx = -1;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width * depth;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for (int d = dstart; d < dend; ++d) {
int input_index = d*width*height + h * width + w; //h*depth*width + w*depth + d; //
if (offset_input[input_index] > maxval) {
maxval = offset_input[input_index];
maxidx = input_index;
}
}
}
}
output[index] = maxval;
argmax_data[index] = maxidx;
}
}
template global void RoIPoolBackward(
const int nthreads,
const T* grad_output,
const int* argmax_data,
const int channels,
const int height,
const int width,
const int depth,
const int pooled_height,
const int pooled_width,
const int pooled_depth,
T* grad_input,
const T* rois,
const int n_stride,
const int c_stride,
const int h_stride,
const int w_stride,
const int d_stride) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int pd = (index / (pooled_widthpooled_height)) % pooled_depth;
int c = (index / (pooled_width * pooled_height pooled_depth)) % channels;
int n = index / (pooled_width * pooled_height* pooled_depth * channels);
//int c = (index / pooled_width / pooled_height) % channels;
//int n = index / pooled_width / pooled_height / channels;
std::tuple<torch::Tensor, torch::Tensor> roi_pool_forward3d_cuda(const torch::Tensor& input,
const torch::Tensor& rois,
const float spatial_scale,
const int output_size) {
AT_ASSERTM(input.is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.is_cuda(), "rois must be a CUDA tensor");
const int num_rois = rois.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int depth = input.size(4);
const int pooling_width = output_size;
const int pooling_height = output_size;
const int pooling_depth = output_size;
const auto total_size = num_rois * pooling_height * pooling_width * pooling_depth * channels;
auto output = torch::empty(
{num_rois, channels, pooling_height, pooling_width, pooling_depth}, input.options());
auto argmax = torch::zeros(
{num_rois, channels, pooling_height, pooling_width, pooling_depth},
input.options().dtype(torch::kInt));
const dim3 grid(std::min((total_size + 512 - 1) / 512, 4*4096));
const dim3 block(512);
if (output.numel() == 0) {
return std::make_tuple(output, argmax);
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "RoIPool_forward", [&] {
RoIPoolForward<scalar_t><<<grid, block>>>(
total_size,
input.contiguous().data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
depth,
pooling_width,
pooling_height,
pooling_depth,
rois.contiguous().data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
argmax.data_ptr<int>());
});
return std::make_tuple(output, argmax);
}
torch::Tensor roi_pool_backward3d_cuda(const torch::Tensor& grad,
const torch::Tensor& argmax,
const torch::Tensor& input_size,
const torch::Tensor& rois) {
// Check if input tensors are CUDA tensors
AT_ASSERTM(grad.is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(argmax.is_cuda(), "argmax must be a CUDA tensor");
auto input_size_a = input_size.accessor<int,1>();
const int batch_size = input_size_a[0];
const int channels = input_size_a[1];
const int height = input_size_a[2];
const int width = input_size_a[3];
const int depth = input_size_a[4];
const int num_rois = argmax.size(0);
const int pooling_width = argmax.size(2);
const int pooling_height = argmax.size(3);
const int pooling_depth = argmax.size(4);
const auto total_size = num_rois * pooling_height * pooling_width * pooling_depth * channels;
auto grad_input =
torch::zeros({batch_size, channels, width, depth, height}, grad.options());
const dim3 grid(std::min((total_size + 512 - 1) / 512, 4*4096));
const dim3 block(512);
// handle possibly empty gradients
if (grad.numel() == 0) {
return grad_input;
}
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad.stride(0);
int c_stride = grad.stride(1);
int h_stride = grad.stride(2);
int w_stride = grad.stride(3);
int d_stride = grad.stride(4);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "RoIPool_backward", [&] {
RoIPoolBackward<scalar_t><<<grid, block>>>(
grad.numel(),
grad.data_ptr<scalar_t>(),
argmax.contiguous().data_ptr<int>(),
channels,
height,
width,
depth,
pooling_width,
pooling_height,
pooling_depth,
grad_input.data_ptr<scalar_t>(),
rois.contiguous().data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride,
d_stride);
});
return grad_input;
}
``
The text was updated successfully, but these errors were encountered:
Hello, I modified the roi_pool_kernel.cu and made it accept 3D input, but I'm not very familiar with the cuda code, can anyone helps me to check if I made some mistakes? Very appreciate!
My input shape is (B, C, H, W, D), roi shape is (num_of_roi, 7) (the order of roi shape is (label, min_width, min_depth, min_height, max_width, max_depth, max_height))
And the output I expect is: (num_of_roi, C, pool_size, pool_size, pool_size)
(I don't know why GitHub break my code to multiple parts, sorry for the inconvenient)
`
#include <torch/extension.h>
#include <THC/THCAtomics.cuh>
#include "cuda_helpers.h"
template
global void RoIPoolForward(
const int nthreads,
const T* input,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int depth,
const int pooled_height,
const int pooled_width,
const int pooled_depth,
const T* rois,
T* output,
int* argmax_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int pd = (index / (pooled_widthpooled_height)) % pooled_depth;
int c = (index / (pooled_width * pooled_height pooled_depth)) % channels;
int n = index / (pooled_width * pooled_height* pooled_depth * channels);
}
}
template
global void RoIPoolBackward(
const int nthreads,
const T* grad_output,
const int* argmax_data,
const int channels,
const int height,
const int width,
const int depth,
const int pooled_height,
const int pooled_width,
const int pooled_depth,
T* grad_input,
const T* rois,
const int n_stride,
const int c_stride,
const int h_stride,
const int w_stride,
const int d_stride) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int pd = (index / (pooled_widthpooled_height)) % pooled_depth;
int c = (index / (pooled_width * pooled_height pooled_depth)) % channels;
int n = index / (pooled_width * pooled_height* pooled_depth * channels);
//int c = (index / pooled_width / pooled_height) % channels;
//int n = index / pooled_width / pooled_height / channels;
}
}
std::tuple<torch::Tensor, torch::Tensor> roi_pool_forward3d_cuda(const torch::Tensor& input,
const torch::Tensor& rois,
const float spatial_scale,
const int output_size) {
AT_ASSERTM(input.is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.is_cuda(), "rois must be a CUDA tensor");
}
torch::Tensor roi_pool_backward3d_cuda(const torch::Tensor& grad,
const torch::Tensor& argmax,
const torch::Tensor& input_size,
const torch::Tensor& rois) {
// Check if input tensors are CUDA tensors
AT_ASSERTM(grad.is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(argmax.is_cuda(), "argmax must be a CUDA tensor");
}
``
The text was updated successfully, but these errors were encountered: