Skip to content

Commit

Permalink
t
Browse files Browse the repository at this point in the history
  • Loading branch information
archibate committed Jan 28, 2022
1 parent d4fcdfd commit 9540cd4
Show file tree
Hide file tree
Showing 32 changed files with 34,374 additions and 30 deletions.
1 change: 1 addition & 0 deletions 07/07_stencil/03/rwimage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ void write_image(Image const &a, const char *path) {
} else {
ret = stbi_write_bmp(path, nx, ny, comp, p);
}
free(p);
if (!ret) {
perror(path);
exit(-1);
Expand Down
4 changes: 2 additions & 2 deletions 08/10_stencil/01/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ set(CMAKE_BUILD_TYPE Release)

project(hellocuda LANGUAGES CXX CUDA)

add_executable(main main.cu)
target_include_directories(main PUBLIC ../../include)
add_executable(main main.cu stb_image.cpp stb_image_write.cpp)
target_include_directories(main PUBLIC . ../../include)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
92 changes: 64 additions & 28 deletions 08/10_stencil/01/main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,45 +4,81 @@
#include <vector>
#include "CudaAllocator.h"
#include "ticktock.h"
#include "stb_image.h"
#include "stb_image_write.h"

template <int blockSize, class T>
__global__ void parallel_transpose(T *out, T const *in, int nx, int ny) {
// Loads the image file at `path` with stb_image and stores it in `a` as
// floats scaled by 1/255.
//
// The decoded bytes are pixel-interleaved (all channels of a pixel are
// adjacent); `a` is filled channel-planar instead: channel `c` of pixel
// (x, y) ends up at a[c * nx * ny + y * nx + x].
//
// `a` must provide resize() and operator[]. Returns {width, height,
// channel count} as reported by stbi_load. On load failure the path is
// reported through perror() and the process exits with status -1.
template <class A>
std::tuple<int, int, int> read_image(A &a, const char *path) {
    int width = 0;
    int height = 0;
    int channels = 0;
    // The trailing 0 is stb_image's "desired channels" argument.
    unsigned char *pixels = stbi_load(path, &width, &height, &channels, 0);
    if (pixels == nullptr) {
        perror(path);
        exit(-1);
    }

    const int plane = width * height;  // elements per channel plane
    a.resize(plane * channels);

    const float scale = 1.f / 255.f;
    // Walk the decoded buffer pixel by pixel and scatter each channel into
    // its own plane of `a`.
    for (int pixel = 0; pixel < plane; pixel++) {
        for (int c = 0; c < channels; c++) {
            a[c * plane + pixel] = scale * pixels[pixel * channels + c];
        }
    }

    stbi_image_free(pixels);
    return std::make_tuple(width, height, channels);
}

// Writes the channel-planar float image `a` (nx * ny * comp elements, channel
// `c` of pixel (x, y) at a[c * nx * ny + y * nx + x]) to `path`.
//
// Each value is multiplied by 255, clamped to [0, 255] and truncated to an
// unsigned byte, then re-packed pixel-interleaved for stb_image_write.
// The encoder is chosen from the file extension: ".png" -> PNG,
// ".jpg" -> JPEG, anything else -> BMP.
//
// The staging buffer is a std::vector so that it is sized in size_t (no int
// overflow on large images), allocation failure is not silently dereferenced,
// and it is released on every exit path.
//
// If the encoder reports failure, the path is reported through perror() and
// the process exits with status -1.
template <class A>
void write_image(A const &a, int nx, int ny, int comp, const char *path) {
    const size_t plane = (size_t)nx * (size_t)ny;  // elements per channel
    std::vector<unsigned char> p(plane * (size_t)comp);
    for (int c = 0; c < comp; c++) {
        for (int y = 0; y < ny; y++) {
            const size_t row = (size_t)y * (size_t)nx;
            for (int x = 0; x < nx; x++) {
                const float v = a[(size_t)c * plane + row + x] * 255.f;
                p[(row + x) * (size_t)comp + c] =
                    (unsigned char)std::max(0.f, std::min(255.f, v));
            }
        }
    }

    int ret = 0;
    auto pt = strrchr(path, '.');  // last '.' in the path, or null if none
    if (pt && !strcmp(pt, ".png")) {
        // Final argument is the row stride in bytes; 0 is kept from the
        // original call (NOTE(review): presumably selects the default stride).
        ret = stbi_write_png(path, nx, ny, comp, p.data(), 0);
    } else if (pt && !strcmp(pt, ".jpg")) {
        // Final argument is the JPEG quality; 0 is kept from the original
        // call (NOTE(review): presumably selects the library default).
        ret = stbi_write_jpg(path, nx, ny, comp, p.data(), 0);
    } else {
        ret = stbi_write_bmp(path, nx, ny, comp, p.data());
    }
    if (!ret) {
        perror(path);
        exit(-1);
    }
}

// Horizontal box blur over a row-major nx-by-ny float image.
//
// Each thread produces one output pixel (x, y): the mean of the `nblur`
// input pixels in(x .. x + nblur - 1, y). Reads past the right edge are
// clamped to the last column (nx - 1).
//
// Launch layout: 2D blocks of blockSize x blockSize threads; the template
// parameter `blockSize` must equal blockDim.x and blockDim.y. The grid may
// overshoot the image: threads outside nx x ny return immediately.
// No shared memory is used.
//
// `in` and `out` must each hold nx * ny floats and must not alias, since
// neighbouring threads read pixels that other threads write.
template <int nblur, int blockSize>
__global__ void parallel_xblur(float *out, float const *in, int nx, int ny) {
    int x = blockIdx.x * blockSize + threadIdx.x;
    int y = blockIdx.y * blockSize + threadIdx.y;
    if (x >= nx || y >= ny) return;  // tail guard for partial blocks
    float sum = 0.0f;
    for (int i = 0; i < nblur; i++) {
        // std::min in device code relies on --expt-relaxed-constexpr.
        sum += in[y * nx + std::min(x + i, nx - 1)];
    }
    out[y * nx + x] = sum / nblur;
}

// Loads "original.jpg", blurs its first channel horizontally on the GPU and
// writes the result to /tmp/out.png, then launches `display` on it.
int main() {
    std::vector<float, CudaAllocator<float>> in;
    std::vector<float, CudaAllocator<float>> out;

    // read_image fills `in` channel-planar; only the first nx * ny elements
    // (channel 0) are processed below, so the channel count is ignored.
    auto [nx, ny, _] = read_image(in, "original.jpg");
    out.resize(nx * ny);

    constexpr int blockSize = 32;
    // Round the grid up so images whose sizes are not multiples of blockSize
    // are fully covered; the kernel bounds-checks the overshoot.
    dim3 grid((nx + blockSize - 1) / blockSize,
              (ny + blockSize - 1) / blockSize, 1);
    dim3 block(blockSize, blockSize, 1);

    TICK(parallel_xblur);
    parallel_xblur<32, blockSize><<<grid, block>>>
        (out.data(), in.data(), nx, ny);
    checkCudaErrors(cudaGetLastError());       // launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());  // errors raised during execution
    TOCK(parallel_xblur);

    write_image(out, nx, ny, 1, "/tmp/out.png");
    system("display /tmp/out.png &");
    return 0;
}
Binary file added 08/10_stencil/01/original.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions 08/10_stencil/01/stb_image.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
Loading

0 comments on commit 9540cd4

Please sign in to comment.