forked from taskflow/taskflow
-
Notifications
You must be signed in to change notification settings - Fork 0
/
saxpy.cu
83 lines (65 loc) · 2.2 KB
/
saxpy.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <taskflow/taskflow.hpp>
// Kernel: saxpy
// Computes y[i] = a*x[i] + y[i] for every index i in [0, n).
// Launch layout: 1-D grid of 1-D blocks; each thread handles the single
// element at blockIdx.x*blockDim.x + threadIdx.x, and threads whose index
// is at or beyond n do nothing.
__global__ void saxpy(int n, float a, float *x, float *y) {
  const int idx = blockIdx.x*blockDim.x + threadIdx.x;
  // Threads indexed past the end of the arrays have no element to update.
  if (idx >= n) {
    return;
  }
  y[idx] = a*x[idx] + y[idx];
}
// Function: main
// Builds a task graph that allocates host and device buffers, runs saxpy on
// the GPU through a cudaFlow, checks the result on the host, and frees the
// device buffers. The graph is run once and then dumped to stdout.
// Returns 0 after the dump; aborts if a device allocation fails.
int main() {

  const unsigned N = 1<<20;

  // Reports a failed CUDA runtime call on stderr; returns true on success.
  auto cuda_ok = [](cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
      return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << '\n';
    return false;
  };

  tf::Taskflow taskflow ("saxpy-flow");
  tf::Executor executor;

  std::vector<float> hx, hy;

  float* dx {nullptr};
  float* dy {nullptr};

  // allocate x
  auto allocate_x = taskflow.emplace([&]() {
    std::cout << "allocating host x and device x ...\n";
    hx.resize(N, 1.0f);
    // The copies and the kernel cannot run without this buffer, so a failed
    // allocation is fatal.
    if (!cuda_ok(cudaMalloc(&dx, N*sizeof(float)), "cudaMalloc(dx)")) {
      std::abort();
    }
  }).name("allocate_x");

  // allocate y
  auto allocate_y = taskflow.emplace([&]() {
    std::cout << "allocating host y and device y ...\n";
    hy.resize(N, 2.0f);
    if (!cuda_ok(cudaMalloc(&dy, N*sizeof(float)), "cudaMalloc(dy)")) {
      std::abort();
    }
  }).name("allocate_y");

  // saxpy
  auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf) {
    std::cout << "running cudaflow ...\n";
    auto h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x");
    auto h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y");
    auto d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x");
    auto d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y");
    // saxpy takes its length as int, so pass N as int rather than unsigned.
    auto kernel = cf.kernel(
      (N+255)/256, 256, 0, saxpy, static_cast<int>(N), 2.0f, dx, dy
    ).name("saxpy");
    // Both uploads must finish before the kernel; both downloads wait for it.
    kernel.succeed(h2d_x, h2d_y)
          .precede(d2h_x, d2h_y);
  }).name("saxpy");

  cudaflow.succeed(allocate_x, allocate_y);

  // Add a verification task
  // With x = 1 and y = 2 and a = 2, x must come back as 1 and y as 4.
  auto verifier = taskflow.emplace([&](){
    float max_error = 0.0f;
    for (size_t i = 0; i < N; i++) {
      // std::fabs keeps the comparison in floating point.
      max_error = std::max(max_error, std::fabs(hx[i]-1.0f));
      max_error = std::max(max_error, std::fabs(hy[i]-4.0f));
    }
    std::cout << "saxpy finished with max error: " << max_error << '\n';
  }).succeed(cudaflow).name("verify");

  // free memory
  auto deallocate_x = taskflow.emplace([&](){
    std::cout << "deallocating device x ...\n";
    // A failed free is reported but not fatal: no later task uses dx.
    (void)cuda_ok(cudaFree(dx), "cudaFree(dx)");
    dx = nullptr;
  }).name("deallocate_x");

  auto deallocate_y = taskflow.emplace([&](){
    std::cout << "deallocating device y ...\n";
    (void)cuda_ok(cudaFree(dy), "cudaFree(dy)");
    dy = nullptr;
  }).name("deallocate_y");

  verifier.precede(deallocate_x, deallocate_y);

  executor.run(taskflow).wait();

  std::cout << "dumping the taskflow ...\n";
  taskflow.dump(std::cout);

  return 0;
}