forked from triSYCL/triSYCL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
task.hpp
327 lines (256 loc) · 9.3 KB
/
task.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
#ifndef TRISYCL_SYCL_TASK_HPP
#define TRISYCL_SYCL_TASK_HPP
/** \file The concept of task behind the scene
Ronan at Keryell point FR
This file is distributed under the University of Illinois Open Source
License. See LICENSE.TXT for details.
*/
#include <condition_variable>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
#ifdef TRISYCL_OPENCL
#include <boost/compute.hpp>
#endif
#include "CL/sycl/accessor/detail/accessor_base.hpp"
#include "CL/sycl/buffer/detail/buffer_base.hpp"
#include "CL/sycl/detail/debug.hpp"
#include "CL/sycl/kernel.hpp"
#include "CL/sycl/queue/detail/queue.hpp"
namespace cl {
namespace sycl {
namespace detail {
/** The abstraction to represent SYCL tasks executing inside command_group
"enable_shared_from_this" allows to access the shared_ptr behind the
scene.
*/
struct task : public std::enable_shared_from_this<task>,
public detail::debug<task> {
/** List of the buffers used by this task
\todo Use a set to check that some buffers are not used many
times at least on writing
*/
std::vector<std::shared_ptr<detail::buffer_base>> buffers_in_use;
/// The tasks producing the buffers used by this task
std::vector<std::shared_ptr<detail::task>> producer_tasks;
/// Keep track of any prologue to be executed before the kernel
std::vector<std::function<void(void)>> prologues;
/// Keep track of any epilogue to be executed after the kernel
std::vector<std::function<void(void)>> epilogues;
/// Store if the execution ended, to be notified by task_ready
bool execution_ended = false;
/// To signal when this task is ready
std::condition_variable ready;
/// To protect the access to the condition variable
std::mutex ready_mutex;
/** Keep track of the queue used to submission to notify kernel completion
or to run OpenCL kernels on */
std::shared_ptr<detail::queue> owner_queue;
/// The OpenCL-compatible kernel run by this task, if any
std::shared_ptr<detail::kernel> kernel;
/** The accessors indexed by their creation order
This is used to relate a kernel parameter of a kernel generated
by the device compiler to its accessor. */
std::vector<std::weak_ptr<detail::accessor_base>> accessors;
/// Create a task from a submitting queue
task(const std::shared_ptr<detail::queue> &q)
: owner_queue { q } {}
/// Add a new task to the task graph and schedule for execution
void schedule(std::function<void(void)> f) {
/* To keep a copy of the task shared_ptr after the end of the
command group, capture it by copy in the following lambda. This
should be easier in C++17 with move semantics on capture
*/
auto task = shared_from_this();
auto execution = [=] {
// Wait for the required tasks to be ready
task->wait_for_producers();
task->prelude();
TRISYCL_DUMP_T("Execute the kernel");
// Execute the kernel
f();
task->postlude();
// Release the buffers that have been written by this task
task->release_buffers();
// Notify the waiting tasks that we are done
task->notify_consumers();
// Notify the queue we are done
owner_queue->kernel_end();
TRISYCL_DUMP_T("Task thread exit");
};
/* Notify the queue that there is a kernel submitted to the
queue. Do not do it in the task contructor so that we can deal
with command group without kernel and if we put it inside the
thread, the queue may have finished before the thread is
scheduled */
owner_queue->kernel_start();
/* \todo it may be implementable with packaged_task that would
deal with exceptions in kernels
*/
#ifndef TRISYCL_NO_ASYNC
/* If in asynchronous execution mode, execute the functor in a new
thread */
std::thread thread(execution);
TRISYCL_DUMP_T("Task thread started");
/** Detach the thread since it will synchronize by its own means
\todo This is an issue if there is an exception in the kernel
*/
thread.detach();
#else
// Just a synchronous execution otherwise
execution();
#endif
}
/// Wait for the required producer tasks to be ready
void wait_for_producers() {
TRISYCL_DUMP_T("Task " << this << " waits for the producer tasks");
for (auto &t : producer_tasks)
t->wait();
// We can let the producers rest in peace
producer_tasks.clear();
}
/// Release the buffers that have been used by this task
void release_buffers() {
TRISYCL_DUMP_T("Task " << this << " releases the written buffers");
for (auto b: buffers_in_use)
b->release();
buffers_in_use.clear();
}
/// Notify the waiting tasks that we are done
void notify_consumers() {
TRISYCL_DUMP_T("Notify all the task waiting for this task " << this);
{
std::unique_lock<std::mutex> ul { ready_mutex };
execution_ended = true;
}
/* \todo Verify that the memory model with the notify does not
require some fence or atomic */
ready.notify_all();
}
/** Wait for this task to be ready
This is to be called from another thread
*/
void wait() {
TRISYCL_DUMP_T("The task wait for task " << this << " to end");
std::unique_lock<std::mutex> ul { ready_mutex };
ready.wait(ul, [&] { return execution_ended; });
}
/** Register a buffer to this task
This is how the dependency graph is incrementally built.
*/
void add_buffer(std::shared_ptr<detail::buffer_base> &buf,
bool is_write_mode) {
TRISYCL_DUMP_T("Add buffer " << buf << " in task " << this);
/* Keep track of the use of the buffer to notify its release at
the end of the execution */
buffers_in_use.push_back(buf);
// To be sure the buffer does not disappear before the kernel can run
buf->use();
std::shared_ptr<detail::task> latest_producer;
if (is_write_mode) {
/* Set this task as the latest producer of the buffer so that
another kernel may wait on this task */
latest_producer = buf->set_latest_producer(shared_from_this());
}
else
latest_producer = buf->get_latest_producer();
/* If the buffer is to be produced by a task, add the task in the
producer list to wait on it before running the task core
If a buffer is accessed first in write mode and then in read mode,
the task will add itself as a producer and will wait for itself
when calling \c wait_for_producers, we avoid this by checking that
\c latest_producer is not \c this
*/
if (latest_producer && latest_producer != shared_from_this())
producer_tasks.push_back(latest_producer);
}
/// Execute the prologues
void prelude() {
TRISYCL_DUMP_T("task::prelude");
for (const auto &p : prologues)
p();
/* Free the functors that may own an accessor owning a buffer
preventing the command group to complete */
prologues.clear();
}
/// Execute the epilogues
void postlude() {
for (const auto &p : epilogues)
p();
/* Free the functors that may own an accessor owning a buffer
preventing the command group to complete */
epilogues.clear();
}
/// Add a function to the prelude to run before kernel execution
void add_prelude(const std::function<void(void)> &f) {
TRISYCL_DUMP_T("task::add_prelude");
prologues.push_back(f);
}
/// Add a function to the postlude to run after kernel execution
void add_postlude(const std::function<void(void)> &f) {
epilogues.push_back(f);
}
/// Get the queue behind the task to run a kernel on
auto get_queue() {
return owner_queue;
}
/// Set the OpenCL-compatible kernel running this task if any
void set_kernel(const std::shared_ptr<cl::sycl::detail::kernel> &k) {
kernel = k;
}
/** Get the kernel running if any
\todo Specify this error in the spec
*/
cl::sycl::detail::kernel &get_kernel() {
if (!kernel)
throw non_cl_error("Cannot use an OpenCL kernel in this context");
return *kernel;
}
/// Set a kernel argument by address
void set_arg(int arg_index, std::size_t arg_size, const void *scalar_value) {
#ifdef TRISYCL_OPENCL
// Forward to the OpenCL kernel
get_kernel().get_boost_compute().set_arg(arg_index,
arg_size,
scalar_value);
#else
throw non_cl_error("Not compiled with OpenCL support");
#endif
}
/// Set a kernel argument by value
template <typename T>
void set_arg(int arg_index, const T &scalar_value) {
#ifdef TRISYCL_OPENCL
// Forward to the OpenCL kernel
get_kernel().get_boost_compute().set_arg(arg_index, scalar_value);
#else
throw non_cl_error("Not compiled with OpenCL support");
#endif
}
#ifdef TRISYCL_OPENCL
/// Get the Boost.Compute buffer for an accessor of the task
auto get_compute_buffer(std::size_t order) {
return accessors[order].lock()->get_cl_buffer();
}
#endif
/** Register an accessor and return its registration order
I would prefer to name this method \c register but this is a C++
keyword...
*/
std::size_t register_accessor(std::weak_ptr<detail::accessor_base> a) {
accessors.push_back(a);
return accessors.size() - 1;
}
};
}
}
}
/*
# Some Emacs stuff:
### Local Variables:
### ispell-local-dictionary: "american"
### eval: (flyspell-prog-mode)
### End:
*/
#endif // TRISYCL_SYCL_TASK_HPP