Skip to content

Commit

Permalink
Introduce a first version of h_item which is basically a copy of nd_item
Browse files Browse the repository at this point in the history
  • Loading branch information
keryell committed Jun 3, 2018
1 parent 1634b0d commit 351789e
Show file tree
Hide file tree
Showing 18 changed files with 298 additions and 65 deletions.
1 change: 1 addition & 0 deletions include/CL/sycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "CL/sycl/exception.hpp"
#include "CL/sycl/group.hpp"
#include "CL/sycl/handler.hpp"
#include "CL/sycl/h_item.hpp"
#include "CL/sycl/id.hpp"
#include "CL/sycl/image.hpp"
#include "CL/sycl/item.hpp"
Expand Down
23 changes: 4 additions & 19 deletions include/CL/sycl/group.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef TRISYCL_SYCL_GROUP_HPP
#define TRISYCL_SYCL_GROUP_HPP

/** \file The OpenCL SYCL nd_item<>
/** \file The OpenCL SYCL group<>
Ronan at Keryell point FR
Expand All @@ -13,6 +13,7 @@
#include <functional>

#include "CL/sycl/detail/linear_id.hpp"
#include "CL/sycl/h_item.hpp"
#include "CL/sycl/id.hpp"
#include "CL/sycl/nd_range.hpp"
#include "CL/sycl/range.hpp"
Expand Down Expand Up @@ -175,28 +176,12 @@ struct group {


/** Loop on the work-items inside a work-group
\todo Add this method in the specification
*/
void parallel_for_work_item(std::function<void(nd_item<dimensionality>)> f)
*/
void parallel_for_work_item(std::function<void(h_item<dimensionality>)> f)
const {
detail::parallel_for_workitem(*this, f);
}


/** Loop on the work-items inside a work-group
\todo Add this method in the specification
*/
void parallel_for_work_item(std::function<void(item<dimensionality>)> f)
const {
auto item_adapter = [=] (nd_item<dimensionality> ndi) {
item<dimensionality> i = ndi.get_item();
f(i);
};
detail::parallel_for_workitem(*this, item_adapter);
}

};

/// @} End the parallelism Doxygen group
Expand Down
244 changes: 244 additions & 0 deletions include/CL/sycl/h_item.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
#ifndef TRISYCL_SYCL_H_ITEM_HPP
#define TRISYCL_SYCL_H_ITEM_HPP

/** \file The OpenCL SYCL h_item<>
Ronan at Keryell point FR
This file is distributed under the University of Illinois Open Source
License. See LICENSE.TXT for details.
*/

#include <cstddef>

#include "CL/sycl/access.hpp"
#include "CL/sycl/detail/linear_id.hpp"
#include "CL/sycl/detail/unimplemented.hpp"
#include "CL/sycl/id.hpp"
#include "CL/sycl/item.hpp"
#include "CL/sycl/nd_range.hpp"
#include "CL/sycl/range.hpp"

namespace cl {
namespace sycl {

/** \addtogroup parallelism Expressing parallelism through kernels
@{
*/

/** \c h_item<int dimensions> identifies an instance of a \c
group::parallel_for_work_item function object executing at each
point in a local \c range<int dimensions> passed to a \c
parallel_for_work_item call or to the corresponding \c
parallel_for_work_group call if no range is passed to the \c
parallel_for_work_item call. It encapsulates enough information to
identify the work-item’s local and global items according to the
information given to \c parallel_for_work_group (physical ids) as
well as the work-item’s logical local items in the flexible
range. All returned items objects are offset-less. Instances of
the \c h_item<int dimensions> class are not user-constructible and
are passed by the runtime to each instance of the function object.
*/
template <int Dimensions = 1>
struct h_item {
/// \todo add this Boost::multi_array or STL concept to the
/// specification?
static constexpr auto dimensionality = Dimensions;

private:

id<Dimensions> global_index;
/* This is a cached value since it can be computed from global_index and
ND_range */
id<Dimensions> local_index;
nd_range<Dimensions> ND_range;

public:

/** Create an empty nd_item<> from an nd_range<>
\todo This is for the triSYCL implementation which is expected to
call set_global() and set_local() later. This should be hidden to
the user.
*/
h_item(nd_range<Dimensions> ndr) : ND_range { ndr } {}


/** Create a full nd_item
\todo This is for validation purpose. Hide this to the programmer
somehow
*/
h_item(id<Dimensions> global_index,
nd_range<Dimensions> ndr) :
global_index { global_index },
// Compute the local index using the offset and the group size
local_index
{ (global_index - ndr.get_offset())%id<Dimensions> { ndr.get_local() } },
ND_range { ndr }
{}


/** To be able to copy and assign nd_item, use default constructors too
\todo Make most of them protected, reserved to implementation
*/
h_item() = default;


/** Return the constituent global id representing the work-item's
position in the global iteration space
*/
id<Dimensions> get_global() const { return global_index; }


/** Return the constituent element of the global id representing the
work-item's position in the global iteration space in the given
dimension
*/
size_t get_global(int dimension) const { return get_global()[dimension]; }


/** Return the flattened id of the current work-item after subtracting
the offset
*/
size_t get_global_linear_id() const {
return detail::linear_id(get_global_range(), get_global(), get_offset());
}


/** Return the constituent local id representing the work-item's
position within the current work-group
*/
id<Dimensions> get_local() const { return local_index; }


/** Return the constituent element of the local id representing the
work-item's position within the current work-group in the given
dimension
*/
size_t get_local(int dimension) const { return get_local()[dimension]; }


/** Return the flattened id of the current work-item within the current
work-group
*/
size_t get_local_linear_id() const {
return detail::linear_id(get_local_range(), get_local());
}


/** Return the constituent group group representing the work-group's
position within the overall nd_range
*/
id<Dimensions> get_group() const {
/* Convert get_local_range() to an id<> to remove ambiguity into using
implicit conversion either from range<> to id<> or the opposite */
return get_global()/id<Dimensions> { get_local_range() };
}


/** Return the constituent element of the group id representing the
work-group;s position within the overall nd_range in the given
dimension.
*/
size_t get_group(int dimension) const {
return get_group()[dimension];
}


/// Return the flattened id of the current work-group
size_t get_group_linear_id() const {
return detail::linear_id(get_num_groups(), get_group());
}


/// Return the number of groups in the nd_range
id<Dimensions> get_num_groups() const {
return get_nd_range().get_group();
}

/// Return the number of groups for dimension in the nd_range
size_t get_num_groups(int dimension) const {
return get_num_groups()[dimension];
}


/// Return a range<> representing the dimensions of the nd_range<>
range<Dimensions> get_global_range() const {
return get_nd_range().get_global();
}


/// Return a range<> representing the dimensions of the current work-group
range<Dimensions> get_local_range() const {
return get_nd_range().get_local();
}


/** Return an id<> representing the n-dimensional offset provided to the
constructor of the nd_range<> and that is added by the runtime to the
global-ID of each work-item
*/
id<Dimensions> get_offset() const { return get_nd_range().get_offset(); }


/// Return the nd_range<> of the current execution
nd_range<Dimensions> get_nd_range() const { return ND_range; }


/** Allows projection down to an item
\todo Add to the specification
*/
item<Dimensions> get_item() const {
return { get_global_range(), get_global(), get_offset() };
}


/** Execute a barrier with memory ordering on the local address space,
global address space or both based on the value of flag
The current work-item will wait at the barrier until all work-items
in the current work-group have reached the barrier.
In addition, the barrier performs a fence operation ensuring that all
memory accesses in the specified address space issued before the
barrier complete before those issued after the barrier
*/
void barrier(access::fence_space flag =
access::fence_space::global_and_local) const {
#if defined(_OPENMP) && !defined(TRISYCL_NO_BARRIER)
/* Use OpenMP barrier in the implementation with 1 OpenMP thread per
work-item of the work-group */
#pragma omp barrier
#else
// \todo To be implemented efficiently otherwise
detail::unimplemented();
#endif
}


// For the triSYCL implementation, need to set the local index
void set_local(id<Dimensions> Index) { local_index = Index; }


// For the triSYCL implementation, need to set the global index
void set_global(id<Dimensions> Index) { global_index = Index; }

};

/// @} End the parallelism Doxygen group

}
}

/*
# Some Emacs stuff:
### Local Variables:
### ispell-local-dictionary: "american"
### eval: (flyspell-prog-mode)
### End:
*/

#endif // TRISYCL_SYCL_H_ITEM_HPP
4 changes: 2 additions & 2 deletions include/CL/sycl/parallelism/detail/parallelism.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ void parallel_for_workitem(const group<Dimensions> &g,

range<Dimensions> l_r = g.get_nd_range().get_local();
auto tot = l_r.get(0);
for (int i = 1; i < (int) Dimensions; ++i){
for (int i = 1; i < (int) Dimensions; ++i) {
tot *= l_r.get(i);
}
#pragma omp parallel num_threads(tot)
Expand All @@ -341,7 +341,7 @@ void parallel_for_workitem(const group<Dimensions> &g,
}
#else
// In a sequential execution there is only one index processed at a time
nd_item<Dimensions> index { g.get_nd_range() };
h_item<Dimensions> index { g.get_nd_range() };
// To iterate on the local work-item
id<Dimensions> local;

Expand Down
4 changes: 2 additions & 2 deletions tests/2014-04-21-HPC-GPU_Meetup/slide_22.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ my_queue.submit([&](handler &cgh)
cgh.parallel_for_work_group<class hierarchical>(nd_range<>(range<>(size),
range<>(groupsize)),
[=](group<> group) {
group.parallel_for_work_item([=](nd_item<1> tile) {
out_access[tile] = in_access[tile] * 2;
group.parallel_for_work_item([=](h_item<1> tile) {
out_access[tile.get_global()] = in_access[tile.get_global()] * 2;
});
});
});
Expand Down
12 changes: 7 additions & 5 deletions tests/accessor/local_accessor_hierarchical_convolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,19 @@ int test_main(int argc, char *argv[]) {
N - (conv_kernel.get_count() - 1),
N - (conv_kernel.get_count() - 1) },
[=](group<1> g) {
g.parallel_for_work_item([&](nd_item<1> i) {
g.parallel_for_work_item([&](h_item<1> i) {
// Cache the convolution kernel in local memory
if (i.get_local(0) < a_conv_kernel.get_count())
a_conv_cache[i.get_local(0)] = a_conv_kernel[i];
a_conv_cache[i.get_local(0)] =
a_conv_kernel[i.get_global()];
});
// An implicit barrier happens here
g.parallel_for_work_item([&](nd_item<1> i) {
a_result[i] = 0;
g.parallel_for_work_item([&](h_item<1> i) {
a_result[i.get_global()] = 0;
// The convolution
for (unsigned int j = 0; j < conv_kernel.get_count(); ++j)
a_result[i] += a_a[i.get_local(0) + j]*a_conv_cache[j];
a_result[i.get_global()] +=
a_a[i.get_local(0) + j]*a_conv_cache[j];
});
});
});
Expand Down
8 changes: 4 additions & 4 deletions tests/jacobi/include/stencil-fxd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ class operation_fxd2D {
f(i, j, out) = stencil.template eval<a_f>(in, i, j);
}

inline void eval_local(cl::sycl::nd_item<2> it, cl::sycl::accessor<T, 2, cl::sycl::access::mode::write> out, T *local_tab, int glob_max0, int glob_max1) {
inline void eval_local(cl::sycl::h_item<2> it, cl::sycl::accessor<T, 2, cl::sycl::access::mode::write> out, T *local_tab, int glob_max0, int glob_max1) {
int i = it.get_global().get(0);
int j = it.get_global().get(1);
if (i >= glob_max0 || j >= glob_max1)
Expand All @@ -193,7 +193,7 @@ class operation_fxd2D {
f(i, j, out) = stencil.template eval_local<local_dim1>(local_tab, i_local, j_local);
}

inline void store_local(T * local_tab, cl::sycl::accessor<T, 2, cl::sycl::access::mode::read> in, cl::sycl::nd_item<2> it, cl::sycl::group<2> gr, int glob_max0, int glob_max1) {
inline void store_local(T * local_tab, cl::sycl::accessor<T, 2, cl::sycl::access::mode::read> in, cl::sycl::h_item<2> it, cl::sycl::group<2> gr, int glob_max0, int glob_max1) {
cl::sycl::range<2> l_range = it.get_local_range();
cl::sycl::id<2> g_ind = gr.get_id(); //it.get_group_id(); error because ambiguous / operator redefinition
cl::sycl::id<2> l_ind = it.get_local();
Expand Down Expand Up @@ -251,15 +251,15 @@ class operation_fxd2D {
cl::sycl::accessor<T, 2, cl::sycl::access::mode::read> _aB {*aB, cgh};
cgh.parallel_for_work_group<class KernelCompute>(nd_range, [=](cl::sycl::group<2> group){
T * local = new T[local_dim0 * local_dim1];
group.parallel_for_work_item([=](cl::sycl::nd_item<2> it){
group.parallel_for_work_item([=](cl::sycl::h_item<2> it){
//local copy
/* group shoudn't be needed, neither global max*/
/* static function needed for st use a priori, but static not compatible
with dynamic filed as global_max */
store_local(local, _aB, it, group, global_max0+d0, global_max1+d1);
});
//synchro
group.parallel_for_work_item([=](cl::sycl::nd_item<2> it){
group.parallel_for_work_item([=](cl::sycl::h_item<2> it){
//computing
/*operation_fxd2D<T, B, f, st, aB, bB, a_f, b_f>::*/
eval_local(it, _B, local, global_max0, global_max1);
Expand Down
Loading

0 comments on commit 351789e

Please sign in to comment.