Introduce a first version of h_item which is basically a copy of nd_item

ahonorat · Jun 3, 2018 · 351789e · 351789e
1 parent 1634b0d
commit 351789e
Show file tree

Hide file tree

Showing 18 changed files with 298 additions and 65 deletions.
diff --git a/include/CL/sycl.hpp b/include/CL/sycl.hpp
@@ -47,6 +47,7 @@
 #include "CL/sycl/exception.hpp"
 #include "CL/sycl/group.hpp"
 #include "CL/sycl/handler.hpp"
+#include "CL/sycl/h_item.hpp"
 #include "CL/sycl/id.hpp"
 #include "CL/sycl/image.hpp"
 #include "CL/sycl/item.hpp"

diff --git a/include/CL/sycl/group.hpp b/include/CL/sycl/group.hpp
@@ -1,7 +1,7 @@
 #ifndef TRISYCL_SYCL_GROUP_HPP
 #define TRISYCL_SYCL_GROUP_HPP
 
-/** \file The OpenCL SYCL nd_item<>
+/** \file The OpenCL SYCL group<>
 
     Ronan at Keryell point FR
 
@@ -13,6 +13,7 @@
 #include <functional>
 
 #include "CL/sycl/detail/linear_id.hpp"
+#include "CL/sycl/h_item.hpp"
 #include "CL/sycl/id.hpp"
 #include "CL/sycl/nd_range.hpp"
 #include "CL/sycl/range.hpp"
@@ -175,28 +176,12 @@ struct group {
 
 
   /** Loop on the work-items inside a work-group
-
-      \todo Add this method in the specification
-  */
-  void parallel_for_work_item(std::function<void(nd_item<dimensionality>)> f)
+   */
+  void parallel_for_work_item(std::function<void(h_item<dimensionality>)> f)
     const {
     detail::parallel_for_workitem(*this, f);
   }
 
-
-  /** Loop on the work-items inside a work-group
-
-      \todo Add this method in the specification
-  */
-  void parallel_for_work_item(std::function<void(item<dimensionality>)> f)
-    const {
-    auto item_adapter = [=] (nd_item<dimensionality> ndi) {
-      item<dimensionality> i = ndi.get_item();
-      f(i);
-    };
-    detail::parallel_for_workitem(*this, item_adapter);
-  }
-
 };
 
 /// @} End the parallelism Doxygen group

diff --git a/include/CL/sycl/h_item.hpp b/include/CL/sycl/h_item.hpp
@@ -0,0 +1,244 @@
+#ifndef TRISYCL_SYCL_H_ITEM_HPP
+#define TRISYCL_SYCL_H_ITEM_HPP
+
+/** \file The OpenCL SYCL h_item<>
+
+    Ronan at Keryell point FR
+
+    This file is distributed under the University of Illinois Open Source
+    License. See LICENSE.TXT for details.
+*/
+
+#include <cstddef>
+
+#include "CL/sycl/access.hpp"
+#include "CL/sycl/detail/linear_id.hpp"
+#include "CL/sycl/detail/unimplemented.hpp"
+#include "CL/sycl/id.hpp"
+#include "CL/sycl/item.hpp"
+#include "CL/sycl/nd_range.hpp"
+#include "CL/sycl/range.hpp"
+
+namespace cl {
+namespace sycl {
+
+/** \addtogroup parallelism Expressing parallelism through kernels
+    @{
+*/
+
+/** \c h_item<int dimensions> identifies an instance of a \c
+    group::parallel_for_work_item function object executing at each
+    point in a local \c range<int dimensions> passed to a \c
+    parallel_for_work_item call or to the corresponding \c
+    parallel_for_work_group call if no range is passed to the \c
+    parallel_for_work_item call. It encapsulates enough information to
+    identify the work-item’s local and global items according to the
+    information given to \c parallel_for_work_group (physical ids) as
+    well as the work-item’s logical local items in the flexible
+    range. All returned items objects are offset-less. Instances of
+    the \c h_item<int dimensions> class are not user-constructible and
+    are passed by the runtime to each instance of the function object.
+*/
+template <int Dimensions = 1>
+struct h_item {
+  /// \todo add this Boost::multi_array or STL concept to the
+  /// specification?
+  static constexpr auto dimensionality = Dimensions;
+
+private:
+
+  id<Dimensions> global_index;
+  /* This is a cached value since it can be computed from global_index and
+     ND_range */
+  id<Dimensions> local_index;
+  nd_range<Dimensions> ND_range;
+
+public:
+
+  /** Create an empty nd_item<> from an nd_range<>
+
+      \todo This is for the triSYCL implementation which is expected to
+      call set_global() and set_local() later. This should be hidden to
+      the user.
+  */
+  h_item(nd_range<Dimensions> ndr) : ND_range { ndr } {}
+
+
+  /** Create a full nd_item
+
+      \todo This is for validation purpose. Hide this to the programmer
+      somehow
+  */
+  h_item(id<Dimensions> global_index,
+          nd_range<Dimensions> ndr) :
+    global_index { global_index },
+    // Compute the local index using the offset and the group size
+    local_index
+      { (global_index - ndr.get_offset())%id<Dimensions> { ndr.get_local() } },
+    ND_range { ndr }
+  {}
+
+
+  /** To be able to copy and assign nd_item, use default constructors too
+
+      \todo Make most of them protected, reserved to implementation
+  */
+  h_item() = default;
+
+
+  /** Return the constituent global id representing the work-item's
+      position in the global iteration space
+  */
+  id<Dimensions> get_global() const { return global_index; }
+
+
+  /** Return the constituent element of the global id representing the
+      work-item's position in the global iteration space in the given
+      dimension
+  */
+  size_t get_global(int dimension) const { return get_global()[dimension]; }
+
+
+  /** Return the flattened id of the current work-item after subtracting
+      the offset
+  */
+  size_t get_global_linear_id() const {
+    return detail::linear_id(get_global_range(), get_global(), get_offset());
+  }
+
+
+  /** Return the constituent local id representing the work-item's
+      position within the current work-group
+  */
+  id<Dimensions> get_local() const { return local_index; }
+
+
+  /** Return the constituent element of the local id representing the
+      work-item's position within the current work-group in the given
+      dimension
+  */
+  size_t get_local(int dimension) const { return get_local()[dimension]; }
+
+
+  /** Return the flattened id of the current work-item within the current
+      work-group
+   */
+  size_t get_local_linear_id() const {
+    return detail::linear_id(get_local_range(), get_local());
+  }
+
+
+  /** Return the constituent group group representing the work-group's
+      position within the overall nd_range
+  */
+  id<Dimensions> get_group() const {
+    /* Convert get_local_range() to an id<> to remove ambiguity into using
+       implicit conversion either from range<> to id<> or the opposite */
+    return get_global()/id<Dimensions> { get_local_range() };
+  }
+
+
+  /** Return the constituent element of the group id representing the
+      work-group;s position within the overall nd_range in the given
+      dimension.
+  */
+  size_t get_group(int dimension) const {
+    return get_group()[dimension];
+  }
+
+
+  /// Return the flattened id of the current work-group
+  size_t get_group_linear_id() const {
+    return detail::linear_id(get_num_groups(), get_group());
+  }
+
+
+  /// Return the number of groups in the nd_range
+  id<Dimensions> get_num_groups() const {
+    return get_nd_range().get_group();
+  }
+
+  /// Return the number of groups for dimension in the nd_range
+  size_t get_num_groups(int dimension) const {
+     return get_num_groups()[dimension];
+  }
+
+
+  /// Return a range<> representing the dimensions of the nd_range<>
+  range<Dimensions> get_global_range() const {
+    return get_nd_range().get_global();
+  }
+
+
+  /// Return a range<> representing the dimensions of the current work-group
+  range<Dimensions> get_local_range() const {
+    return get_nd_range().get_local();
+  }
+
+
+  /** Return an id<> representing the n-dimensional offset provided to the
+      constructor of the nd_range<> and that is added by the runtime to the
+      global-ID of each work-item
+  */
+  id<Dimensions> get_offset() const { return get_nd_range().get_offset(); }
+
+
+  /// Return the nd_range<> of the current execution
+  nd_range<Dimensions> get_nd_range() const { return ND_range; }
+
+
+  /** Allows projection down to an item
+
+      \todo Add to the specification
+  */
+  item<Dimensions> get_item() const {
+    return { get_global_range(), get_global(), get_offset() };
+  }
+
+
+  /** Execute a barrier with memory ordering on the local address space,
+      global address space or both based on the value of flag
+
+      The current work-item will wait at the barrier until all work-items
+      in the current work-group have reached the barrier.
+
+      In addition, the barrier performs a fence operation ensuring that all
+      memory accesses in the specified address space issued before the
+      barrier complete before those issued after the barrier
+  */
+  void barrier(access::fence_space flag =
+               access::fence_space::global_and_local) const {
+#if defined(_OPENMP) && !defined(TRISYCL_NO_BARRIER)
+    /* Use OpenMP barrier in the implementation with 1 OpenMP thread per
+       work-item of the work-group */
+#pragma omp barrier
+#else
+    // \todo To be implemented efficiently otherwise
+    detail::unimplemented();
+#endif
+  }
+
+
+  // For the triSYCL implementation, need to set the local index
+  void set_local(id<Dimensions> Index) { local_index = Index; }
+
+
+  // For the triSYCL implementation, need to set the global index
+  void set_global(id<Dimensions> Index) { global_index = Index; }
+
+};
+
+/// @} End the parallelism Doxygen group
+
+}
+}
+
+/*
+    # Some Emacs stuff:
+    ### Local Variables:
+    ### ispell-local-dictionary: "american"
+    ### eval: (flyspell-prog-mode)
+    ### End:
+*/
+
+#endif // TRISYCL_SYCL_H_ITEM_HPP
diff --git a/include/CL/sycl/parallelism/detail/parallelism.hpp b/include/CL/sycl/parallelism/detail/parallelism.hpp
@@ -315,7 +315,7 @@ void parallel_for_workitem(const group<Dimensions> &g,
 
   range<Dimensions> l_r = g.get_nd_range().get_local();
   auto tot = l_r.get(0);
-  for (int i = 1; i < (int) Dimensions; ++i){
+  for (int i = 1; i < (int) Dimensions; ++i) {
     tot *= l_r.get(i);
   }
 #pragma omp parallel num_threads(tot)
@@ -341,7 +341,7 @@ void parallel_for_workitem(const group<Dimensions> &g,
   }
 #else
   // In a sequential execution there is only one index processed at a time
-  nd_item<Dimensions> index { g.get_nd_range() };
+  h_item<Dimensions> index { g.get_nd_range() };
   // To iterate on the local work-item
   id<Dimensions> local;
 

diff --git a/tests/2014-04-21-HPC-GPU_Meetup/slide_22.cpp b/tests/2014-04-21-HPC-GPU_Meetup/slide_22.cpp
@@ -27,8 +27,8 @@ my_queue.submit([&](handler &cgh)
   cgh.parallel_for_work_group<class hierarchical>(nd_range<>(range<>(size),
                                                              range<>(groupsize)),
                                                   [=](group<> group) {
-                                                    group.parallel_for_work_item([=](nd_item<1> tile) {
-                                                        out_access[tile] = in_access[tile] * 2;
+                                                    group.parallel_for_work_item([=](h_item<1> tile) {
+                                                        out_access[tile.get_global()] = in_access[tile.get_global()] * 2;
                                                       });
                                                   });
  });

diff --git a/tests/accessor/local_accessor_hierarchical_convolution.cpp b/tests/accessor/local_accessor_hierarchical_convolution.cpp
@@ -63,17 +63,19 @@ int test_main(int argc, char *argv[]) {
           N - (conv_kernel.get_count() - 1),
           N - (conv_kernel.get_count() - 1) },
         [=](group<1> g) {
-          g.parallel_for_work_item([&](nd_item<1> i) {
+          g.parallel_for_work_item([&](h_item<1> i) {
               // Cache the convolution kernel in local memory
               if (i.get_local(0) < a_conv_kernel.get_count())
-                a_conv_cache[i.get_local(0)] = a_conv_kernel[i];
+                a_conv_cache[i.get_local(0)] =
+                  a_conv_kernel[i.get_global()];
             });
           // An implicit barrier happens here
-          g.parallel_for_work_item([&](nd_item<1> i) {
-              a_result[i] = 0;
+          g.parallel_for_work_item([&](h_item<1> i) {
+              a_result[i.get_global()] = 0;
               // The convolution
               for (unsigned int j = 0; j < conv_kernel.get_count(); ++j)
-                a_result[i] += a_a[i.get_local(0) + j]*a_conv_cache[j];
+                a_result[i.get_global()] +=
+                  a_a[i.get_local(0) + j]*a_conv_cache[j];
             });
         });
     });

diff --git a/tests/jacobi/include/stencil-fxd.hpp b/tests/jacobi/include/stencil-fxd.hpp
@@ -181,7 +181,7 @@ class operation_fxd2D {
     f(i, j, out) = stencil.template eval<a_f>(in, i, j);
   }
 
-  inline void eval_local(cl::sycl::nd_item<2> it, cl::sycl::accessor<T, 2, cl::sycl::access::mode::write> out, T *local_tab, int glob_max0, int glob_max1) {
+  inline void eval_local(cl::sycl::h_item<2> it, cl::sycl::accessor<T, 2, cl::sycl::access::mode::write> out, T *local_tab, int glob_max0, int glob_max1) {
     int i = it.get_global().get(0);
     int j = it.get_global().get(1);
     if (i >= glob_max0 || j >= glob_max1)
@@ -193,7 +193,7 @@ class operation_fxd2D {
     f(i, j, out) = stencil.template eval_local<local_dim1>(local_tab, i_local, j_local);
   }
 
-  inline void store_local(T * local_tab, cl::sycl::accessor<T, 2, cl::sycl::access::mode::read> in, cl::sycl::nd_item<2> it, cl::sycl::group<2> gr, int glob_max0, int glob_max1) {
+  inline void store_local(T * local_tab, cl::sycl::accessor<T, 2, cl::sycl::access::mode::read> in, cl::sycl::h_item<2> it, cl::sycl::group<2> gr, int glob_max0, int glob_max1) {
     cl::sycl::range<2> l_range = it.get_local_range();
     cl::sycl::id<2> g_ind = gr.get_id(); //it.get_group_id(); error because ambiguous / operator redefinition 
     cl::sycl::id<2> l_ind = it.get_local();
@@ -251,15 +251,15 @@ class operation_fxd2D {
         cl::sycl::accessor<T, 2, cl::sycl::access::mode::read> _aB {*aB, cgh};
         cgh.parallel_for_work_group<class KernelCompute>(nd_range, [=](cl::sycl::group<2> group){
             T * local = new T[local_dim0 * local_dim1];
-            group.parallel_for_work_item([=](cl::sycl::nd_item<2> it){
+            group.parallel_for_work_item([=](cl::sycl::h_item<2> it){
                 //local copy
                 /* group shoudn't be needed, neither global max*/
                 /* static function needed for st use a priori, but static not compatible
                    with dynamic filed as global_max */
                 store_local(local, _aB, it, group, global_max0+d0, global_max1+d1); 
               });
             //synchro
-            group.parallel_for_work_item([=](cl::sycl::nd_item<2> it){
+            group.parallel_for_work_item([=](cl::sycl::h_item<2> it){
                 //computing
                 /*operation_fxd2D<T, B, f, st, aB, bB, a_f, b_f>::*/
                 eval_local(it, _B, local, global_max0, global_max1);