From c8061ab21ad2f3d6a0e058a56c98b5bb968acf41 Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Thu, 8 Jun 2023 13:27:20 +0100
Subject: [PATCH 01/20] mem_host_flags: use size_t for element count (#1755)

More recent GCC versions (e.g. 12.2, 13.1) report that the argument to
`new[]` in the `Init` methods exceeds the maximum object size,
seemingly related to the negative range of the widened `int`.

Use an unsigned type to avoid the warning and propagate the signedness
change to other uses of the `num_elements` member.

Fixes https://github.com/KhronosGroup/OpenCL-CTS/issues/1582

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 .../mem_host_flags/C_host_memory_block.h      | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/test_conformance/mem_host_flags/C_host_memory_block.h b/test_conformance/mem_host_flags/C_host_memory_block.h
index 78692d17f6..0784c2c2f8 100644
--- a/test_conformance/mem_host_flags/C_host_memory_block.h
+++ b/test_conformance/mem_host_flags/C_host_memory_block.h
@@ -24,14 +24,14 @@
 
 template <class T> class C_host_memory_block {
 public:
-    int num_elements;
+    size_t num_elements;
     int element_size;
     T *pData;
 
     C_host_memory_block();
     ~C_host_memory_block();
-    void Init(int num_elem, T &value);
-    void Init(int num_elem);
+    void Init(size_t num_elem, T &value);
+    void Init(size_t num_elem);
     void Set_to(T &val);
     void Set_to_zero();
     bool Equal_to(T &val);
@@ -40,7 +40,7 @@ template <class T> class C_host_memory_block {
     bool Equal_rect(C_host_memory_block<T> &another, size_t *host_origin,
                     size_t *region, size_t host_row_pitch,
                     size_t host_slice_pitch);
-    bool Equal(T *pData, int num_elements);
+    bool Equal(T *pData, size_t num_elements);
 
     bool Equal_rect_from_orig(C_host_memory_block<T> &another, size_t *soffset,
                               size_t *region, size_t host_row_pitch,
@@ -63,20 +63,20 @@ template <class T> C_host_memory_block<T>::~C_host_memory_block()
     num_elements = 0;
 }
 
-template <class T> void C_host_memory_block<T>::Init(int num_elem, T &value)
+template <class T> void C_host_memory_block<T>::Init(size_t num_elem, T &value)
 {
     if (pData != NULL) delete[] pData;
     pData = new T[num_elem];
-    for (int i = 0; i < num_elem; i++) pData[i] = value;
+    for (size_t i = 0; i < num_elem; i++) pData[i] = value;
 
     num_elements = num_elem;
 }
 
-template <class T> void C_host_memory_block<T>::Init(int num_elem)
+template <class T> void C_host_memory_block<T>::Init(size_t num_elem)
 {
     if (pData != NULL) delete[] pData;
     pData = new T[num_elem];
-    for (int i = 0; i < num_elem; i++) pData[i] = (T)i;
+    for (size_t i = 0; i < num_elem; i++) pData[i] = (T)i;
 
     num_elements = num_elem;
 }
@@ -88,14 +88,14 @@ template <class T> void C_host_memory_block<T>::Set_to_zero()
 
 template <class T> void C_host_memory_block<T>::Set_to(T &val)
 {
-    for (int i = 0; i < num_elements; i++) pData[i] = val;
+    for (size_t i = 0; i < num_elements; i++) pData[i] = val;
 }
 
 template <class T> bool C_host_memory_block<T>::Equal_to(T &val)
 {
-    int count = 0;
+    size_t count = 0;
 
-    for (int i = 0; i < num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         if (pData[i] == val) count++;
     }
@@ -106,9 +106,9 @@ template <class T> bool C_host_memory_block<T>::Equal_to(T &val)
 template <class T>
 bool C_host_memory_block<T>::Equal(C_host_memory_block<T> &another)
 {
-    int count = 0;
+    size_t count = 0;
 
-    for (int i = 0; i < num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         if (pData[i] == another.pData[i]) count++;
     }
@@ -117,13 +117,13 @@ bool C_host_memory_block<T>::Equal(C_host_memory_block<T> &another)
 }
 
 template <class T>
-bool C_host_memory_block<T>::Equal(T *pIn_Data, int Innum_elements)
+bool C_host_memory_block<T>::Equal(T *pIn_Data, size_t Innum_elements)
 {
     if (this->num_elements != Innum_elements) return false;
 
-    int count = 0;
+    size_t count = 0;
 
-    for (int i = 0; i < num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         if (pData[i] == pIn_Data[i]) count++;
     }
@@ -134,7 +134,7 @@ bool C_host_memory_block<T>::Equal(T *pIn_Data, int Innum_elements)
 template <class T> size_t C_host_memory_block<T>::Count(T &val)
 {
     size_t count = 0;
-    for (int i = 0; i < num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         if (pData[i] == val) count++;
     }

From 475a37abbfa22a55fe47bf76d5c7904b3a37730a Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Fri, 9 Jun 2023 11:25:20 +0100
Subject: [PATCH 02/20] [NFC] Do not use reserved names for include guards
 (#1737)

Names that begin with an underscore followed by an uppercase letter
are reserved for the C++ implementation.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 test_common/harness/compat.h                               | 6 +++---
 test_common/harness/crc32.h                                | 4 ++--
 test_conformance/c11_atomics/common.h                      | 6 +++---
 test_conformance/c11_atomics/host_atomics.h                | 6 +++---
 test_conformance/d3d10/harness.h                           | 4 ++--
 .../cl_khr_command_buffer/basic_command_buffer.h           | 6 +++---
 .../mutable_command_basic.h                                | 6 +++---
 .../cl_khr_command_buffer_mutable_dispatch/procs.h         | 6 +++---
 .../cl_khr_command_buffer/command_buffer_test_base.h       | 6 +++---
 test_conformance/extensions/cl_khr_command_buffer/procs.h  | 6 +++---
 .../extensions/cl_khr_external_semaphore/procs.h           | 6 +++---
 .../images/kernel_read_write/test_cl_ext_image_buffer.hpp  | 6 +++---
 .../non_uniform_work_group/TestNonUniformWorkGroup.h       | 7 +++----
 test_conformance/non_uniform_work_group/tools.h            | 6 +++---
 test_conformance/pipes/kernels.h                           | 6 +++---
 test_conformance/relationals/test_comparisons_fp.h         | 6 +++---
 16 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/test_common/harness/compat.h b/test_common/harness/compat.h
index 4053b7ee72..a42f29172d 100644
--- a/test_common/harness/compat.h
+++ b/test_common/harness/compat.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _COMPAT_H_
-#define _COMPAT_H_
+#ifndef COMPAT_H_
+#define COMPAT_H_
 
 #if defined(_WIN32) && defined(_MSC_VER)
 #include <Windows.h>
@@ -398,4 +398,4 @@ EXTERN_C int __builtin_clz(unsigned int pattern);
 #define sleep(sec) Sleep((sec)*1000)
 #endif
 
-#endif // _COMPAT_H_
+#endif // COMPAT_H_
diff --git a/test_common/harness/crc32.h b/test_common/harness/crc32.h
index 65ca15eea1..6958701108 100644
--- a/test_common/harness/crc32.h
+++ b/test_common/harness/crc32.h
@@ -15,8 +15,8 @@ Agreement or Khronos Conformance Test Source License Agreement as
 executed between Khronos and the recipient.
 ******************************************************************/
 
-#ifndef _CRC32_H_
-#define _CRC32_H_
+#ifndef CRC32_H_
+#define CRC32_H_
 
 #include <stdint.h>
 #include <stddef.h>
diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h
index 6c7d0b12bf..37c37e874f 100644
--- a/test_conformance/c11_atomics/common.h
+++ b/test_conformance/c11_atomics/common.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _COMMON_H_
-#define _COMMON_H_
+#ifndef COMMON_H_
+#define COMMON_H_
 
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
@@ -1567,4 +1567,4 @@ int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
     return 0;
 }
 
-#endif //_COMMON_H_
+#endif // COMMON_H_
diff --git a/test_conformance/c11_atomics/host_atomics.h b/test_conformance/c11_atomics/host_atomics.h
index 6c4e783aa1..b865970f44 100644
--- a/test_conformance/c11_atomics/host_atomics.h
+++ b/test_conformance/c11_atomics/host_atomics.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _HOST_ATOMICS_H_
-#define _HOST_ATOMICS_H_
+#ifndef HOST_ATOMICS_H_
+#define HOST_ATOMICS_H_
 
 #include "harness/testHarness.h"
 
@@ -247,4 +247,4 @@ CorrespondingType host_atomic_fetch_max(volatile AtomicType *a, CorrespondingTyp
 bool host_atomic_flag_test_and_set(volatile HOST_ATOMIC_FLAG *a, TExplicitMemoryOrderType order);
 void host_atomic_flag_clear(volatile HOST_ATOMIC_FLAG *a, TExplicitMemoryOrderType order);
 
-#endif //_HOST_ATOMICS_H_
+#endif // HOST_ATOMICS_H_
diff --git a/test_conformance/d3d10/harness.h b/test_conformance/d3d10/harness.h
index 184e52cb5b..afeb4966a8 100644
--- a/test_conformance/d3d10/harness.h
+++ b/test_conformance/d3d10/harness.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _HARNESS_H_
-#define _HARNESS_H_
+#ifndef HARNESS_H_
+#define HARNESS_H_
 
 #define _CRT_SECURE_NO_WARNINGS
 
diff --git a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h
index b1d36024c0..44f4cc6307 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef _CL_KHR_BASIC_COMMAND_BUFFER_H
-#define _CL_KHR_BASIC_COMMAND_BUFFER_H
+#ifndef CL_KHR_BASIC_COMMAND_BUFFER_H
+#define CL_KHR_BASIC_COMMAND_BUFFER_H
 
 #include "command_buffer_test_base.h"
 #include "harness/typeWrappers.h"
@@ -99,4 +99,4 @@ int MakeAndRunTest(cl_device_id device, cl_context context,
     return TEST_PASS;
 }
 
-#endif // _CL_KHR_BASIC_COMMAND_BUFFER_H
+#endif // CL_KHR_BASIC_COMMAND_BUFFER_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
index 9056a00d90..966695834b 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef _CL_KHR_MUTABLE_COMMAND_BASIC_H
-#define _CL_KHR_MUTABLE_COMMAND_BASIC_H
+#ifndef CL_KHR_MUTABLE_COMMAND_BASIC_H
+#define CL_KHR_MUTABLE_COMMAND_BASIC_H
 
 #include "../basic_command_buffer.h"
 #include "../command_buffer_test_base.h"
@@ -104,4 +104,4 @@ struct BasicMutableCommandBufferTest : BasicCommandBufferTest
     const size_t global_work_size = 4 * sizeof(cl_int);
 };
 
-#endif //_CL_KHR_MUTABLE_COMMAND_BASIC_H
\ No newline at end of file
+#endif // CL_KHR_MUTABLE_COMMAND_BASIC_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
index 08512caef5..4b6dacb699 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
-#define _CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
+#ifndef CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
+#define CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
 
 #include <CL/cl.h>
 
@@ -59,4 +59,4 @@ extern int test_mutable_command_info_global_work_size(cl_device_id device,
                                                       cl_context context,
                                                       cl_command_queue queue,
                                                       int num_elements);
-#endif /*_CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H*/
+#endif // CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h
index 0fd2e4ec70..48abe25d70 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef _CL_KHR_COMMAND_BUFFER_TEST_BASE_H
-#define _CL_KHR_COMMAND_BUFFER_TEST_BASE_H
+#ifndef CL_KHR_COMMAND_BUFFER_TEST_BASE_H
+#define CL_KHR_COMMAND_BUFFER_TEST_BASE_H
 
 #include <CL/cl_ext.h>
 #include "harness/deviceInfo.h"
@@ -174,4 +174,4 @@ class clCommandBufferWrapper {
     }
 
 
-#endif // _CL_KHR_COMMAND_BUFFER_TEST_BASE_H
+#endif // CL_KHR_COMMAND_BUFFER_TEST_BASE_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/procs.h b/test_conformance/extensions/cl_khr_command_buffer/procs.h
index 63e004a7b4..53a7d93490 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/procs.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/procs.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _CL_KHR_COMMAND_BUFFER_PROCS_H
-#define _CL_KHR_COMMAND_BUFFER_PROCS_H
+#ifndef CL_KHR_COMMAND_BUFFER_PROCS_H
+#define CL_KHR_COMMAND_BUFFER_PROCS_H
 
 #include <CL/cl.h>
 
@@ -131,4 +131,4 @@ extern int test_event_info_reference_count(cl_device_id device,
                                            cl_command_queue queue,
                                            int num_elements);
 
-#endif /*_CL_KHR_COMMAND_BUFFER_PROCS_H*/
+#endif // CL_KHR_COMMAND_BUFFER_PROCS_H
diff --git a/test_conformance/extensions/cl_khr_external_semaphore/procs.h b/test_conformance/extensions/cl_khr_external_semaphore/procs.h
index 753c8fe227..7e1c4caf3a 100644
--- a/test_conformance/extensions/cl_khr_external_semaphore/procs.h
+++ b/test_conformance/extensions/cl_khr_external_semaphore/procs.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
-#define _CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
+#ifndef CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
+#define CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
 
 #include <CL/cl.h>
 
@@ -79,4 +79,4 @@ extern int test_external_semaphores_invalid_command(cl_device_id deviceID,
                                                     cl_context context,
                                                     cl_command_queue queue,
                                                     int num_elements);
-#endif /* CL_KHR_EXTERNAL_SEMAPHORE */
+#endif // CL_KHR_EXTERNAL_SEMAPHORE_PROCS_H
diff --git a/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
index c6646330b8..56d15808d9 100644
--- a/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
+++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
@@ -14,8 +14,8 @@
 // limitations under the License.
 //
 
-#ifndef _TEST_CL_EXT_IMAGE_BUFFER
-#define _TEST_CL_EXT_IMAGE_BUFFER
+#ifndef TEST_CL_EXT_IMAGE_BUFFER
+#define TEST_CL_EXT_IMAGE_BUFFER
 
 #define TEST_IMAGE_SIZE 20
 
@@ -121,4 +121,4 @@ static inline void image_desc_init(cl_image_desc* desc,
     }
 }
 
-#endif /* _TEST_CL_EXT_IMAGE_BUFFER */
\ No newline at end of file
+#endif // TEST_CL_EXT_IMAGE_BUFFER
diff --git a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h
index 414d10047c..f584606148 100644
--- a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h
+++ b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _TESTNONUNIFORMWORKGROUP_H
-#define _TESTNONUNIFORMWORKGROUP_H
+#ifndef TESTNONUNIFORMWORKGROUP_H
+#define TESTNONUNIFORMWORKGROUP_H
 
 #include "procs.h"
 #include <vector>
@@ -147,5 +147,4 @@ class SubTestExecutor {
   unsigned int _overallCounter;
 };
 
-#endif // _TESTNONUNIFORMWORKGROUP_H
-
+#endif // TESTNONUNIFORMWORKGROUP_H
diff --git a/test_conformance/non_uniform_work_group/tools.h b/test_conformance/non_uniform_work_group/tools.h
index 2e63c3ddeb..ba01fc991b 100644
--- a/test_conformance/non_uniform_work_group/tools.h
+++ b/test_conformance/non_uniform_work_group/tools.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _TOOLS_H
-#define _TOOLS_H
+#ifndef TOOLS_H
+#define TOOLS_H
 
 #include "procs.h"
 #include <vector>
@@ -106,4 +106,4 @@ namespace Error {
   };
 
 }
-#endif // _TOOLS_H
+#endif // TOOLS_H
diff --git a/test_conformance/pipes/kernels.h b/test_conformance/pipes/kernels.h
index a2fb70c059..a897e5e848 100644
--- a/test_conformance/pipes/kernels.h
+++ b/test_conformance/pipes/kernels.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef _KERNELS_H_
-#define _KERNELS_H_
+#ifndef KERNELS_H_
+#define KERNELS_H_
 
 static const char* pipe_readwrite_struct_kernel_code = {
     "typedef struct{\n"
@@ -127,4 +127,4 @@ static const char* pipe_convenience_readwrite_struct_kernel_code = {
     "    read_pipe(in_pipe, &dst[gid]);\n"
     "}\n" };
 
-#endif //_KERNELS_H_
+#endif // KERNELS_H_
diff --git a/test_conformance/relationals/test_comparisons_fp.h b/test_conformance/relationals/test_comparisons_fp.h
index 66c62c2d13..3401163ea0 100644
--- a/test_conformance/relationals/test_comparisons_fp.h
+++ b/test_conformance/relationals/test_comparisons_fp.h
@@ -14,8 +14,8 @@
 // limitations under the License.
 //
 
-#ifndef _TEST_COMPARISONS_FP_H
-#define _TEST_COMPARISONS_FP_H
+#ifndef TEST_COMPARISONS_FP_H
+#define TEST_COMPARISONS_FP_H
 
 #include <map>
 #include <memory>
@@ -225,4 +225,4 @@ int MakeAndRunTest(cl_device_id device, cl_context context,
     return TEST_PASS;
 }
 
-#endif // _TEST_COMPARISONS_FP_H
+#endif // TEST_COMPARISONS_FP_H

From 095091bc5755fb3a239f049a6a8ade1d82169fc6 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 13 Jun 2023 08:39:22 +0200
Subject: [PATCH 03/20] Added cl_khr_fp16 extension support for
 test_vec_type_hint from basic (#1724)

* Added cl_khr_fp16 extension support for test_vec_type_hint from basic (issue #142, basic)

* Added correction to fix casting problem
---
 test_conformance/basic/test_vec_type_hint.cpp | 152 ++++++++++--------
 1 file changed, 85 insertions(+), 67 deletions(-)

diff --git a/test_conformance/basic/test_vec_type_hint.cpp b/test_conformance/basic/test_vec_type_hint.cpp
index 33168b1369..0ba105db63 100644
--- a/test_conformance/basic/test_vec_type_hint.cpp
+++ b/test_conformance/basic/test_vec_type_hint.cpp
@@ -13,28 +13,27 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-
+#include <vector>
 
 #include "procs.h"
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 
-
 static const char *sample_kernel = {
-  "%s\n" // optional pragma string
-  "__kernel __attribute__((vec_type_hint(%s%s))) void sample_test(__global int *src, __global int *dst)\n"
-  "{\n"
-  "    int  tid = get_global_id(0);\n"
-  "     dst[tid] = src[tid];\n"
-  "\n"
-  "}\n"
+    "%s\n"
+    "__kernel __attribute__((vec_type_hint(%s%s))) void sample_test(__global "
+    "int *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "     dst[tid] = src[tid];\n"
+    "\n"
+    "}\n"
 };
 
 int test_vec_type_hint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
@@ -42,66 +41,85 @@ int test_vec_type_hint(cl_device_id deviceID, cl_context context, cl_command_que
   int error;
   int vec_type_index, vec_size_index;
 
-  ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble };
-    const char *size_names[] = {"", "2", "4", "8", "16"};
-    char *program_source;
-
-  program_source = (char*)malloc(sizeof(char)*4096);
+  ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt,   kUInt,
+                             kLong, kULong, kFloat, kHalf,   kDouble };
+  const char *size_names[] = { "", "2", "4", "8", "16" };
+  std::vector<char> program_source(4096);
+
+  for (vec_type_index = 0;
+       vec_type_index < sizeof(vecType) / sizeof(vecType[0]); vec_type_index++)
+  {
+
+      if (vecType[vec_type_index] == kHalf
+          && !is_extension_available(deviceID, "cl_khr_fp16"))
+      {
+          log_info(
+              "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+          continue;
+      }
+      else if (vecType[vec_type_index] == kDouble
+               && !is_extension_available(deviceID, "cl_khr_fp64"))
+      {
+          log_info(
+              "Extension cl_khr_fp64 not supported; skipping double tests.\n");
+          continue;
+      }
+      else if ((vecType[vec_type_index] == kLong
+                || vecType[vec_type_index] == kULong)
+               && !gHasLong)
+      {
+          log_info(
+              "Extension cl_khr_int64 not supported; skipping long tests.\n");
+          continue;
+      }
 
-  for (vec_type_index=0; vec_type_index<10; vec_type_index++) {
-    if (vecType[vec_type_index] == kDouble) {
-      if (!is_extension_available(deviceID, "cl_khr_fp64")) {
-        log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
-        continue;
+      for (vec_size_index = 0; vec_size_index < 5; vec_size_index++)
+      {
+          clProgramWrapper program;
+          clKernelWrapper kernel;
+          clMemWrapper in, out;
+          size_t global[] = { 1, 1, 1 };
+
+          log_info("Testing __attribute__((vec_type_hint(%s%s))...\n",
+                   get_explicit_type_name(vecType[vec_type_index]),
+                   size_names[vec_size_index]);
+          char extension[128] = { 0 };
+          if (vecType[vec_type_index] == kDouble)
+              std::snprintf(extension, sizeof(extension),
+                            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable");
+          else if (vecType[vec_type_index] == kHalf)
+              std::snprintf(extension, sizeof(extension),
+                            "#pragma OPENCL EXTENSION cl_khr_fp16 : enable");
+
+          sprintf(program_source.data(), sample_kernel, extension,
+                  get_explicit_type_name(vecType[vec_type_index]),
+                  size_names[vec_size_index]);
+
+          const char *src = &program_source.front();
+          error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                              &src, "sample_test");
+          test_error(error, "create_single_kernel_helper failed");
+
+          in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * 10,
+                              NULL, &error);
+          test_error(error, "clCreateBuffer failed");
+          out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int) * 10,
+                               NULL, &error);
+          test_error(error, "clCreateBuffer failed");
+
+          error = clSetKernelArg(kernel, 0, sizeof(in), &in);
+          test_error(error, "clSetKernelArg failed");
+          error = clSetKernelArg(kernel, 1, sizeof(out), &out);
+          test_error(error, "clSetKernelArg failed");
+
+          error = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL,
+                                         0, NULL, NULL);
+          test_error(error, "clEnqueueNDRangeKernel failed");
+
+          error = clFinish(queue);
+          test_error(error, "clFinish failed");
       }
-      log_info("Testing doubles.\n");
-    }
-
-    if (vecType[vec_type_index] == kLong || vecType[vec_type_index] == kULong)
-    {
-        if (!gHasLong)
-        {
-            log_info("Extension cl_khr_int64 not supported; skipping long tests.\n");
-            continue;
-        }
-    }
-
-    for (vec_size_index=0; vec_size_index<5; vec_size_index++) {
-      clProgramWrapper program;
-      clKernelWrapper kernel;
-      clMemWrapper in, out;
-      size_t global[] = {1,1,1};
-
-      log_info("Testing __attribute__((vec_type_hint(%s%s))...\n", get_explicit_type_name(vecType[vec_type_index]), size_names[vec_size_index]);
-
-      program_source[0] = '\0';
-      sprintf(program_source, sample_kernel,
-              (vecType[vec_type_index] == kDouble) ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-              get_explicit_type_name(vecType[vec_type_index]), size_names[vec_size_index]);
-
-      error = create_single_kernel_helper( context, &program, &kernel, 1, (const char**)&program_source, "sample_test" );
-      if( error != 0 )
-        return error;
-
-      in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int)*10, NULL, &error);
-      test_error(error, "clCreateBuffer failed");
-      out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int)*10, NULL, &error);
-      test_error(error, "clCreateBuffer failed");
-
-      error = clSetKernelArg(kernel, 0, sizeof(in), &in);
-      test_error(error, "clSetKernelArg failed");
-      error = clSetKernelArg(kernel, 1, sizeof(out), &out);
-      test_error(error, "clSetKernelArg failed");
-
-      error = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
-      test_error(error, "clEnqueueNDRangeKernel failed");
-
-      error = clFinish(queue);
-      test_error(error, "clFinish failed");
-    }
   }
 
-  free(program_source);
-
   return 0;
 }

From 16a75dc0af2e0c55d27a91ffefd0aa1b97b3f484 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 13 Jun 2023 17:41:39 +0200
Subject: [PATCH 04/20] Added cl_khr_fp16 extension support for
 test_vector_creation from basic (#1728)

* Added cl_khr_fp16 extension support for vector_creation test from basic

* Added corrections related to vendor's review

* Added protection to avoid similar creation cases

* Added comment for recent correction

* cosmetics

* Corrected factor array to restore lost capacity of original test,
  leaving only 16-sizes vector tests limited.
---
 .../basic/test_vector_creation.cpp            | 489 +++++++++++-------
 1 file changed, 294 insertions(+), 195 deletions(-)

diff --git a/test_conformance/basic/test_vector_creation.cpp b/test_conformance/basic/test_vector_creation.cpp
index d9530b4e9b..801c72b18b 100644
--- a/test_conformance/basic/test_vector_creation.cpp
+++ b/test_conformance/basic/test_vector_creation.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -17,48 +17,41 @@
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 #include "harness/errorHelpers.h"
+#include <vector>
 
-
-
+#include <CL/cl_half.h>
 
 #define DEBUG 0
 #define DEPTH 16
 // Limit the maximum code size for any given kernel.
-#define MAX_CODE_SIZE (1024*32)
-
-const int sizes[] = {1, 2, 3, 4, 8, 16, -1, -1, -1, -1};
-const char *size_names[] = {"", "2", "3", "4", "8", "16" , "!!a", "!!b", "!!c", "!!d"};
-
-// Creates a kernel by enumerating all possible ways of building the vector out of vloads
-// skip_to_results will skip results up to a given number. If the amount of code generated
-// is greater than MAX_CODE_SIZE, this function will return the number of results used,
-// which can then be used as the skip_to_result value to continue where it left off.
-int create_kernel(ExplicitType type, int output_size, char *program, int *number_of_results, int skip_to_result) {
+#define MAX_CODE_SIZE (1024 * 32)
+
+static const int sizes[] = { 1, 2, 3, 4, 8, 16, -1, -1, -1, -1 };
+static const int initial_no_sizes[] = { 0, 0, 0, 0, 0, 0, 2 };
+static const char *size_names[] = { "",   "2",   "3",   "4",   "8",
+                                    "16", "!!a", "!!b", "!!c", "!!d" };
+static char extension[128] = { 0 };
+
+// Creates a kernel by enumerating all possible ways of building the vector out
+// of vloads. skip_to_result will skip results up to a given number. If the
+// amount of code generated is greater than MAX_CODE_SIZE, this function will
+// return the number of results used, which can then be used as the
+// skip_to_result value to continue where it left off.
+int create_kernel(ExplicitType type, int output_size, char *program,
+                  int *number_of_results, int skip_to_result)
+{
 
     int number_of_sizes;
 
-    switch (output_size) {
-        case 1:
-            number_of_sizes = 1;
-            break;
-        case 2:
-            number_of_sizes = 2;
-            break;
-        case 3:
-            number_of_sizes = 3;
-            break;
-        case 4:
-            number_of_sizes = 4;
-            break;
-        case 8:
-            number_of_sizes = 5;
-            break;
-        case 16:
-            number_of_sizes = 6;
-            break;
-        default:
-            log_error("Invalid size: %d\n", output_size);
-            return -1;
+    switch (output_size)
+    {
+        case 1: number_of_sizes = 1; break;
+        case 2: number_of_sizes = 2; break;
+        case 3: number_of_sizes = 3; break;
+        case 4: number_of_sizes = 4; break;
+        case 8: number_of_sizes = 5; break;
+        case 16: number_of_sizes = 6; break;
+        default: log_error("Invalid size: %d\n", output_size); return -1;
     }
 
     int total_results = 0;
@@ -67,102 +60,125 @@ int create_kernel(ExplicitType type, int output_size, char *program, int *number
     int total_program_length = 0;
     int aborted_due_to_size = 0;
 
-    if (skip_to_result < 0)
-        skip_to_result = 0;
+    if (skip_to_result < 0) skip_to_result = 0;
 
     // The line of code for the vector creation
     char line[1024];
-    // Keep track of what size vector we are using in each position so we can iterate through all fo them
+    // Keep track of what size vector we are using in each position so we can
+    // iterate through all of them
     int pos[DEPTH];
     int max_size = output_size;
     if (DEBUG > 1) log_info("max_size: %d\n", max_size);
 
     program[0] = '\0';
-    sprintf(program, "%s\n__kernel void test_vector_creation(__global %s *src, __global %s%s *result) {\n",
-            type == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
-            get_explicit_type_name(type), get_explicit_type_name(type), ( number_of_sizes == 3 ) ? "" : size_names[number_of_sizes-1]);
+    sprintf(program,
+            "%s\n__kernel void test_vector_creation(__global %s *src, __global "
+            "%s%s *result) {\n",
+            extension, get_explicit_type_name(type),
+            get_explicit_type_name(type),
+            (number_of_sizes == 3) ? "" : size_names[number_of_sizes - 1]);
     total_program_length += (int)strlen(program);
 
-    char storePrefix[ 128 ], storeSuffix[ 128 ];
+    char storePrefix[128], storeSuffix[128];
 
-    // Start out trying sizes 1,1,1,1,1...
-    for (int i=0; i<DEPTH; i++)
-        pos[i] = 0;
+    // Start out trying sizes 1,1,1... by zeroing the pos array for all vector
+    // sizes except 16. For size 16, initial_no_sizes supplies a starting index
+    // that skips time-consuming creation cases similar to those tested earlier.
+    for (int i = 0; i < DEPTH; i++) pos[i] = initial_no_sizes[number_of_sizes];
 
     int done = 0;
-    while (!done) {
-        if (DEBUG > 1) {
+    while (!done)
+    {
+        if (DEBUG > 1)
+        {
             log_info("pos size[] = [");
-            for (int k=0; k<DEPTH; k++)
-                log_info(" %d ", pos[k]);
+            for (int k = 0; k < DEPTH; k++) log_info(" %d ", pos[k]);
             log_info("]\n");
         }
 
-        // Go through the selected vector sizes and see if the first n of them fit the
+        // Go through the selected vector sizes and see if the first n of them
+        // fit the
         //  required size exactly.
         int size_so_far = 0;
         int vloads;
-        for ( vloads=0; vloads<DEPTH; vloads++) {
-            if (size_so_far + sizes[pos[vloads]] <= max_size) {
+        for (vloads = 0; vloads < DEPTH; vloads++)
+        {
+            if (size_so_far + sizes[pos[vloads]] <= max_size)
+            {
                 size_so_far += sizes[pos[vloads]];
-            } else {
+            }
+            else
+            {
                 break;
             }
         }
-        if (DEBUG > 1)  log_info("vloads: %d, size_so_far:%d\n", vloads, size_so_far);
+        if (DEBUG > 1)
+            log_info("vloads: %d, size_so_far:%d\n", vloads, size_so_far);
 
-        // If they did not fit the required size exactly it is too long, so there is no point in checking any other combinations
+        // If they did not fit the required size exactly it is too long, so
+        // there is no point in checking any other combinations
         //  of the sizes to the right. Prune them from the search.
-        if (size_so_far != max_size) {
+        if (size_so_far != max_size)
+        {
             // Zero all the sizes to the right
-            for (int k=vloads+1; k<DEPTH; k++) {
+            for (int k = vloads + 1; k < DEPTH; k++)
+            {
                 pos[k] = 0;
             }
             // Increment this current size and propagate the values up if needed
-            for (int d=vloads; d>=0; d--) {
+            for (int d = vloads; d >= 0; d--)
+            {
                 pos[d]++;
-                if (pos[d] >= number_of_sizes) {
+                if (pos[d] >= number_of_sizes)
+                {
                     pos[d] = 0;
-                    if (d == 0) {
+                    if (d == 0)
+                    {
                         // If we rolled over then we are done
                         done = 1;
                         break;
                     }
-                } else {
+                }
+                else
+                {
                     break;
                 }
             }
-            // Go on to the next size since this one (and all others "under" it) didn't fit
+            // Go on to the next size since this one (and all others "under" it)
+            // didn't fit
             continue;
         }
 
 
         // Generate the actual load line if we are building this part
-        line[0]= '\0';
-        if (skip_to_result == 0 || total_results >= skip_to_result) {
-            if( number_of_sizes == 3 )
+        line[0] = '\0';
+        if (skip_to_result == 0 || total_results >= skip_to_result)
+        {
+            if (number_of_sizes == 3)
             {
-                sprintf( storePrefix, "vstore3( " );
-                sprintf( storeSuffix, ", %d, result )", current_result );
+                sprintf(storePrefix, "vstore3( ");
+                sprintf(storeSuffix, ", %d, result )", current_result);
             }
             else
             {
-                sprintf( storePrefix, "result[%d] = ", current_result );
-                storeSuffix[ 0 ] = 0;
+                sprintf(storePrefix, "result[%d] = ", current_result);
+                storeSuffix[0] = 0;
             }
 
-            sprintf(line, "\t%s(%s%d)(", storePrefix, get_explicit_type_name(type), output_size);
+            sprintf(line, "\t%s(%s%d)(", storePrefix,
+                    get_explicit_type_name(type), output_size);
             current_result++;
 
             int offset = 0;
-            for (int i=0; i<vloads; i++) {
+            for (int i = 0; i < vloads; i++)
+            {
                 if (pos[i] == 0)
                     sprintf(line + strlen(line), "src[%d]", offset);
                 else
-                    sprintf(line + strlen(line), "vload%s(0,src+%d)", size_names[pos[i]], offset);
+                    sprintf(line + strlen(line), "vload%s(0,src+%d)",
+                            size_names[pos[i]], offset);
                 offset += sizes[pos[i]];
-                if (i<(vloads-1))
-                    sprintf(line + strlen(line), ",");
+                if (i < (vloads - 1)) sprintf(line + strlen(line), ",");
             }
             sprintf(line + strlen(line), ")%s;\n", storeSuffix);
 
@@ -171,7 +187,8 @@ int create_kernel(ExplicitType type, int output_size, char *program, int *number
         }
         total_results++;
         total_program_length += (int)strlen(line);
-        if (total_program_length > MAX_CODE_SIZE) {
+        if (total_program_length > MAX_CODE_SIZE)
+        {
             aborted_due_to_size = 1;
             done = 1;
         }
@@ -179,132 +196,194 @@ int create_kernel(ExplicitType type, int output_size, char *program, int *number
 
         if (DEBUG) log_info("line is: %s", line);
 
-        // If we did not use all of them, then we ignore any changes further to the right.
-        // We do this by causing those loops to skip on the next iteration.
-        if (vloads < DEPTH) {
+        // If we did not use all of them, then we ignore any changes further to
+        // the right. We do this by causing those loops to skip on the next
+        // iteration.
+        if (vloads < DEPTH)
+        {
             if (DEBUG > 1) log_info("done with this depth\n");
-            for (int k=vloads; k<DEPTH; k++)
-                pos[k] = number_of_sizes;
+            for (int k = vloads; k < DEPTH; k++) pos[k] = number_of_sizes;
         }
 
         // Increment the far right size by 1, rolling over as needed
-        for (int d=DEPTH-1; d>=0; d--) {
+        for (int d = DEPTH - 1; d >= 0; d--)
+        {
             pos[d]++;
-            if (pos[d] >= number_of_sizes) {
+            if (pos[d] >= number_of_sizes)
+            {
                 pos[d] = 0;
-                if (d == 0) {
+                if (d == 0)
+                {
                     // If we rolled over at the far-left then we are done
                     done = 1;
                     break;
                 }
-            } else {
+            }
+            else
+            {
                 break;
             }
         }
-        if (done)
-            break;
+        if (done) break;
 
         // Continue until we are done.
     }
-    strcat(program, "}\n\n"); //log_info("%s\n", program);
+    strcat(program, "}\n\n"); // log_info("%s\n", program);
     total_program_length += 3;
-    if (DEBUG) log_info("\t\t(Program for vector type %s%s contains %d vector creations, of total program length %gkB, with a total of %d vloads.)\n",
-                        get_explicit_type_name(type), size_names[number_of_sizes-1], total_results, total_program_length/1024.0, total_vloads);
+    if (DEBUG)
+        log_info(
+            "\t\t(Program for vector type %s%s contains %d vector creations, "
+            "of total program length %gkB, with a total of %d vloads.)\n",
+            get_explicit_type_name(type), size_names[number_of_sizes - 1],
+            total_results, total_program_length / 1024.0, total_vloads);
     *number_of_results = current_result;
-    if (aborted_due_to_size)
-        return total_results;
+    if (aborted_due_to_size) return total_results;
     return 0;
 }
 
 
-
-
-int test_vector_creation(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_vector_creation(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements)
 {
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble };
-    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16};
+    const std::vector<ExplicitType> vecType = { kChar,  kUChar, kShort, kUShort,
+                                                kInt,   kUInt,  kLong,  kULong,
+                                                kFloat, kHalf,  kDouble };
+    // should be in sync with global array size_names
+    const std::vector<unsigned int> vecSizes = { 1, 2, 3, 4, 8, 16 };
 
-    char *program_source;
-    int error;
+    int error = CL_SUCCESS;
     int total_errors = 0;
+    int number_of_results = 0;
 
-    cl_int input_data_int[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-    cl_double input_data_double[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-    void *input_data_converted;
-    void *output_data;
-
-    int number_of_results;;
-
-    input_data_converted = malloc(sizeof(cl_double)*16);
-    program_source = (char*)malloc(sizeof(char)*1024*1024*4);
+    std::vector<char> input_data_converted(sizeof(cl_double) * 16);
+    std::vector<char> program_source(sizeof(char) * 1024 * 1024 * 4);
+    std::vector<char> output_data;
 
     // Iterate over all the types
-    for (int type_index=0; type_index<10; type_index++) {
-    if(!gHasLong && ((vecType[type_index] == kLong)  || (vecType[type_index] == kULong)))
+    for (int type_index = 0; type_index < vecType.size(); type_index++)
     {
-      log_info("Long/ULong data type not supported on this device\n");
-      continue;
-    }
-
-        clMemWrapper input;
 
-        if (vecType[type_index] == kDouble) {
-            if (!is_extension_available(deviceID, "cl_khr_fp64")) {
-                log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
+        if (!gHasLong
+            && ((vecType[type_index] == kLong)
+                || (vecType[type_index] == kULong)))
+        {
+            log_info("Long/ULong data type not supported on this device\n");
+            continue;
+        }
+        else if (vecType[type_index] == kDouble)
+        {
+            if (!is_extension_available(deviceID, "cl_khr_fp64"))
+            {
+                log_info("Extension cl_khr_fp64 not supported; skipping double "
+                         "tests.\n");
                 continue;
             }
-            log_info("Testing doubles.\n");
+            snprintf(extension, sizeof(extension), "%s",
+                     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable");
         }
+        else if (vecType[type_index] == kHalf)
+        {
+            if (!is_extension_available(deviceID, "cl_khr_fp16"))
+            {
+                log_info("Extension cl_khr_fp16 not supported; skipping half "
+                         "tests.\n");
+                continue;
+            }
+            snprintf(extension, sizeof(extension), "%s",
+                     "#pragma OPENCL EXTENSION cl_khr_fp16 : enable");
+        }
+
+        log_info("Testing %s.\n", get_explicit_type_name(vecType[type_index]));
 
         // Convert the data to the right format for the test.
-        memset(input_data_converted, 0xff, sizeof(cl_double)*16);
-        if (vecType[type_index] != kDouble) {
-            for (int j=0; j<16; j++) {
-                convert_explicit_value(&input_data_int[j], ((char*)input_data_converted)+get_explicit_type_size(vecType[type_index])*j,
-                                       kInt, 0, kRoundToEven, vecType[type_index]);
+        memset(input_data_converted.data(), 0xff, sizeof(cl_double) * 16);
+        if (vecType[type_index] == kDouble)
+        {
+            const cl_double input_data_double[16] = { 0,  1,  2,  3, 4,  5,
+                                                      6,  7,  8,  9, 10, 11,
+                                                      12, 13, 14, 15 };
+            memcpy(input_data_converted.data(), &input_data_double,
+                   sizeof(cl_double) * 16);
+        }
+        else if (vecType[type_index] == kHalf)
+        {
+            cl_half *buf =
+                reinterpret_cast<cl_half *>(input_data_converted.data());
+            for (int j = 0; j < 16; j++)
+                buf[j] = cl_half_from_float(float(j), CL_HALF_RTE);
+        }
+        else
+        {
+            for (int j = 0; j < 16; j++)
+            {
+                convert_explicit_value(
+                    &j,
+                    ((char *)input_data_converted.data())
+                        + get_explicit_type_size(vecType[type_index]) * j,
+                    kInt, 0, kRoundToEven, vecType[type_index]);
             }
-        } else {
-            memcpy(input_data_converted, &input_data_double, sizeof(cl_double)*16);
         }
 
-        input = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, get_explicit_type_size(vecType[type_index])*16,
-                               (vecType[type_index] != kDouble) ? input_data_converted : input_data_double, &error);
-        if (error) {
+        clMemWrapper input =
+            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                           get_explicit_type_size(vecType[type_index]) * 16,
+                           input_data_converted.data(), &error);
+        if (error)
+        {
             print_error(error, "clCreateBuffer failed");
             total_errors++;
             continue;
         }
 
         // Iterate over all the vector sizes.
-        for (int size_index=1; size_index< 5; size_index++) {
-            size_t global[] = {1,1,1};
+        for (int size_index = 1; size_index < vecSizes.size(); size_index++)
+        {
+            size_t global[] = { 1, 1, 1 };
             int number_generated = -1;
             int previous_number_generated = 0;
 
-            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[type_index]), size_names[size_index]);
-            while (number_generated != 0) {
+            log_info("Testing %s%s...\n",
+                     get_explicit_type_name(vecType[type_index]),
+                     size_names[size_index]);
+            while (number_generated != 0)
+            {
                 clMemWrapper output;
                 clKernelWrapper kernel;
                 clProgramWrapper program;
 
-                number_generated = create_kernel(vecType[type_index], vecSizes[size_index], program_source, &number_of_results, number_generated);
-                if (number_generated != 0) {
+                number_generated =
+                    create_kernel(vecType[type_index], vecSizes[size_index],
+                                  program_source.data(), &number_of_results,
+                                  number_generated);
+                if (number_generated != 0)
+                {
                     if (previous_number_generated == 0)
-                        log_info("Code size greater than %gkB; splitting test into multiple kernels.\n", MAX_CODE_SIZE/1024.0);
-                    log_info("\tExecuting vector permutations %d to %d...\n", previous_number_generated, number_generated-1);
+                        log_info("Code size greater than %gkB; splitting test "
+                                 "into multiple kernels.\n",
+                                 MAX_CODE_SIZE / 1024.0);
+                    log_info("\tExecuting vector permutations %d to %d...\n",
+                             previous_number_generated, number_generated - 1);
                 }
 
-                error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&program_source, "test_vector_creation");
-                if (error) {
+                char *src = program_source.data();
+                error = create_single_kernel_helper(context, &program, &kernel,
+                                                    1, (const char **)&src,
+                                                    "test_vector_creation");
+                if (error)
+                {
                     log_error("create_single_kernel_helper failed.\n");
                     total_errors++;
                     break;
                 }
 
-                output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
-                                        number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index],
-                                        NULL, &error);
-                if (error) {
+                output = clCreateBuffer(
+                    context, CL_MEM_WRITE_ONLY,
+                    number_of_results
+                        * get_explicit_type_size(vecType[type_index])
+                        * vecSizes[size_index],
+                    NULL, &error);
+                if (error)
+                {
                     print_error(error, "clCreateBuffer failed");
                     total_errors++;
                     break;
@@ -312,95 +391,115 @@ int test_vector_creation(cl_device_id deviceID, cl_context context, cl_command_q
 
                 error = clSetKernelArg(kernel, 0, sizeof(input), &input);
                 error |= clSetKernelArg(kernel, 1, sizeof(output), &output);
-                if (error) {
+                if (error)
+                {
                     print_error(error, "clSetKernelArg failed");
                     total_errors++;
                     break;
                 }
 
-                error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
-                if (error) {
+                error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global,
+                                               NULL, 0, NULL, NULL);
+                if (error)
+                {
                     print_error(error, "clEnqueueNDRangeKernel failed");
                     total_errors++;
                     break;
                 }
 
                 error = clFinish(queue);
-                if (error) {
+                if (error)
+                {
                     print_error(error, "clFinish failed");
                     total_errors++;
                     break;
                 }
 
-                output_data = malloc(number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index]);
-                if (output_data == NULL) {
-                    log_error("Failed to allocate memory for output data.\n");
-                    total_errors++;
-                    break;
-                }
-                memset(output_data, 0xff, number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index]);
-                error = clEnqueueReadBuffer(queue, output, CL_TRUE, 0,
-                                            number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index],
-                                            output_data, 0, NULL, NULL);
-                if (error) {
+                output_data.resize(number_of_results
+                                   * get_explicit_type_size(vecType[type_index])
+                                   * vecSizes[size_index]);
+                memset(output_data.data(), 0xff,
+                       number_of_results
+                           * get_explicit_type_size(vecType[type_index])
+                           * vecSizes[size_index]);
+                error = clEnqueueReadBuffer(
+                    queue, output, CL_TRUE, 0,
+                    number_of_results
+                        * get_explicit_type_size(vecType[type_index])
+                        * vecSizes[size_index],
+                    output_data.data(), 0, NULL, NULL);
+                if (error)
+                {
                     print_error(error, "clEnqueueReadBuffer failed");
                     total_errors++;
-                    free(output_data);
                     break;
                 }
 
                 // Check the results
-                char *res = (char *)output_data;
-                char *exp = (char *)input_data_converted;
-                for (int i=0; i<number_of_results; i++) {
+                char *res = (char *)output_data.data();
+                char *exp = (char *)input_data_converted.data();
+                for (int i = 0; i < number_of_results; i++)
+                {
                     // If they do not match, then print out why
-                    if (memcmp(input_data_converted,
-                               res + i*(get_explicit_type_size(vecType[type_index])*vecSizes[size_index]),
-                               get_explicit_type_size(vecType[type_index])*vecSizes[size_index])
-                        ) {
+                    if (memcmp(exp,
+                               res
+                                   + i
+                                       * (get_explicit_type_size(
+                                              vecType[type_index])
+                                          * vecSizes[size_index]),
+                               get_explicit_type_size(vecType[type_index])
+                                   * vecSizes[size_index]))
+                    {
                         log_error("Data failed to validate for result %d\n", i);
 
-                        // Find the line in the program that failed. This is ugly.
-                        char search[32];
-                        char found_line[1024];
-                        found_line[0]='\0';
-                        search[0]='\0';
+                        // Find the line in the program that failed. This is
+                        // ugly.
+                        char search[32] = { 0 };
+                        char found_line[1024] = { 0 };
                         sprintf(search, "result[%d] = (", i);
-                        char *start_loc = strstr(program_source, search);
+                        char *start_loc = strstr(program_source.data(), search);
                         if (start_loc == NULL)
-                            log_error("Failed to find program source for failure for %s in \n%s", search, program_source);
-                        else {
-                          char *end_loc = strstr(start_loc, "\n");
-                          memcpy(&found_line, start_loc, (end_loc-start_loc));
-                          found_line[end_loc-start_loc]='\0';
-                          log_error("Failed vector line: %s\n", found_line);
+                            log_error("Failed to find program source for "
+                                      "failure for %s in \n%s",
+                                      search, program_source.data());
+                        else
+                        {
+                            char *end_loc = strstr(start_loc, "\n");
+                            memcpy(&found_line, start_loc,
+                                   (end_loc - start_loc));
+                            found_line[end_loc - start_loc] = '\0';
+                            log_error("Failed vector line: %s\n", found_line);
                         }
 
-                        for (int j=0; j<(int)vecSizes[size_index]; j++) {
-                            char expected_value[64];
-                            char returned_value[64];
-                            expected_value[0]='\0';
-                            returned_value[0]='\0';
-                            print_type_to_string(vecType[type_index], (void*)(res+get_explicit_type_size(vecType[type_index])*(i*vecSizes[size_index]+j)), returned_value);
-                            print_type_to_string(vecType[type_index], (void*)(exp+get_explicit_type_size(vecType[type_index])*j), expected_value);
-                            log_error("index [%d, component %d]: got: %s expected: %s\n", i, j,
-                                      returned_value, expected_value);
+                        for (int j = 0; j < (int)vecSizes[size_index]; j++)
+                        {
+                            char expected_value[64] = { 0 };
+                            char returned_value[64] = { 0 };
+                            print_type_to_string(
+                                vecType[type_index],
+                                (void *)(res
+                                         + get_explicit_type_size(
+                                               vecType[type_index])
+                                             * (i * vecSizes[size_index] + j)),
+                                returned_value);
+                            print_type_to_string(
+                                vecType[type_index],
+                                (void *)(exp
+                                         + get_explicit_type_size(
+                                               vecType[type_index])
+                                             * j),
+                                expected_value);
+                            log_error("index [%d, component %d]: got: %s "
+                                      "expected: %s\n",
+                                      i, j, returned_value, expected_value);
                         }
-
                         total_errors++;
                     }
                 }
-                free(output_data);
                 previous_number_generated = number_generated;
             } // number_generated != 0
-
         } // vector sizes
     } // vector types
 
-    free(input_data_converted);
-    free(program_source);
-
     return total_errors;
 }
-
-

From 44b2578ac78b7d559f9055f11b39ad256606f578 Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Fri, 16 Jun 2023 10:53:08 +0100
Subject: [PATCH 05/20] basic: fix unused-but-set variables (#1764)

Remove the unused `numItems` variable.

As this fixes all occurrences of this warning in test_basic, remove
the suppression flag.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 test_conformance/basic/CMakeLists.txt               | 2 --
 test_conformance/basic/test_work_item_functions.cpp | 3 ---
 2 files changed, 5 deletions(-)

diff --git a/test_conformance/basic/CMakeLists.txt b/test_conformance/basic/CMakeLists.txt
index adf24bd80d..c07d32b661 100644
--- a/test_conformance/basic/CMakeLists.txt
+++ b/test_conformance/basic/CMakeLists.txt
@@ -70,6 +70,4 @@ if(APPLE)
     list(APPEND ${MODULE_NAME}_SOURCES test_queue_priority.cpp)
 endif(APPLE)
 
-set_gnulike_module_compile_flags("-Wno-unused-but-set-variable")
-
 include(../CMakeCommon.txt)
diff --git a/test_conformance/basic/test_work_item_functions.cpp b/test_conformance/basic/test_work_item_functions.cpp
index d95915cf53..9683a8342f 100644
--- a/test_conformance/basic/test_work_item_functions.cpp
+++ b/test_conformance/basic/test_work_item_functions.cpp
@@ -91,7 +91,6 @@ int test_work_item_functions(cl_device_id deviceID, cl_context context, cl_comma
     {
         for( int i = 0; i < NUM_TESTS; i++  )
         {
-            size_t numItems = 1;
             for( size_t j = 0; j < dim; j++ )
             {
                 // All of our thread sizes should be within the max local sizes, since they're all <= 20
@@ -100,8 +99,6 @@ int test_work_item_functions(cl_device_id deviceID, cl_context context, cl_comma
                 while( localThreads[ j ] > 1 && ( threads[ j ] % localThreads[ j ] != 0 ) )
                     localThreads[ j ]--;
 
-                numItems *= threads[ j ];
-
                 // Hack for now: localThreads > 1 are iffy
                 localThreads[ j ] = 1;
             }

From 0e229b8f01afc9e16ca83234b656830c26f11215 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 20 Jun 2023 17:42:57 +0200
Subject: [PATCH 06/20] Added cl_khr_fp16 extension support for test_fpmath
 from basic (#1718)

* Added half and double support for fpmath test from basic (issue #142, basic)

* Cosmetic corrections due to code review

* Removed unnecessary casting

* Added corrections due to code review

* Tuning range of input generation to avoid hitting infinity

* Moved string helper procedures as requested in test_commonfns PR #1695
---
 .../harness/stringHelpers.h                   |   0
 test_conformance/basic/CMakeLists.txt         |   2 +-
 test_conformance/basic/main.cpp               |  37 +-
 test_conformance/basic/procs.h                |  10 +-
 test_conformance/basic/test_astype.cpp        |   7 +-
 test_conformance/basic/test_fpmath.cpp        | 386 ++++++++++++++++++
 test_conformance/basic/test_fpmath_float.cpp  | 196 ---------
 7 files changed, 427 insertions(+), 211 deletions(-)
 rename test_conformance/basic/utils.h => test_common/harness/stringHelpers.h (100%)
 create mode 100644 test_conformance/basic/test_fpmath.cpp
 delete mode 100644 test_conformance/basic/test_fpmath_float.cpp

diff --git a/test_conformance/basic/utils.h b/test_common/harness/stringHelpers.h
similarity index 100%
rename from test_conformance/basic/utils.h
rename to test_common/harness/stringHelpers.h
diff --git a/test_conformance/basic/CMakeLists.txt b/test_conformance/basic/CMakeLists.txt
index c07d32b661..c89a93cf0e 100644
--- a/test_conformance/basic/CMakeLists.txt
+++ b/test_conformance/basic/CMakeLists.txt
@@ -2,7 +2,7 @@ set(MODULE_NAME BASIC)
 
 set(${MODULE_NAME}_SOURCES
     main.cpp
-    test_fpmath_float.cpp
+    test_fpmath.cpp
     test_intmath.cpp
     test_hiloeo.cpp test_local.cpp test_pointercast.cpp
     test_if.cpp test_loop.cpp
diff --git a/test_conformance/basic/main.cpp b/test_conformance/basic/main.cpp
index 86c3cec359..24262dbf99 100644
--- a/test_conformance/basic/main.cpp
+++ b/test_conformance/basic/main.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,14 +22,15 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include <CL/cl_half.h>
+
 #include "harness/testHarness.h"
 #include "procs.h"
 
 test_definition test_list[] = {
     ADD_TEST(hostptr),
-    ADD_TEST(fpmath_float),
-    ADD_TEST(fpmath_float2),
-    ADD_TEST(fpmath_float4),
+    ADD_TEST(fpmath),
     ADD_TEST(intmath_int),
     ADD_TEST(intmath_int2),
     ADD_TEST(intmath_int4),
@@ -164,9 +165,35 @@ test_definition test_list[] = {
 };
 
 const int test_num = ARRAY_SIZE( test_list );
+cl_half_rounding_mode halfRoundingMode = CL_HALF_RTE;
+
+test_status InitCL(cl_device_id device)
+{
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        const cl_device_fp_config fpConfigHalf =
+            get_default_rounding_mode(device, CL_DEVICE_HALF_FP_CONFIG);
+        if ((fpConfigHalf & CL_FP_ROUND_TO_NEAREST) != 0)
+        {
+            halfRoundingMode = CL_HALF_RTE;
+        }
+        else if ((fpConfigHalf & CL_FP_ROUND_TO_ZERO) != 0)
+        {
+            halfRoundingMode = CL_HALF_RTZ;
+        }
+        else
+        {
+            log_error("Error while acquiring half rounding mode");
+            return TEST_FAIL;
+        }
+    }
+
+    return TEST_PASS;
+}
 
 int main(int argc, const char *argv[])
 {
-    return runTestHarness(argc, argv, test_num, test_list, false, 0);
+    return runTestHarnessWithCheck(argc, argv, test_num, test_list, false, 0,
+                                   InitCL);
 }
 
diff --git a/test_conformance/basic/procs.h b/test_conformance/basic/procs.h
index c14340de34..9cbc373a3a 100644
--- a/test_conformance/basic/procs.h
+++ b/test_conformance/basic/procs.h
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "harness/kernelHelpers.h"
 #include "harness/testHarness.h"
 #include "harness/errorHelpers.h"
@@ -21,9 +22,8 @@
 #include "harness/rounding_mode.h"
 
 extern int      test_hostptr(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_fpmath_float(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_fpmath_float2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_fpmath_float4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_fpmath(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements);
 extern int      test_intmath_int(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_intmath_int2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_intmath_int4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
diff --git a/test_conformance/basic/test_astype.cpp b/test_conformance/basic/test_astype.cpp
index 08a4cb85aa..45669a7cbd 100644
--- a/test_conformance/basic/test_astype.cpp
+++ b/test_conformance/basic/test_astype.cpp
@@ -14,6 +14,9 @@
 // limitations under the License.
 //
 #include "harness/compat.h"
+#include "harness/conversions.h"
+#include "harness/stringHelpers.h"
+#include "harness/typeWrappers.h"
 
 #include <limits.h>
 #include <stdio.h>
@@ -22,11 +25,7 @@
 #include <sys/stat.h>
 #include <vector>
 
-#include "harness/conversions.h"
-#include "harness/typeWrappers.h"
-
 #include "procs.h"
-#include "utils.h"
 
 // clang-format off
 
diff --git a/test_conformance/basic/test_fpmath.cpp b/test_conformance/basic/test_fpmath.cpp
new file mode 100644
index 0000000000..6719e72816
--- /dev/null
+++ b/test_conformance/basic/test_fpmath.cpp
@@ -0,0 +1,386 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "harness/compat.h"
+#include "harness/rounding_mode.h"
+#include "harness/stringHelpers.h"
+
+#include <CL/cl_half.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "procs.h"
+
+static const char *fp_kernel_code = R"(
+%s
+__kernel void test_fp(__global TYPE *srcA, __global TYPE *srcB, __global TYPE *dst)
+{
+    int  tid = get_global_id(0);
+
+    dst[tid] = srcA[tid] OP srcB[tid];
+})";
+
+extern cl_half_rounding_mode halfRoundingMode;
+
+#define HFF(num) cl_half_from_float(num, halfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
+
+template <typename T> double toDouble(T val)
+{
+    if (std::is_same<cl_half, T>::value)
+        return HTF(val);
+    else
+        return val;
+}
+
+bool isHalfNan(cl_half v)
+{
+    // Extract FP16 exponent and mantissa
+    uint16_t h_exp = (v >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
+    uint16_t h_mant = v & 0x3FF;
+
+    // NaN test
+    return (h_exp == 0x1F && h_mant != 0);
+}
+
+cl_half half_plus(cl_half a, cl_half b)
+{
+    return HFF(std::plus<float>()(HTF(a), HTF(b)));
+}
+
+cl_half half_minus(cl_half a, cl_half b)
+{
+    return HFF(std::minus<float>()(HTF(a), HTF(b)));
+}
+
+cl_half half_mult(cl_half a, cl_half b)
+{
+    return HFF(std::multiplies<float>()(HTF(a), HTF(b)));
+}
+
+template <typename T> struct TestDef
+{
+    const char op;
+    std::function<T(T, T)> ref;
+    std::string type_str;
+    size_t vec_size;
+};
+
+template <typename T>
+int verify_fp(std::vector<T> (&input)[2], std::vector<T> &output,
+              const TestDef<T> &test)
+{
+    auto &inA = input[0];
+    auto &inB = input[1];
+    for (int i = 0; i < output.size(); i++)
+    {
+        bool nan_test = false;
+
+        T r = test.ref(inA[i], inB[i]);
+
+        if (std::is_same<T, cl_half>::value)
+            nan_test = !(isHalfNan(r) && isHalfNan(output[i]));
+
+        if (r != output[i] && nan_test)
+        {
+            log_error("FP math test for type: %s, vec size: %zu, failed at "
+                      "index %d, %a '%c' %a, expected %a, get %a\n",
+                      test.type_str.c_str(), test.vec_size, i, toDouble(inA[i]),
+                      test.op, toDouble(inB[i]), toDouble(r),
+                      toDouble(output[i]));
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+template <typename T> void generate_random_inputs(std::vector<T> (&input)[2])
+{
+    RandomSeed seed(gRandomSeed);
+
+    if (std::is_same<T, float>::value)
+    {
+        auto random_generator = [&seed]() {
+            return get_random_float(-MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),
+                                    MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31), seed);
+        };
+        for (auto &v : input)
+            std::generate(v.begin(), v.end(), random_generator);
+    }
+    else if (std::is_same<T, double>::value)
+    {
+        auto random_generator = [&seed]() {
+            return get_random_double(-MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63),
+                                     MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63),
+                                     seed);
+        };
+        for (auto &v : input)
+            std::generate(v.begin(), v.end(), random_generator);
+    }
+    else
+    {
+        auto random_generator = [&seed]() {
+            return HFF(get_random_float(-MAKE_HEX_FLOAT(0x1.0p8f, 0x1, 8),
+                                        MAKE_HEX_FLOAT(0x1.0p8f, 0x1, 8),
+                                        seed));
+        };
+        for (auto &v : input)
+            std::generate(v.begin(), v.end(), random_generator);
+    }
+}
+
+struct TypesIterator
+{
+    using TypeIter = std::tuple<cl_float, cl_half, cl_double>;
+
+    TypesIterator(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elems)
+        : context(context), queue(queue), fpConfigHalf(0), fpConfigFloat(0),
+          num_elements(num_elems)
+    {
+        // typeid().name one day
+        type2name[sizeof(cl_half)] = "half";
+        type2name[sizeof(cl_float)] = "float";
+        type2name[sizeof(cl_double)] = "double";
+
+        fp16Support = is_extension_available(deviceID, "cl_khr_fp16");
+        fp64Support = is_extension_available(deviceID, "cl_khr_fp64");
+
+        fpConfigFloat = get_default_rounding_mode(deviceID);
+
+        if (fp16Support)
+            fpConfigHalf =
+                get_default_rounding_mode(deviceID, CL_DEVICE_HALF_FP_CONFIG);
+
+        for_each_elem(it);
+    }
+
+    template <typename T> int test_fpmath(TestDef<T> &test)
+    {
+        constexpr size_t vecSizes[] = { 1, 2, 4, 8, 16 };
+        cl_int err = CL_SUCCESS;
+
+        std::ostringstream sstr;
+        if (std::is_same<T, double>::value)
+            sstr << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+
+        if (std::is_same<T, cl_half>::value)
+            sstr << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+
+        std::string program_source =
+            str_sprintf(std::string(fp_kernel_code), sstr.str().c_str());
+
+        for (unsigned i = 0; i < ARRAY_SIZE(vecSizes); i++)
+        {
+            test.vec_size = vecSizes[i];
+
+            std::ostringstream vecNameStr;
+            vecNameStr << test.type_str;
+            if (test.vec_size != 1) vecNameStr << test.vec_size;
+
+            clMemWrapper streams[3];
+            clProgramWrapper program;
+            clKernelWrapper kernel;
+
+            size_t length = sizeof(T) * num_elements * test.vec_size;
+
+            bool isRTZ = false;
+            RoundingMode oldMode = kDefaultRoundingMode;
+
+
+            // If we only support rtz mode
+            if (std::is_same<T, cl_half>::value)
+            {
+                if (CL_FP_ROUND_TO_ZERO == fpConfigHalf)
+                {
+                    isRTZ = true;
+                    oldMode = get_round();
+                }
+            }
+            else if (std::is_same<T, float>::value)
+            {
+                if (CL_FP_ROUND_TO_ZERO == fpConfigFloat)
+                {
+                    isRTZ = true;
+                    oldMode = get_round();
+                }
+            }
+
+            std::vector<T> inputs[]{
+                std::vector<T>(test.vec_size * num_elements),
+                std::vector<T>(test.vec_size * num_elements)
+            };
+            std::vector<T> output =
+                std::vector<T>(test.vec_size * num_elements);
+
+            generate_random_inputs<T>(inputs);
+
+            for (int i = 0; i < ARRAY_SIZE(streams); i++)
+            {
+                streams[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, length,
+                                            NULL, &err);
+                test_error(err, "clCreateBuffer failed.");
+            }
+            for (int i = 0; i < ARRAY_SIZE(inputs); i++)
+            {
+                err =
+                    clEnqueueWriteBuffer(queue, streams[i], CL_TRUE, 0, length,
+                                         inputs[i].data(), 0, NULL, NULL);
+                test_error(err, "clEnqueueWriteBuffer failed.");
+            }
+
+            std::string build_options = "-DTYPE=";
+            build_options.append(vecNameStr.str())
+                .append(" -DOP=")
+                .append(1, test.op);
+
+            const char *ptr = program_source.c_str();
+            err =
+                create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                            "test_fp", build_options.c_str());
+
+            test_error(err, "create_single_kernel_helper failed");
+
+            for (int i = 0; i < ARRAY_SIZE(streams); i++)
+            {
+                err =
+                    clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
+                test_error(err, "clSetKernelArgs failed.");
+            }
+
+            size_t threads[] = { static_cast<size_t>(num_elements) };
+            err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, NULL,
+                                         0, NULL, NULL);
+            test_error(err, "clEnqueueNDRangeKernel failed.");
+
+            err = clEnqueueReadBuffer(queue, streams[2], CL_TRUE, 0, length,
+                                      output.data(), 0, NULL, NULL);
+            test_error(err, "clEnqueueReadBuffer failed.");
+
+            if (isRTZ) set_round(kRoundTowardZero, kfloat);
+
+            err = verify_fp(inputs, output, test);
+
+            if (isRTZ) set_round(oldMode, kfloat);
+
+            test_error(err, "test verification failed");
+            log_info("FP '%c' '%s' test passed\n", test.op,
+                     vecNameStr.str().c_str());
+        }
+
+        return err;
+    }
+
+    template <typename T> int test_fpmath_common()
+    {
+        int err = TEST_PASS;
+        if (std::is_same<cl_half, T>::value)
+        {
+            TestDef<T> tests[] = { { '+', half_plus, type2name[sizeof(T)] },
+                                   { '-', half_minus, type2name[sizeof(T)] },
+                                   { '*', half_mult, type2name[sizeof(T)] } };
+            for (auto &test : tests) err |= test_fpmath<T>(test);
+        }
+        else
+        {
+            TestDef<T> tests[] = {
+                { '+', std::plus<T>(), type2name[sizeof(T)] },
+                { '-', std::minus<T>(), type2name[sizeof(T)] },
+                { '*', std::multiplies<T>(), type2name[sizeof(T)] }
+            };
+            for (auto &test : tests) err |= test_fpmath<T>(test);
+        }
+
+        return err;
+    }
+
+    template <typename T> bool skip_type()
+    {
+        if (std::is_same<double, T>::value && !fp64Support)
+            return true;
+        else if (std::is_same<cl_half, T>::value && !fp16Support)
+            return true;
+        return false;
+    }
+
+    template <std::size_t Cnt = 0, typename Type>
+    void iterate_type(const Type &t)
+    {
+        bool doTest = !skip_type<Type>();
+
+        if (doTest)
+        {
+            if (test_fpmath_common<Type>())
+            {
+                throw std::runtime_error("test_fpmath_common failed\n");
+            }
+        }
+    }
+
+    template <std::size_t Cnt = 0, typename... Tp>
+    inline typename std::enable_if<Cnt == sizeof...(Tp), void>::type
+    for_each_elem(
+        const std::tuple<Tp...> &) // Unused arguments are given no names.
+    {}
+
+    template <std::size_t Cnt = 0, typename... Tp>
+        inline typename std::enable_if < Cnt<sizeof...(Tp), void>::type
+        for_each_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_type<Cnt>(std::get<Cnt>(t));
+        for_each_elem<Cnt + 1, Tp...>(t);
+    }
+
+protected:
+    TypeIter it;
+
+    cl_context context;
+    cl_command_queue queue;
+
+    cl_device_fp_config fpConfigHalf;
+    cl_device_fp_config fpConfigFloat;
+
+    bool fp16Support;
+    bool fp64Support;
+
+    int num_elements;
+    std::map<size_t, std::string> type2name;
+};
+
+int test_fpmath(cl_device_id device, cl_context context, cl_command_queue queue,
+                int num_elements)
+{
+    try
+    {
+        TypesIterator(device, context, queue, num_elements);
+    } catch (const std::runtime_error &e)
+    {
+        log_error("%s", e.what());
+        return TEST_FAIL;
+    }
+
+    return TEST_PASS;
+}
diff --git a/test_conformance/basic/test_fpmath_float.cpp b/test_conformance/basic/test_fpmath_float.cpp
deleted file mode 100644
index fced0f4ecc..0000000000
--- a/test_conformance/basic/test_fpmath_float.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "harness/compat.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "harness/rounding_mode.h"
-
-#include <algorithm>
-#include <functional>
-#include <string>
-#include <vector>
-
-#include "procs.h"
-
-struct TestDef
-{
-    const char op;
-    std::function<float(float, float)> ref;
-};
-
-static const char *fp_kernel_code = R"(
-__kernel void test_fp(__global TYPE *srcA, __global TYPE *srcB, __global TYPE *dst)
-{
-    int  tid = get_global_id(0);
-
-    dst[tid] = srcA[tid] OP srcB[tid];
-})";
-
-static int verify_fp(std::vector<float> (&input)[2], std::vector<float> &output,
-                     const TestDef &test)
-{
-
-    auto &inA = input[0];
-    auto &inB = input[1];
-    for (int i = 0; i < output.size(); i++)
-    {
-        float r = test.ref(inA[i], inB[i]);
-        if (r != output[i])
-        {
-            log_error("FP '%c' float test failed\n", test.op);
-            return -1;
-        }
-    }
-
-    log_info("FP '%c' float test passed\n", test.op);
-    return 0;
-}
-
-
-void generate_random_inputs(std::vector<cl_float> (&input)[2])
-{
-    RandomSeed seed(gRandomSeed);
-
-    auto random_generator = [&seed]() {
-        return get_random_float(-MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),
-                                MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31), seed);
-    };
-
-    for (auto &v : input)
-    {
-        std::generate(v.begin(), v.end(), random_generator);
-    }
-}
-
-template <size_t N>
-int test_fpmath(cl_device_id device, cl_context context, cl_command_queue queue,
-                int num_elements, const std::string type_str,
-                const TestDef &test)
-{
-    clMemWrapper streams[3];
-    clProgramWrapper program;
-    clKernelWrapper kernel;
-
-    int err;
-
-    size_t length = sizeof(cl_float) * num_elements * N;
-
-    int isRTZ = 0;
-    RoundingMode oldMode = kDefaultRoundingMode;
-
-    // If we only support rtz mode
-    if (CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device))
-    {
-        isRTZ = 1;
-        oldMode = get_round();
-    }
-
-
-    std::vector<cl_float> inputs[]{ std::vector<cl_float>(N * num_elements),
-                                    std::vector<cl_float>(N * num_elements) };
-    std::vector<cl_float> output = std::vector<cl_float>(N * num_elements);
-
-    generate_random_inputs(inputs);
-
-    for (int i = 0; i < ARRAY_SIZE(streams); i++)
-    {
-        streams[i] =
-            clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, &err);
-        test_error(err, "clCreateBuffer failed.");
-    }
-    for (int i = 0; i < ARRAY_SIZE(inputs); i++)
-    {
-        err = clEnqueueWriteBuffer(queue, streams[i], CL_TRUE, 0, length,
-                                   inputs[i].data(), 0, NULL, NULL);
-        test_error(err, "clEnqueueWriteBuffer failed.");
-    }
-
-    std::string build_options = "-DTYPE=";
-    build_options.append(type_str).append(" -DOP=").append(1, test.op);
-
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &fp_kernel_code, "test_fp",
-                                      build_options.c_str());
-
-    test_error(err, "create_single_kernel_helper failed");
-
-    for (int i = 0; i < ARRAY_SIZE(streams); i++)
-    {
-        err = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
-        test_error(err, "clSetKernelArgs failed.");
-    }
-
-    size_t threads[] = { static_cast<size_t>(num_elements) };
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, NULL, 0, NULL,
-                                 NULL);
-    test_error(err, "clEnqueueNDRangeKernel failed.");
-
-    err = clEnqueueReadBuffer(queue, streams[2], CL_TRUE, 0, length,
-                              output.data(), 0, NULL, NULL);
-    test_error(err, "clEnqueueReadBuffer failed.");
-
-    if (isRTZ) set_round(kRoundTowardZero, kfloat);
-
-    err = verify_fp(inputs, output, test);
-
-    if (isRTZ) set_round(oldMode, kfloat);
-
-    return err;
-}
-
-
-template <size_t N>
-int test_fpmath_common(cl_device_id device, cl_context context,
-                       cl_command_queue queue, int num_elements,
-                       const std::string type_str)
-{
-    TestDef tests[] = { { '+', std::plus<float>() },
-                        { '-', std::minus<float>() },
-                        { '*', std::multiplies<float>() } };
-    int err = TEST_PASS;
-
-    for (const auto &test : tests)
-    {
-        err |= test_fpmath<N>(device, context, queue, num_elements, type_str,
-                              test);
-    }
-
-    return err;
-}
-
-int test_fpmath_float(cl_device_id device, cl_context context,
-                      cl_command_queue queue, int num_elements)
-{
-    return test_fpmath_common<1>(device, context, queue, num_elements, "float");
-}
-
-int test_fpmath_float2(cl_device_id device, cl_context context,
-                       cl_command_queue queue, int num_elements)
-{
-    return test_fpmath_common<2>(device, context, queue, num_elements,
-                                 "float2");
-}
-
-int test_fpmath_float4(cl_device_id device, cl_context context,
-                       cl_command_queue queue, int num_elements)
-{
-    return test_fpmath_common<4>(device, context, queue, num_elements,
-                                 "float4");
-}

From df3ec8deecdb81661ee61c3c97ae63419b5f4822 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 20 Jun 2023 17:44:45 +0200
Subject: [PATCH 07/20] Added cl_khr_fp16 extension support for test_int2fp
 from basic (#1742)

* Added cl_khr_fp16 and cl_khr_fp64 support for float2int and int2float tests from basic

* Removed debug output

* Replaced the procedure for generating random half values in a specific range (issue #142, basic)

* Applied cosmetic fixes in response to code review comments

* Moved string helper procedures, as requested for test_commonfns PR #1695
---
 test_conformance/basic/CMakeLists.txt     |   2 +-
 test_conformance/basic/main.cpp           |   6 +-
 test_conformance/basic/procs.h            |   6 +-
 test_conformance/basic/test_int2float.cpp | 140 ----------
 test_conformance/basic/test_int2fp.cpp    | 324 ++++++++++++++++++++++
 5 files changed, 332 insertions(+), 146 deletions(-)
 delete mode 100644 test_conformance/basic/test_int2float.cpp
 create mode 100644 test_conformance/basic/test_int2fp.cpp

diff --git a/test_conformance/basic/CMakeLists.txt b/test_conformance/basic/CMakeLists.txt
index c89a93cf0e..47c1c980f1 100644
--- a/test_conformance/basic/CMakeLists.txt
+++ b/test_conformance/basic/CMakeLists.txt
@@ -11,7 +11,7 @@ set(${MODULE_NAME}_SOURCES
     test_multireadimageonefmt.cpp test_multireadimagemultifmt.cpp
     test_imagedim.cpp
     test_vloadstore.cpp
-    test_int2float.cpp
+    test_int2fp.cpp
     test_createkernelsinprogram.cpp
     test_hostptr.cpp
     test_explicit_s2v.cpp
diff --git a/test_conformance/basic/main.cpp b/test_conformance/basic/main.cpp
index 24262dbf99..d1901f95d6 100644
--- a/test_conformance/basic/main.cpp
+++ b/test_conformance/basic/main.cpp
@@ -59,8 +59,8 @@ test_definition test_list[] = {
     ADD_TEST(image_r8),
     ADD_TEST(barrier),
     ADD_TEST_VERSION(wg_barrier, Version(2, 0)),
-    ADD_TEST(int2float),
-    ADD_TEST(float2int),
+    ADD_TEST(int2fp),
+    ADD_TEST(fp2int),
     ADD_TEST(imagereadwrite),
     ADD_TEST(imagereadwrite3d),
     ADD_TEST(readimage3d),
@@ -156,7 +156,7 @@ test_definition test_list[] = {
     ADD_TEST(simple_read_image_pitch),
     ADD_TEST(simple_write_image_pitch),
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     ADD_TEST(queue_priority),
 #endif
 
diff --git a/test_conformance/basic/procs.h b/test_conformance/basic/procs.h
index 9cbc373a3a..b685ecd53c 100644
--- a/test_conformance/basic/procs.h
+++ b/test_conformance/basic/procs.h
@@ -52,8 +52,10 @@ extern int      test_image_r8(cl_device_id deviceID, cl_context context, cl_comm
 extern int      test_simplebarrier(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_barrier(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_wg_barrier(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_int2float(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int      test_float2int(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_int2fp(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements);
+extern int test_fp2int(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements);
 extern int      test_imagearraycopy(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_imagearraycopy3d(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_imagereadwrite(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
diff --git a/test_conformance/basic/test_int2float.cpp b/test_conformance/basic/test_int2float.cpp
deleted file mode 100644
index c5afc2440a..0000000000
--- a/test_conformance/basic/test_int2float.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "harness/compat.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <algorithm>
-#include <vector>
-
-#include "procs.h"
-
-namespace {
-const char *int2float_kernel_code = R"(
-__kernel void test_X2Y(__global TYPE_X *src, __global TYPE_Y *dst)
-{
-    int  tid = get_global_id(0);
-
-    dst[tid] = (TYPE_Y)src[tid];
-
-})";
-
-template <typename T> const char *Type2str() { return ""; }
-template <> const char *Type2str<cl_int>() { return "int"; }
-template <> const char *Type2str<cl_float>() { return "float"; }
-
-template <typename T> void generate_random_inputs(std::vector<T> &v)
-{
-    RandomSeed seed(gRandomSeed);
-
-    auto random_generator = [&seed]() {
-        return get_random_float(-MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),
-                                MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31), seed);
-    };
-
-    std::generate(v.begin(), v.end(), random_generator);
-}
-
-template <typename Tx, typename Ty> bool equal_value(Tx a, Ty b)
-{
-    return a == (Tx)b;
-}
-
-template <typename Tx, typename Ty>
-int verify_X2Y(std::vector<Tx> input, std::vector<Ty> output,
-               const char *test_name)
-{
-
-    if (!std::equal(output.begin(), output.end(), input.begin(),
-                    equal_value<Tx, Ty>))
-    {
-        log_error("%s test failed\n", test_name);
-        return -1;
-    }
-
-    log_info("%s test passed\n", test_name);
-    return 0;
-}
-template <typename Tx, typename Ty>
-int test_X2Y(cl_device_id device, cl_context context, cl_command_queue queue,
-             int num_elements, const char *test_name)
-{
-    clMemWrapper streams[2];
-    clProgramWrapper program;
-    clKernelWrapper kernel;
-    int err;
-
-
-    std::vector<Tx> input(num_elements);
-    std::vector<Ty> output(num_elements);
-
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(Tx) * num_elements, nullptr, &err);
-    test_error(err, "clCreateBuffer failed.");
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(Ty) * num_elements, nullptr, &err);
-    test_error(err, "clCreateBuffer failed.");
-
-    generate_random_inputs(input);
-
-    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
-                               sizeof(Tx) * num_elements, input.data(), 0,
-                               nullptr, nullptr);
-    test_error(err, "clEnqueueWriteBuffer failed.");
-
-    std::string build_options;
-    build_options.append("-DTYPE_X=").append(Type2str<Tx>());
-    build_options.append(" -DTYPE_Y=").append(Type2str<Ty>());
-    err = create_single_kernel_helper(context, &program, &kernel, 1,
-                                      &int2float_kernel_code, "test_X2Y",
-                                      build_options.c_str());
-    test_error(err, "create_single_kernel_helper failed.");
-
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
-    test_error(err, "clSetKernelArg failed.");
-
-    size_t threads[] = { (size_t)num_elements };
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, threads, nullptr, 0,
-                                 nullptr, nullptr);
-    test_error(err, "clEnqueueNDRangeKernel failed.");
-
-    err = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
-                              sizeof(Ty) * num_elements, output.data(), 0,
-                              nullptr, nullptr);
-    test_error(err, "clEnqueueReadBuffer failed.");
-
-    err = verify_X2Y(input, output, test_name);
-
-    return err;
-}
-}
-int test_int2float(cl_device_id device, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return test_X2Y<cl_int, cl_float>(device, context, queue, num_elements,
-                                      "INT2FLOAT");
-}
-int test_float2int(cl_device_id device, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return test_X2Y<cl_float, cl_int>(device, context, queue, num_elements,
-                                      "FLOAT2INT");
-}
diff --git a/test_conformance/basic/test_int2fp.cpp b/test_conformance/basic/test_int2fp.cpp
new file mode 100644
index 0000000000..8b1203a71b
--- /dev/null
+++ b/test_conformance/basic/test_int2fp.cpp
@@ -0,0 +1,324 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "CL/cl_half.h"
+#include "harness/compat.h"
+#include "harness/errorHelpers.h"
+#include "harness/stringHelpers.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <algorithm>
+#include <map>
+#include <vector>
+
+#include "procs.h"
+
+extern cl_half_rounding_mode halfRoundingMode;
+
+#define HFF(num) cl_half_from_float(num, halfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
+
+namespace {
+const char *int2float_kernel_code = R"(
+%s
+__kernel void test_X2Y(__global TYPE_X *src, __global TYPE_Y *dst)
+{
+    int  tid = get_global_id(0);
+
+    dst[tid] = (TYPE_Y)src[tid];
+
+})";
+
+template <bool int2fp> struct TypesIterator
+{
+    TypesIterator(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elems, const char *test_name)
+        : context(context), queue(queue), test_name(test_name),
+          num_elements(num_elems)
+    {
+        fp16Support = is_extension_available(deviceID, "cl_khr_fp16");
+        fp64Support = is_extension_available(deviceID, "cl_khr_fp64");
+
+        type2name[sizeof(cl_half)] = std::make_pair("half", "short");
+        type2name[sizeof(cl_float)] = std::make_pair("float", "int");
+        type2name[sizeof(cl_double)] = std::make_pair("double", "long");
+
+        std::tuple<cl_float, cl_half, cl_double> it;
+        for_each_elem(it);
+    }
+
+    template <typename T> void generate_random_inputs(std::vector<T> &v)
+    {
+        RandomSeed seed(gRandomSeed);
+
+        if (sizeof(T) == sizeof(cl_half))
+        {
+            // Bound generated half values to 0x1.ffcp+14(32752.0) which is the
+            // largest cl_half value smaller than the max value of cl_short,
+            // 32767.
+            if (int2fp)
+            {
+                auto random_generator = [&seed]() {
+                    return (cl_short)get_random_float(
+                        -MAKE_HEX_FLOAT(0x1.ffcp+14, 1.9990234375f, 14),
+                        MAKE_HEX_FLOAT(0x1.ffcp+14, 1.9990234375f, 14), seed);
+                };
+                std::generate(v.begin(), v.end(), random_generator);
+            }
+            else
+            {
+                auto random_generator = [&seed]() {
+                    return HFF(get_random_float(
+                        -MAKE_HEX_FLOAT(0x1.ffcp+14, 1.9990234375f, 14),
+                        MAKE_HEX_FLOAT(0x1.ffcp+14, 1.9990234375f, 14), seed));
+                };
+                std::generate(v.begin(), v.end(), random_generator);
+            }
+        }
+        else if (sizeof(T) == sizeof(cl_float))
+        {
+            auto random_generator = [&seed]() {
+                return get_random_float(-MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),
+                                        MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31),
+                                        seed);
+            };
+            std::generate(v.begin(), v.end(), random_generator);
+        }
+        else if (sizeof(T) == sizeof(cl_double))
+        {
+            auto random_generator = [&seed]() {
+                return get_random_double(-MAKE_HEX_DOUBLE(0x1.0p63, 0x1, 63),
+                                         MAKE_HEX_DOUBLE(0x1.0p63, 0x1, 63),
+                                         seed);
+            };
+            std::generate(v.begin(), v.end(), random_generator);
+        }
+    }
+
+    template <typename Tx, typename Ty> static bool equal_value(Tx a, Ty b)
+    {
+        return a == (Tx)b;
+    }
+
+    static bool equal_value_from_half(cl_short a, cl_half b)
+    {
+        return a == (cl_short)HTF(b);
+    }
+
+    static bool equal_value_to_half(cl_half a, cl_short b)
+    {
+        return a == HFF((float)b);
+    }
+
+
+    // Checks device results against a host-side conversion of each input;
+    // half types use the cl_half helpers, other types a plain cast. Logs
+    // the outcome under test_name and returns 0 on success, -1 on mismatch.
+    template <typename Tx, typename Ty>
+    int verify_X2Y(const std::vector<Tx> &input,
+                   const std::vector<Ty> &output)
+    {
+        bool res = true;
+        if (std::is_same<Tx, cl_half>::value
+            || std::is_same<Ty, cl_half>::value)
+        {
+            // Predicates are called as (output element, input element).
+            if (int2fp)
+                res = std::equal(output.begin(), output.end(), input.begin(),
+                                 equal_value_to_half);
+            else
+                res = std::equal(output.begin(), output.end(), input.begin(),
+                                 equal_value_from_half);
+        }
+        else
+        {
+            res = std::equal(output.begin(), output.end(), input.begin(),
+                             equal_value<Tx, Ty>);
+        }
+
+        if (!res)
+        {
+            log_error("%s test failed\n", test_name.c_str());
+            return -1;
+        }
+        log_info("%s test passed\n", test_name.c_str());
+        return 0;
+    }
+
+    template <typename Tx, typename Ty> int test_X2Y()
+    {
+        clMemWrapper streams[2];
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        int err;
+
+        std::vector<Tx> input(num_elements);
+        std::vector<Ty> output(num_elements);
+
+        streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                    sizeof(Tx) * num_elements, nullptr, &err);
+        test_error(err, "clCreateBuffer failed.");
+        streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                    sizeof(Ty) * num_elements, nullptr, &err);
+        test_error(err, "clCreateBuffer failed.");
+
+        generate_random_inputs(input);
+
+        err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
+                                   sizeof(Tx) * num_elements, input.data(), 0,
+                                   nullptr, nullptr);
+        test_error(err, "clEnqueueWriteBuffer failed.");
+
+        std::string src_name = type2name[sizeof(Tx)].first;
+        std::string dst_name = type2name[sizeof(Tx)].second;
+        if (int2fp) std::swap(src_name, dst_name);
+
+        std::string build_options;
+        build_options.append("-DTYPE_X=").append(src_name.c_str());
+        build_options.append(" -DTYPE_Y=").append(dst_name.c_str());
+
+        std::string extension;
+        if (sizeof(Tx) == sizeof(cl_double))
+            extension = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+
+        if (sizeof(Tx) == sizeof(cl_half))
+            extension = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+
+        std::string kernelSource =
+            str_sprintf(int2float_kernel_code, extension.c_str());
+        const char *ptr = kernelSource.c_str();
+
+        err = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                          "test_X2Y", build_options.c_str());
+        test_error(err, "create_single_kernel_helper failed.");
+
+        err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
+        err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
+        test_error(err, "clSetKernelArg failed.");
+
+        size_t threads[] = { (size_t)num_elements };
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, threads,
+                                     nullptr, 0, nullptr, nullptr);
+        test_error(err, "clEnqueueNDRangeKernel failed.");
+
+        err = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                  sizeof(Ty) * num_elements, output.data(), 0,
+                                  nullptr, nullptr);
+        test_error(err, "clEnqueueReadBuffer failed.");
+
+        err = verify_X2Y(input, output);
+
+        return err;
+    }
+
+    // Reports whether the test for floating-point type T must be skipped:
+    // T is double or cl_half while fp64Support / fp16Support is false.
+    template <typename T> bool skip_type()
+    {
+        const bool lacks_fp64 = std::is_same<double, T>::value && !fp64Support;
+        const bool lacks_fp16 = std::is_same<cl_half, T>::value && !fp16Support;
+        return lacks_fp64 || lacks_fp16;
+    }
+
+    template <std::size_t Cnt = 0, typename T> void iterate_type(const T &t)
+    {
+        bool doTest = !skip_type<T>();
+
+        if (doTest)
+        {
+            typedef typename std::conditional<
+                (sizeof(T) == sizeof(std::int16_t)), std::int16_t,
+                typename std::conditional<(sizeof(T) == sizeof(std::int32_t)),
+                                          std::int32_t,
+                                          std::int64_t>::type>::type U;
+            if (int2fp)
+            {
+                if (test_X2Y<U, T>())
+                    throw std::runtime_error("test_X2Y failed\n");
+            }
+            else
+            {
+                if (test_X2Y<T, U>())
+                    throw std::runtime_error("test_X2Y failed\n");
+            }
+        }
+    }
+
+    template <std::size_t Cnt = 0, typename... Tp>
+    inline typename std::enable_if<Cnt == sizeof...(Tp), void>::type
+    for_each_elem(
+        const std::tuple<Tp...> &) // Unused arguments are given no names.
+    {}
+
+    template <std::size_t Cnt = 0, typename... Tp>
+    inline typename std::enable_if<(Cnt < sizeof...(Tp)), void>::type
+    for_each_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_type<Cnt>(std::get<Cnt>(t)); // test the Cnt-th tuple type
+        for_each_elem<Cnt + 1, Tp...>(t); // then advance to the next one
+    }
+
+protected:
+    cl_context context;
+    cl_command_queue queue;
+
+    cl_device_fp_config fpConfigHalf;
+    cl_device_fp_config fpConfigFloat;
+
+    bool fp16Support;
+    bool fp64Support;
+
+    std::map<size_t, std::pair<std::string, std::string>> type2name;
+
+    std::string test_name;
+    int num_elements;
+};
+
+}
+
+int test_int2fp(cl_device_id device, cl_context context, cl_command_queue queue,
+                int num_elements)
+{
+    try
+    {
+        TypesIterator<true>(device, context, queue, num_elements, "INT2FP");
+    } catch (const std::runtime_error &e)
+    {
+        log_error("%s", e.what());
+        return TEST_FAIL;
+    }
+
+    return TEST_PASS;
+}
+
+int test_fp2int(cl_device_id device, cl_context context, cl_command_queue queue,
+                int num_elements)
+{
+    try
+    {
+        TypesIterator<false>(device, context, queue, num_elements, "FP2INT");
+    } catch (const std::runtime_error &e)
+    {
+        log_error("%s", e.what());
+        return TEST_FAIL;
+    }
+
+    return TEST_PASS;
+}

From 50f9f063236394eea1edfab92bb4ebebd8c33b78 Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Wed, 21 Jun 2023 15:19:21 +0100
Subject: [PATCH 08/20] test_common: fix -Wsign-compare warnings (#1759)

In preparation for re-enabling `-Wsign-compare` globally, avoid mixing
signed and unsigned integers in comparisons in test_common.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 test_common/gl/helpers.cpp          | 4 ++--
 test_common/harness/testHarness.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_common/gl/helpers.cpp b/test_common/gl/helpers.cpp
index b9f95a94a8..1fb85035e5 100644
--- a/test_common/gl/helpers.cpp
+++ b/test_common/gl/helpers.cpp
@@ -1715,7 +1715,7 @@ void * CreateGLRenderbuffer( GLsizei width, GLsizei height,
         // Reverse and reorder to validate since in the
         // kernel the read_imagef() call always returns RGBA
         cl_uchar *p = (cl_uchar *)buffer;
-        for( size_t i = 0; i < (size_t)width * height; i++ )
+        for (GLsizei i = 0; i < width * height; i++)
         {
             cl_uchar uc0 = p[i * 4 + 0];
             cl_uchar uc1 = p[i * 4 + 1];
@@ -1733,7 +1733,7 @@ void * CreateGLRenderbuffer( GLsizei width, GLsizei height,
       // Reverse and reorder to validate since in the
       // kernel the read_imagef() call always returns RGBA
       cl_uchar *p = (cl_uchar *)buffer;
-      for( size_t i = 0; i < width * height; i++ )
+      for (GLsizei i = 0; i < width * height; i++)
       {
         cl_uchar uc0 = p[i * 4 + 0];
         cl_uchar uc1 = p[i * 4 + 1];
diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index 95ea81631e..3d743e717f 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -835,9 +835,9 @@ void callTestFunctions(test_definition testList[],
         std::vector<std::thread *> threads;
         test_harness_state state = { testList, resultTestList, deviceToUse,
                                      config };
-        for (int i = 0; i < config.numWorkerThreads; i++)
+        for (unsigned i = 0; i < config.numWorkerThreads; i++)
         {
-            log_info("Spawning worker thread %i\n", i);
+            log_info("Spawning worker thread %u\n", i);
             threads.push_back(new std::thread(test_function_runner, &state));
         }
 

From 2e88013b34586c10fb8cc9eb0320e5587ce94785 Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Thu, 22 Jun 2023 06:08:21 +0100
Subject: [PATCH 09/20] compiler: fix memory leak from unnecessary strdup
 (#1761)

The result of the `strdup` was never freed.  The string duplication
isn't necessary, so remove it.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 test_conformance/compiler/test_compile.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_conformance/compiler/test_compile.cpp b/test_conformance/compiler/test_compile.cpp
index f3ee431224..d250bdd477 100644
--- a/test_conformance/compiler/test_compile.cpp
+++ b/test_conformance/compiler/test_compile.cpp
@@ -462,7 +462,7 @@ int test_large_multiple_embedded_headers(cl_context context, cl_device_id device
         header_names[i] = _strdup(buffer);
 
         sprintf(buffer, composite_kernel_extern_template, i);
-        const char* line = _strdup(buffer);
+        const char *line = buffer;
         error = create_single_kernel_helper_create_program(context, &headers[i], 1, &line);
         if( headers[i] == NULL || error != CL_SUCCESS )
         {

From 60f025a7da5ab2456ba41405e9fdf655ce948eac Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 27 Jun 2023 17:40:35 +0200
Subject: [PATCH 10/20] Added cl_half support for test_select (#1617)

* Added cl_half support for test_select (issue #142, select)

* Added corrections due to code review + performance optimization + replaced C object with wrappers

* minor fix

* Corrected use of user event

* Removed unnecessary user event
---
 test_conformance/select/test_select.cpp | 295 ++++-----
 test_conformance/select/test_select.h   |  24 +-
 test_conformance/select/util_select.cpp | 779 ++++++++++++++----------
 3 files changed, 576 insertions(+), 522 deletions(-)

diff --git a/test_conformance/select/test_select.cpp b/test_conformance/select/test_select.cpp
index b0cda09fd1..8a0567c34b 100644
--- a/test_conformance/select/test_select.cpp
+++ b/test_conformance/select/test_select.cpp
@@ -14,11 +14,14 @@
 // limitations under the License.
 //
 #include "harness/compat.h"
+#include "harness/typeWrappers.h"
 
 #include <assert.h>
 #include <stdio.h>
 #include <time.h>
 #include <string.h>
+#include <vector>
+
 #if ! defined( _WIN32)
 #if defined(__APPLE__)
 #include <sys/sysctl.h>
@@ -66,6 +69,16 @@ static void printUsage( void );
 #define BUFFER_SIZE (1024*1024)
 #define KPAGESIZE 4096
 
+#define test_error_count(errCode, msg)                                         \
+    { /* Evaluate errCode once; on failure bump gFailCount and return it. */   \
+        auto errCodeResult = errCode;                                          \
+        if (errCodeResult != CL_SUCCESS)                                       \
+        {                                                                      \
+            gFailCount++;                                                      \
+            print_error(errCodeResult, msg);                                   \
+            return errCodeResult;                                              \
+        }                                                                      \
+    }
 
 // When we indicate non wimpy mode, the types that are 32 bits value will
 // test their entire range and 64 bits test will test the 32 bit
@@ -74,12 +87,6 @@ static void printUsage( void );
 static bool  s_wimpy_mode = false;
 static int s_wimpy_reduction_factor = 256;
 
-// Tests are broken into the major test which is based on the
-// src and cmp type and their corresponding vector types and
-// sub tests which is for each individual test.  The following
-// tracks the subtests
-int s_test_cnt = 0;
-
 //-----------------------------------------
 // Static helper functions
 //-----------------------------------------
@@ -237,6 +244,9 @@ static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context cont
     if (srctype == kdouble)
         strcpy( extension, "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" );
 
+    if (srctype == khalf)
+        strcpy(extension, "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
+
     // create type name and testname
     switch( vec_len )
     {
@@ -288,25 +298,14 @@ static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context cont
     return program;
 }
 
-
 #define VECTOR_SIZE_COUNT   6
 
 static int doTest(cl_command_queue queue, cl_context context, Type stype, Type cmptype, cl_device_id device)
 {
     int err = CL_SUCCESS;
-    int s_test_fail = 0;
-    MTdataHolder d;
+    MTdataHolder d(gRandomSeed);
     const size_t element_count[VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 };
-    cl_mem src1 = NULL;
-    cl_mem src2 = NULL;
-    cl_mem cmp = NULL;
-    cl_mem dest = NULL;
-    void *ref = NULL;
-    void *sref = NULL;
-    void *src1_host = NULL;
-    void *src2_host = NULL;
-    void *cmp_host = NULL;
-    void *dest_host = NULL;
+    clMemWrapper src1, src2, cmp, dest;
 
     cl_ulong blocks = type_size[stype] * 0x100000000ULL / BUFFER_SIZE;
     size_t block_elements = BUFFER_SIZE / type_size[stype];
@@ -315,16 +314,22 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
 
     // It is more efficient to create the tests all at once since we
     // use the same test data on each of the vector sizes
-    int vecsize;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel  kernels[VECTOR_SIZE_COUNT];
+    clProgramWrapper programs[VECTOR_SIZE_COUNT];
+    clKernelWrapper kernels[VECTOR_SIZE_COUNT];
 
-    if(stype == kdouble && ! is_extension_available( device, "cl_khr_fp64" ))
+    if (stype == kdouble && !is_extension_available(device, "cl_khr_fp64"))
     {
         log_info("Skipping double because cl_khr_fp64 extension is not supported.\n");
         return 0;
     }
 
+    if (stype == khalf && !is_extension_available(device, "cl_khr_fp16"))
+    {
+        log_info(
+            "Skipping half because cl_khr_fp16 extension is not supported.\n");
+        return 0;
+    }
+
     if (gIsEmbedded)
     {
        if (( stype == klong || stype == kulong ) && ! is_extension_available( device, "cles_khr_int64" ))
@@ -340,54 +345,41 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
        }
     }
 
-    for (vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
-    {
-        programs[vecsize] = makeSelectProgram(&kernels[vecsize], context, stype, cmptype, element_count[vecsize] );
-        if (!programs[vecsize] || !kernels[vecsize]) {
-            ++s_test_fail;
-            ++s_test_cnt;
-            return -1;
-        }
-    }
-
-    ref = malloc( BUFFER_SIZE );
-    if( NULL == ref ){ log_error("Error: could not allocate ref buffer\n" ); goto exit; }
-    sref = malloc( BUFFER_SIZE );
-    if( NULL == sref ){ log_error("Error: could not allocate ref buffer\n" ); goto exit; }
     src1 = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate src1 buffer\n" );  ++s_test_fail; goto exit; }
+    test_error_count(err, "Error: could not allocate src1 buffer\n");
     src2 = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate src2 buffer\n" );  ++s_test_fail; goto exit; }
+    test_error_count(err, "Error: could not allocate src2 buffer\n");
     cmp = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate cmp buffer\n" );  ++s_test_fail; goto exit; }
+    test_error_count(err, "Error: could not allocate cmp buffer\n");
     dest = clCreateBuffer( context, CL_MEM_WRITE_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate dest buffer\n" );  ++s_test_fail; goto exit; }
+    test_error_count(err, "Error: could not allocate dest buffer\n");
 
-    src1_host = malloc(BUFFER_SIZE);
-    if (NULL == src1_host)
-    {
-        log_error("Error: could not allocate src1_host buffer\n");
-        goto exit;
-    }
-    src2_host = malloc(BUFFER_SIZE);
-    if (NULL == src2_host)
-    {
-        log_error("Error: could not allocate src2_host buffer\n");
-        goto exit;
-    }
-    cmp_host = malloc(BUFFER_SIZE);
-    if (NULL == cmp_host)
+    for (int vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
     {
-        log_error("Error: could not allocate cmp_host buffer\n");
-        goto exit;
-    }
-    dest_host = malloc(BUFFER_SIZE);
-    if (NULL == dest_host)
-    {
-        log_error("Error: could not allocate dest_host buffer\n");
-        goto exit;
+        programs[vecsize] = makeSelectProgram(&kernels[vecsize], context, stype,
+                                              cmptype, element_count[vecsize]);
+        if (!programs[vecsize] || !kernels[vecsize])
+        {
+            return -1;
+        }
+
+        err = clSetKernelArg(kernels[vecsize], 0, sizeof dest, &dest);
+        test_error_count(err, "Error: Cannot set kernel arg dest!\n");
+        err = clSetKernelArg(kernels[vecsize], 1, sizeof src1, &src1);
+        test_error_count(err, "Error: Cannot set kernel arg dest!\n");
+        err = clSetKernelArg(kernels[vecsize], 2, sizeof src2, &src2);
+        test_error_count(err, "Error: Cannot set kernel arg dest!\n");
+        err = clSetKernelArg(kernels[vecsize], 3, sizeof cmp, &cmp);
+        test_error_count(err, "Error: Cannot set kernel arg dest!\n");
     }
 
+    std::vector<char> ref(BUFFER_SIZE);
+    std::vector<char> sref(BUFFER_SIZE);
+    std::vector<char> src1_host(BUFFER_SIZE);
+    std::vector<char> src2_host(BUFFER_SIZE);
+    std::vector<char> cmp_host(BUFFER_SIZE);
+    std::vector<char> dest_host(BUFFER_SIZE);
+
     // We block the test as we are running over the range of compare values
     // "block the test" means "break the test into blocks"
     if( type_size[stype] == 4 )
@@ -396,111 +388,63 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
         cmp_stride = block_elements * step * (0xffffffffffffffffULL / 0x100000000ULL + 1);
 
     log_info("Testing...");
-    d = MTdataHolder(gRandomSeed);
     uint64_t i;
+
     for (i=0; i < blocks; i+=step)
     {
-        void *s1 = clEnqueueMapBuffer( queue, src1, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-        if( err ){ log_error( "Error: Could not map src1" ); goto exit; }
-        // Setup the input data to change for each block
-        initSrcBuffer( s1, stype, d);
-
-        void *s2 = clEnqueueMapBuffer( queue, src2, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-        if( err ){ log_error( "Error: Could not map src2" ); goto exit; }
-        // Setup the input data to change for each block
-        initSrcBuffer( s2, stype, d);
-
-        void *s3 = clEnqueueMapBuffer( queue, cmp, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-        if( err ){ log_error( "Error: Could not map cmp" ); goto exit; }
-        // Setup the input data to change for each block
-        initCmpBuffer(s3, cmptype, i * cmp_stride, block_elements);
-
-        if( (err = clEnqueueUnmapMemObject( queue, src1, s1, 0, NULL, NULL )))
-        { log_error( "Error: coult not unmap src1\n" );  ++s_test_fail; goto exit; }
-        if( (err = clEnqueueUnmapMemObject( queue, src2, s2, 0, NULL, NULL )))
-        { log_error( "Error: coult not unmap src2\n" );  ++s_test_fail; goto exit; }
-        if( (err = clEnqueueUnmapMemObject( queue, cmp, s3, 0, NULL, NULL )))
-        { log_error( "Error: coult not unmap cmp\n" );  ++s_test_fail; goto exit; }
-
-        // Create the reference result
-        err = clEnqueueReadBuffer(queue, src1, CL_TRUE, 0, BUFFER_SIZE,
-                                  src1_host, 0, NULL, NULL);
-        if (err)
-        {
-            log_error("Error: Reading buffer from src1 to src1_host failed\n");
-            ++s_test_fail;
-            goto exit;
-        }
-        err = clEnqueueReadBuffer(queue, src2, CL_TRUE, 0, BUFFER_SIZE,
-                                  src2_host, 0, NULL, NULL);
-        if (err)
-        {
-            log_error("Error: Reading buffer from src2 to src2_host failed\n");
-            ++s_test_fail;
-            goto exit;
-        }
-        err = clEnqueueReadBuffer(queue, cmp, CL_TRUE, 0, BUFFER_SIZE, cmp_host,
-                                  0, NULL, NULL);
-        if (err)
-        {
-            log_error("Error: Reading buffer from cmp to cmp_host failed\n");
-            ++s_test_fail;
-            goto exit;
-        }
+        initSrcBuffer(src1_host.data(), stype, d);
+        initSrcBuffer(src2_host.data(), stype, d);
+        initCmpBuffer(cmp_host.data(), cmptype, i * cmp_stride, block_elements);
+
+        err = clEnqueueWriteBuffer(queue, src1, CL_FALSE, 0, BUFFER_SIZE,
+                                   src1_host.data(), 0, NULL, NULL);
+        test_error_count(err, "Error: Could not write src1");
+
+        err = clEnqueueWriteBuffer(queue, src2, CL_FALSE, 0, BUFFER_SIZE,
+                                   src2_host.data(), 0, NULL, NULL);
+        test_error_count(err, "Error: Could not write src2");
+
+        err = clEnqueueWriteBuffer(queue, cmp, CL_FALSE, 0, BUFFER_SIZE,
+                                   cmp_host.data(), 0, NULL, NULL);
+        test_error_count(err, "Error: Could not write cmp");
 
         Select sfunc = (cmptype == ctype[stype][0]) ? vrefSelects[stype][0]
                                                     : vrefSelects[stype][1];
-        (*sfunc)(ref, src1_host, src2_host, cmp_host, block_elements);
+        (*sfunc)(ref.data(), src1_host.data(), src2_host.data(),
+                 cmp_host.data(), block_elements);
 
         sfunc = (cmptype == ctype[stype][0]) ? refSelects[stype][0]
                                              : refSelects[stype][1];
-        (*sfunc)(sref, src1_host, src2_host, cmp_host, block_elements);
+        (*sfunc)(sref.data(), src1_host.data(), src2_host.data(),
+                 cmp_host.data(), block_elements);
 
-        for (vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
+        for (int vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
         {
             size_t vector_size = element_count[vecsize] * type_size[stype];
             size_t vector_count =  (BUFFER_SIZE + vector_size - 1) / vector_size;
 
-            if((err = clSetKernelArg(kernels[vecsize], 0,  sizeof dest, &dest) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-            if((err = clSetKernelArg(kernels[vecsize], 1,  sizeof src1, &src1) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-            if((err = clSetKernelArg(kernels[vecsize], 2,  sizeof src2, &src2) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-            if((err = clSetKernelArg(kernels[vecsize], 3,  sizeof cmp, &cmp) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-
-            // Wipe destination
-            void *d = clEnqueueMapBuffer( queue, dest, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-            if( err ){ log_error( "Error: Could not map dest" );  ++s_test_fail; goto exit; }
-            memset( d, -1, BUFFER_SIZE );
-            if( (err = clEnqueueUnmapMemObject( queue, dest, d, 0, NULL, NULL ) ) ){ log_error( "Error: Could not unmap dest" ); ++s_test_fail; goto exit; }
+            const cl_int pattern = -1;
+            err = clEnqueueFillBuffer(queue, dest, &pattern, sizeof(cl_int), 0,
+                                      BUFFER_SIZE, 0, nullptr, nullptr);
+            test_error_count(err, "clEnqueueFillBuffer failed");
+
 
             err = clEnqueueNDRangeKernel(queue, kernels[vecsize], 1, NULL, &vector_count, NULL, 0, NULL, NULL);
-            if (err != CL_SUCCESS) {
-                log_error("clEnqueueNDRangeKernel failed errcode:%d\n", err);
-                ++s_test_fail;
-                goto exit;
-            }
+            test_error_count(err, "clEnqueueNDRangeKernel failed errcode\n");
 
             err = clEnqueueReadBuffer(queue, dest, CL_TRUE, 0, BUFFER_SIZE,
-                                      dest_host, 0, NULL, NULL);
-            if (err)
-            {
-                log_error(
-                    "Error: Reading buffer from dest to dest_host failed\n");
-                ++s_test_fail;
-                goto exit;
-            }
+                                      dest_host.data(), 0, NULL, NULL);
+            test_error_count(
+                err, "Error: Reading buffer from dest to dest_host failed\n");
 
-            if ((*checkResults[stype])(dest_host, vecsize == 0 ? sref : ref,
+            if ((*checkResults[stype])(dest_host.data(),
+                                       vecsize == 0 ? sref.data() : ref.data(),
                                        block_elements, element_count[vecsize])
                 != 0)
             {
                 log_error("vec_size:%d indx: 0x%16.16llx\n",
                           (int)element_count[vecsize], i);
-                ++s_test_fail;
-                goto exit;
+                return TEST_FAIL;
             }
         } // for vecsize
     } // for i
@@ -510,28 +454,6 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
     else
         log_info(" Wimpy Passed\n\n");
 
-exit:
-    if( src1 )  clReleaseMemObject( src1 );
-    if( src2 )  clReleaseMemObject( src2 );
-    if( cmp )   clReleaseMemObject( cmp );
-    if( dest)   clReleaseMemObject( dest );
-    if( ref )   free(ref );
-    if( sref )  free(sref );
-    if (src1_host) free(src1_host);
-    if (src2_host) free(src2_host);
-    if (cmp_host) free(cmp_host);
-    if (dest_host) free(dest_host);
-
-    for (vecsize = 0; vecsize < VECTOR_SIZE_COUNT; vecsize++) {
-        clReleaseKernel(kernels[vecsize]);
-        clReleaseProgram(programs[vecsize]);
-    }
-    ++s_test_cnt;
-    if (s_test_fail)
-    {
-        err = TEST_FAIL;
-        gFailCount++;
-    }
     return err;
 }
 
@@ -567,6 +489,16 @@ int test_select_short_short(cl_device_id deviceID, cl_context context, cl_comman
 {
     return doTest(queue, context, kshort, kshort, deviceID);
 }
+int test_select_half_ushort(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements)
+{
+    return doTest(queue, context, khalf, kushort, deviceID);
+}
+int test_select_half_short(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements)
+{
+    return doTest(queue, context, khalf, kshort, deviceID);
+}
 int test_select_uint_uint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return doTest(queue, context, kuint, kuint, deviceID);
@@ -617,26 +549,17 @@ int test_select_double_long(cl_device_id deviceID, cl_context context, cl_comman
 }
 
 test_definition test_list[] = {
-    ADD_TEST( select_uchar_uchar ),
-    ADD_TEST( select_uchar_char ),
-    ADD_TEST( select_char_uchar ),
-    ADD_TEST( select_char_char ),
-    ADD_TEST( select_ushort_ushort ),
-    ADD_TEST( select_ushort_short ),
-    ADD_TEST( select_short_ushort ),
-    ADD_TEST( select_short_short ),
-    ADD_TEST( select_uint_uint ),
-    ADD_TEST( select_uint_int ),
-    ADD_TEST( select_int_uint ),
-    ADD_TEST( select_int_int ),
-    ADD_TEST( select_float_uint ),
-    ADD_TEST( select_float_int ),
-    ADD_TEST( select_ulong_ulong ),
-    ADD_TEST( select_ulong_long ),
-    ADD_TEST( select_long_ulong ),
-    ADD_TEST( select_long_long ),
-    ADD_TEST( select_double_ulong ),
-    ADD_TEST( select_double_long ),
+    ADD_TEST(select_uchar_uchar),   ADD_TEST(select_uchar_char),
+    ADD_TEST(select_char_uchar),    ADD_TEST(select_char_char),
+    ADD_TEST(select_ushort_ushort), ADD_TEST(select_ushort_short),
+    ADD_TEST(select_short_ushort),  ADD_TEST(select_short_short),
+    ADD_TEST(select_half_ushort),   ADD_TEST(select_half_short),
+    ADD_TEST(select_uint_uint),     ADD_TEST(select_uint_int),
+    ADD_TEST(select_int_uint),      ADD_TEST(select_int_int),
+    ADD_TEST(select_float_uint),    ADD_TEST(select_float_int),
+    ADD_TEST(select_ulong_ulong),   ADD_TEST(select_ulong_long),
+    ADD_TEST(select_long_ulong),    ADD_TEST(select_long_long),
+    ADD_TEST(select_double_ulong),  ADD_TEST(select_double_long),
 };
 
 const int test_num = ARRAY_SIZE( test_list );
diff --git a/test_conformance/select/test_select.h b/test_conformance/select/test_select.h
index c51ae13c2c..5cd786022b 100644
--- a/test_conformance/select/test_select.h
+++ b/test_conformance/select/test_select.h
@@ -28,18 +28,20 @@
 #endif
 
 // Defines the set of types we support (no support for double)
-typedef enum {
+typedef enum
+{
     kuchar = 0,
     kchar = 1,
     kushort = 2,
     kshort = 3,
-    kuint = 4,
-    kint = 5,
-    kfloat = 6,
-    kulong = 7,
-    klong = 8,
-    kdouble = 9,
-    kTypeCount  // always goes last
+    khalf = 4,
+    kuint = 5,
+    kint = 6,
+    kfloat = 7,
+    kulong = 8,
+    klong = 9,
+    kdouble = 10,
+    kTypeCount // always goes last
 } Type;
 
 
@@ -56,7 +58,8 @@ extern const size_t type_size[kTypeCount];
 extern const Type ctype[kTypeCount][2];
 
 // Reference functions for the primitive (non vector) type
-typedef void (*Select)(void *dest, void *src1, void *src2, void *cmp, size_t c);
+typedef void (*Select)(void *const dest, const void *const src1,
+                       const void *const src2, const void *const cmp, size_t c);
 extern Select refSelects[kTypeCount][2];
 
 // Reference functions for the primtive type but uses the vector
@@ -64,7 +67,8 @@ extern Select refSelects[kTypeCount][2];
 extern Select vrefSelects[kTypeCount][2];
 
 // Check functions for each output type
-typedef size_t (*CheckResults)(void *out1, void *out2, size_t count, size_t vectorSize);
+typedef size_t (*CheckResults)(const void *const out1, const void *const out2,
+                               size_t count, size_t vectorSize);
 extern CheckResults checkResults[kTypeCount];
 
 // Helpful macros
diff --git a/test_conformance/select/util_select.cpp b/test_conformance/select/util_select.cpp
index f9641e9938..b85f54a762 100644
--- a/test_conformance/select/util_select.cpp
+++ b/test_conformance/select/util_select.cpp
@@ -13,7 +13,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
 #include "harness/errorHelpers.h"
 
 #include <stdio.h>
@@ -25,29 +24,28 @@
 //-----------------------------------------
 
 
-const char *type_name[kTypeCount] = {
-    "uchar", "char",
-    "ushort", "short",
-    "uint",   "int",
-    "float",  "ulong", "long", "double" };
+const char *type_name[kTypeCount] = { "uchar", "char", "ushort", "short",
+                                      "half",  "uint", "int",    "float",
+                                      "ulong", "long", "double" };
 
 const size_t type_size[kTypeCount] = {
-    sizeof(cl_uchar), sizeof(cl_char),
-    sizeof(cl_ushort), sizeof(cl_short),
-    sizeof(cl_uint), sizeof(cl_int),
-    sizeof(cl_float), sizeof(cl_ulong), sizeof(cl_long), sizeof( cl_double ) };
+    sizeof(cl_uchar), sizeof(cl_char), sizeof(cl_ushort), sizeof(cl_short),
+    sizeof(cl_half),  sizeof(cl_uint), sizeof(cl_int),    sizeof(cl_float),
+    sizeof(cl_ulong), sizeof(cl_long), sizeof(cl_double)
+};
 
 const Type ctype[kTypeCount][2] = {
-    { kuchar,  kchar },     // uchar
-    { kuchar,  kchar },     // char
-    { kushort, kshort},     // ushort
-    { kushort, kshort},     // short
-    { kuint,   kint  },     // uint
-    { kuint,   kint  },     // int
-    { kuint,   kint  },     // float
-    { kulong,  klong },     // ulong
-    { kulong,  klong },     // long
-    { kulong,  klong }     // double
+    { kuchar, kchar }, // uchar
+    { kuchar, kchar }, // char
+    { kushort, kshort }, // ushort
+    { kushort, kshort }, // short
+    { kushort, kshort }, // half
+    { kuint, kint }, // uint
+    { kuint, kint }, // int
+    { kuint, kint }, // float
+    { kulong, klong }, // ulong
+    { kulong, klong }, // long
+    { kulong, klong } // double
 };
 
 
@@ -55,510 +53,594 @@ const Type ctype[kTypeCount][2] = {
 // Reference functions
 //-----------------------------------------
 
-void refselect_1i8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i8(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y, *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_char*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i) {
         d[i] = m[i] ? y[i] : x[i];
     }
 }
 
-void refselect_1u8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u8(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y;
-    cl_char *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_char*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i) {
         d[i] = m[i] ? y[i] : x[i];
     }
 }
 
-void refselect_1i16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i16(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y, *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_short*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_short *const m = (cl_short *)cmp;
 
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u16(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y;
-    cl_short *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*) src1;
-    y = (cl_ushort*) src2;
-    m = (cl_short*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_short *const m = (cl_short *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i32(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i32(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y, *m;
-    d = (cl_int*)dest;
-    x = (cl_int*)src1;
-    y = (cl_int*)src2;
-    m = (cl_int*)cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u32(void *dest, void *src1, void *src2, void *cmp, size_t count){
+void refselect_1u32(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_int *m;
-    d = (cl_uint*)dest;
-    x = (cl_uint*)src1;
-    y = (cl_uint*)src2;
-    m = (cl_int*)cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i64(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y, *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_long*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u64(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_long *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_long*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i8u(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y;
-    cl_uchar *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_uchar*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u8u(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y, *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_uchar*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i16u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y;
-    cl_ushort *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_ushort*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u16u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y, *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*) src1;
-    y = (cl_ushort*) src2;
-    m = (cl_ushort*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i32u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y;
-    cl_uint *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_uint*) cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u32u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y, *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i64u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_ulong*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u64u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y, *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_ulong*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ffi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_hhi(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
+    size_t i;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_short *const m = (cl_short *)cmp;
+    for (i = 0; i < count; ++i) d[i] = m[i] ? y[i] : x[i];
+}
+
+void refselect_hhu(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y;
-    cl_int *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_int*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
+    for (i = 0; i < count; ++i) d[i] = m[i] ? y[i] : x[i];
+}
+
+void refselect_ffi(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
+    size_t i;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ffu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_ffu(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_uint *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ddi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_ddi(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_long *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_long*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ddu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_ddu(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_ulong*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void vrefselect_1i8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i8(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y, *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_char*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80) ? y[i] : x[i];
 }
 
-void vrefselect_1u8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u8(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y;
-    cl_char *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_char*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80) ? y[i] : x[i];
 }
 
-void vrefselect_1i16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i16(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y, *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_short*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_short *const m = (cl_short *)cmp;
 
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000) ? y[i] : x[i];
 }
 
-void vrefselect_1u16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u16(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y;
-    cl_short *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*)src1;
-    y = (cl_ushort*)src2;
-    m = (cl_short*)cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_short *const m = (cl_short *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000) ? y[i] : x[i];
 }
 
-void vrefselect_1i32(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i32(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y, *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_int*) cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000) ? y[i] : x[i];
 }
 
-void vrefselect_1u32(void *dest, void *src1, void *src2, void *cmp, size_t count){
+void vrefselect_1u32(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_int *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_int*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_int *const m = (cl_int *)cmp;
 
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000) ? y[i] : x[i];
 }
 
-void vrefselect_1i64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i64(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y, *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_long*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000LL) ? y[i] : x[i];
 }
 
-void vrefselect_1u64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u64(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_long *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_long*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000LL) ? y[i] : x[i];
 }
 
-void vrefselect_1i8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i8u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y;
-    cl_uchar *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_uchar*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80U) ? y[i] : x[i];
 }
 
-void vrefselect_1u8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u8u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y, *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_uchar*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80U) ? y[i] : x[i];
 }
 
-void vrefselect_1i16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i16u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y;
-    cl_ushort *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_ushort*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000U) ? y[i] : x[i];
 }
 
-void vrefselect_1u16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u16u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y, *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*) src1;
-    y = (cl_ushort*) src2;
-    m = (cl_ushort*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000U) ? y[i] : x[i];
 }
 
-void vrefselect_1i32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i32u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y;
-    cl_uint *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_uint*) cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000U) ? y[i] : x[i];
 }
 
-void vrefselect_1u32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u32u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y, *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000U) ? y[i] : x[i];
 }
 
-void vrefselect_1i64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i64u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_ulong*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000ULL) ? y[i] : x[i];
 }
 
-void vrefselect_1u64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u64u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y, *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_ulong*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000ULL) ? y[i] : x[i];
 }
 
-void vrefselect_ffi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_hhi(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
+    size_t i;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_short *const m = (cl_short *)cmp;
+    for (i = 0; i < count; ++i) d[i] = (m[i] & 0x8000) ? y[i] : x[i];
+}
+
+void vrefselect_hhu(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
+    size_t i;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
+    for (i = 0; i < count; ++i) d[i] = (m[i] & 0x8000U) ? y[i] : x[i];
+}
+
+void vrefselect_ffi(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_int *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_int*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000) ? y[i] : x[i];
 }
 
-void vrefselect_ffu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_ffu(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_uint *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000U) ? y[i] : x[i];
 }
 
-void vrefselect_ddi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_ddi(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_long *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_long*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000LL) ? y[i] : x[i];
 }
 
-void vrefselect_ddu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_ddu(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_ulong*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000ULL) ? y[i] : x[i];
 }
 
 // Define refSelects
-Select refSelects[kTypeCount][2] =  {
-    { refselect_1u8u,  refselect_1u8  }, // cl_uchar
-    { refselect_1i8u,  refselect_1i8  }, // char
+Select refSelects[kTypeCount][2] = {
+    { refselect_1u8u, refselect_1u8 }, // cl_uchar
+    { refselect_1i8u, refselect_1i8 }, // char
     { refselect_1u16u, refselect_1u16 }, // ushort
     { refselect_1i16u, refselect_1i16 }, // short
+    { refselect_hhu, refselect_hhi }, // half
     { refselect_1u32u, refselect_1u32 }, // uint
     { refselect_1i32u, refselect_1i32 }, // int
-    { refselect_ffu,   refselect_ffi  }, // float
+    { refselect_ffu, refselect_ffi }, // float
     { refselect_1u64u, refselect_1u64 }, // ulong
     { refselect_1i64u, refselect_1i64 }, // long
-    { refselect_ddu,   refselect_ddi }   // double
+    { refselect_ddu, refselect_ddi } // double
 };
 
 // Define vrefSelects (vector refSelects)
-Select vrefSelects[kTypeCount][2] =  {
-    { vrefselect_1u8u,  vrefselect_1u8  }, // cl_uchar
-    { vrefselect_1i8u,  vrefselect_1i8  }, // char
+Select vrefSelects[kTypeCount][2] = {
+    { vrefselect_1u8u, vrefselect_1u8 }, // cl_uchar
+    { vrefselect_1i8u, vrefselect_1i8 }, // char
     { vrefselect_1u16u, vrefselect_1u16 }, // ushort
     { vrefselect_1i16u, vrefselect_1i16 }, // short
+    { vrefselect_hhu, vrefselect_hhi }, // half
     { vrefselect_1u32u, vrefselect_1u32 }, // uint
     { vrefselect_1i32u, vrefselect_1i32 }, // int
-    { vrefselect_ffu,   vrefselect_ffi  }, // float
+    { vrefselect_ffu, vrefselect_ffi }, // float
     { vrefselect_1u64u, vrefselect_1u64 }, // ulong
     { vrefselect_1i64u, vrefselect_1i64 }, // long
-    { vrefselect_ddu,   vrefselect_ddi  }     // double
+    { vrefselect_ddu, vrefselect_ddi } // double
 };
 
 
 //-----------------------------------------
 // Check functions
 //-----------------------------------------
-size_t check_uchar(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_uchar *t = (const cl_uchar *) test;
-    const cl_uchar *c = (const cl_uchar *) correct;
+size_t check_uchar(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_uchar *const t = (const cl_uchar *)test;
+    const cl_uchar *const c = (const cl_uchar *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -576,9 +658,11 @@ size_t check_uchar(void *test, void *correct, size_t count, size_t vector_size)
     return 0;
 }
 
-size_t check_char(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_char *t = (const cl_char *) test;
-    const cl_char *c = (const cl_char *) correct;
+size_t check_char(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_char *const t = (const cl_char *)test;
+    const cl_char *const c = (const cl_char *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -597,9 +681,11 @@ size_t check_char(void *test, void *correct, size_t count, size_t vector_size) {
     return 0;
 }
 
-size_t check_ushort(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_ushort *t = (const cl_ushort *) test;
-    const cl_ushort *c = (const cl_ushort *) correct;
+size_t check_ushort(const void *const test, const void *const correct,
+                    size_t count, size_t vector_size)
+{
+    const cl_ushort *const t = (const cl_ushort *)test;
+    const cl_ushort *const c = (const cl_ushort *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -618,9 +704,11 @@ size_t check_ushort(void *test, void *correct, size_t count, size_t vector_size)
     return 0;
 }
 
-size_t check_short(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_short *t = (const cl_short *) test;
-    const cl_short *c = (const cl_short *) correct;
+size_t check_short(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_short *const t = (const cl_short *)test;
+    const cl_short *const c = (const cl_short *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -639,9 +727,11 @@ size_t check_short(void *test, void *correct, size_t count, size_t vector_size)
     return 0;
 }
 
-size_t check_uint(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_uint *t = (const cl_uint *) test;
-    const cl_uint *c = (const cl_uint *) correct;
+size_t check_uint(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_uint *const t = (const cl_uint *)test;
+    const cl_uint *const c = (const cl_uint *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -660,9 +750,11 @@ size_t check_uint(void *test, void *correct, size_t count, size_t vector_size) {
     return 0;
 }
 
-size_t check_int(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_int *t = (const cl_int *) test;
-    const cl_int *c = (const cl_int *) correct;
+size_t check_int(const void *const test, const void *const correct,
+                 size_t count, size_t vector_size)
+{
+    const cl_int *const t = (const cl_int *)test;
+    const cl_int *const c = (const cl_int *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -682,9 +774,11 @@ size_t check_int(void *test, void *correct, size_t count, size_t vector_size) {
     return 0;
 }
 
-size_t check_ulong(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_ulong *t = (const cl_ulong *) test;
-    const cl_ulong *c = (const cl_ulong *) correct;
+size_t check_ulong(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_ulong *const t = (const cl_ulong *)test;
+    const cl_ulong *const c = (const cl_ulong *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -703,9 +797,11 @@ size_t check_ulong(void *test, void *correct, size_t count, size_t vector_size)
     return 0;
 }
 
-size_t check_long(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_long *t = (const cl_long *) test;
-    const cl_long *c = (const cl_long *) correct;
+size_t check_long(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_long *const t = (const cl_long *)test;
+    const cl_long *const c = (const cl_long *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -724,9 +820,36 @@ size_t check_long(void *test, void *correct, size_t count, size_t vector_size) {
     return 0;
 }
 
-size_t check_float( void *test, void *correct, size_t count, size_t vector_size ) {
-    const cl_uint *t = (const cl_uint *) test;
-    const cl_uint *c = (const cl_uint *) correct;
+size_t check_half(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_ushort *const t = (const cl_ushort *)test;
+    const cl_ushort *const c = (const cl_ushort *)correct;
+    // Returns 0 when test matches correct, else 1 + index of first mismatch.
+    // Two NaNs (half bits with sign cleared > 0x7c00) always count as equal.
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (size_t i = 0; i < count; i++) /* NaNs may differ bitwise */
+            if ((t[i] != c[i])
+                && !(((c[i] & 0x7fff) > 0x7c00)
+                     && ((t[i] & 0x7fff) > 0x7c00)))
+            {
+                log_error("\n(check_half) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%4.4x vs 0x%4.4x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
+
+    return 0;
+}
+
+size_t check_float(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_uint *const t = (const cl_uint *)test;
+    const cl_uint *const c = (const cl_uint *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -746,9 +869,11 @@ size_t check_float( void *test, void *correct, size_t count, size_t vector_size
     return 0;
 }
 
-size_t check_double( void *test, void *correct, size_t count, size_t vector_size ) {
-    const cl_ulong *t = (const cl_ulong *) test;
-    const cl_ulong *c = (const cl_ulong *) correct;
+size_t check_double(const void *const test, const void *const correct,
+                    size_t count, size_t vector_size)
+{
+    const cl_ulong *const t = (const cl_ulong *)test;
+    const cl_ulong *const c = (const cl_ulong *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -770,5 +895,7 @@ size_t check_double( void *test, void *correct, size_t count, size_t vector_size
 }
 
 CheckResults checkResults[kTypeCount] = {
-    check_uchar, check_char, check_ushort, check_short, check_uint,
-    check_int, check_float, check_ulong, check_long, check_double };
+    check_uchar, check_char, check_ushort, check_short,
+    check_half,  check_uint, check_int,    check_float,
+    check_ulong, check_long, check_double
+};

From 2495eca9fa89fcfadb3bcca7fda61b9f20b1f4e3 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 27 Jun 2023 17:42:02 +0200
Subject: [PATCH 11/20] Added cl_khr_fp16 extension support for test_commonfns
 (#1695)

* Added cl_khr_fp16 extension support for commonfns test (issue #142, commonfns)

* Added missing header due to presubmit check

* Corrected radians/degrees ulp calculations + cosmetic fixes

* Corrected presubmit code format

* Corrections related to code review

* Moved string format helper to test_common in separate header

* Added clang format for last commit

* Corrections related to code review

* Modified mix verification procedure for half type to only report max error

* Removed redundant condition for logging mix verification

* Corrected generator limits for half tests
---
 test_common/harness/stringHelpers.h           |   6 +-
 test_conformance/commonfns/main.cpp           |  43 +++--
 test_conformance/commonfns/test_base.h        |  86 +++++++---
 test_conformance/commonfns/test_binary_fn.cpp |  74 +++++++--
 test_conformance/commonfns/test_clamp.cpp     |  83 ++++++++--
 test_conformance/commonfns/test_mix.cpp       | 120 +++++++++-----
 .../commonfns/test_smoothstep.cpp             | 123 +++++++++-----
 test_conformance/commonfns/test_step.cpp      |  61 ++++---
 test_conformance/commonfns/test_unary_fn.cpp  | 155 +++++++++++-------
 .../relationals/test_comparisons_fp.cpp       |  33 +---
 10 files changed, 526 insertions(+), 258 deletions(-)

diff --git a/test_common/harness/stringHelpers.h b/test_common/harness/stringHelpers.h
index 3f6bf64db4..a02624d6da 100644
--- a/test_common/harness/stringHelpers.h
+++ b/test_common/harness/stringHelpers.h
@@ -14,8 +14,8 @@
 // limitations under the License.
 //
 
-#ifndef BASIC_UTILS_H
-#define BASIC_UTILS_H
+#ifndef STRING_HELPERS_H
+#define STRING_HELPERS_H
 
 #include <memory>
 #include <string>
@@ -38,4 +38,4 @@ inline std::string str_sprintf(const std::string &str, Args... args)
     return std::string(buffer.get(), buffer.get() + s - 1);
 }
 
-#endif // BASIC_UTIL_H
+#endif // STRING_HELPERS_H
diff --git a/test_conformance/commonfns/main.cpp b/test_conformance/commonfns/main.cpp
index 3e4b0b8e76..645d3f703c 100644
--- a/test_conformance/commonfns/main.cpp
+++ b/test_conformance/commonfns/main.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -18,8 +18,10 @@
 #include <string.h>
 #include "procs.h"
 #include "test_base.h"
+#include "harness/kernelHelpers.h"
 
 std::map<size_t, std::string> BaseFunctionTest::type2name;
+cl_half_rounding_mode BaseFunctionTest::halfRoundingMode = CL_HALF_RTE;
 
 int g_arrVecSizes[kVectorSizeCount + kStrangeVectorSizeCount];
 int g_arrStrangeVectorSizes[kStrangeVectorSizeCount] = {3};
@@ -45,17 +47,38 @@ test_definition test_list[] = {
 
 const int test_num = ARRAY_SIZE( test_list );
 
-int main(int argc, const char *argv[])
+test_status InitCL(cl_device_id device)
 {
-    initVecSizes();
-
-    if (BaseFunctionTest::type2name.empty())
+    if (is_extension_available(device, "cl_khr_fp16"))
     {
-        BaseFunctionTest::type2name[sizeof(half)] = "half";
-        BaseFunctionTest::type2name[sizeof(float)] = "float";
-        BaseFunctionTest::type2name[sizeof(double)] = "double";
+        const cl_device_fp_config fpConfigHalf =
+            get_default_rounding_mode(device, CL_DEVICE_HALF_FP_CONFIG);
+        if ((fpConfigHalf & CL_FP_ROUND_TO_NEAREST) != 0)
+        {
+            BaseFunctionTest::halfRoundingMode = CL_HALF_RTE;
+        }
+        else if ((fpConfigHalf & CL_FP_ROUND_TO_ZERO) != 0)
+        {
+            BaseFunctionTest::halfRoundingMode = CL_HALF_RTZ;
+        }
+        else // neither round-to-nearest nor round-to-zero was reported
+        {
+            log_error("Error while acquiring half rounding mode\n");
+            return TEST_FAIL;
+        }
     }
 
-    return runTestHarness(argc, argv, test_num, test_list, false, 0);
+    return TEST_PASS;
 }
 
+int main(int argc, const char *argv[])
+{
+    initVecSizes();
+
+    BaseFunctionTest::type2name[sizeof(half)] = "half";
+    BaseFunctionTest::type2name[sizeof(float)] = "float";
+    BaseFunctionTest::type2name[sizeof(double)] = "double";
+
+    return runTestHarnessWithCheck(argc, argv, test_num, test_list, false, 0,
+                                   InitCL);
+}
diff --git a/test_conformance/commonfns/test_base.h b/test_conformance/commonfns/test_base.h
index 4429104263..be36ed264b 100644
--- a/test_conformance/commonfns/test_base.h
+++ b/test_conformance/commonfns/test_base.h
@@ -19,27 +19,23 @@
 #include <vector>
 #include <map>
 #include <memory>
+#include <cmath>
 
 #include <CL/cl_half.h>
 #include <CL/cl_ext.h>
 
-#include "harness/deviceInfo.h"
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
 
-
 template <typename T>
 using VerifyFuncBinary = int (*)(const T *const, const T *const, const T *const,
                                  const int num, const int vs, const int vp);
 
-
 template <typename T>
 using VerifyFuncUnary = int (*)(const T *const, const T *const, const int num);
 
-
 using half = cl_half;
 
-
 struct BaseFunctionTest
 {
     BaseFunctionTest(cl_device_id device, cl_context context,
@@ -61,9 +57,9 @@ struct BaseFunctionTest
     bool vecParam;
 
     static std::map<size_t, std::string> type2name;
+    static cl_half_rounding_mode halfRoundingMode;
 };
 
-
 struct MinTest : BaseFunctionTest
 {
     MinTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -74,7 +70,6 @@ struct MinTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct MaxTest : BaseFunctionTest
 {
     MaxTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -85,7 +80,6 @@ struct MaxTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct ClampTest : BaseFunctionTest
 {
     ClampTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -96,7 +90,6 @@ struct ClampTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct DegreesTest : BaseFunctionTest
 {
     DegreesTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -107,7 +100,6 @@ struct DegreesTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct RadiansTest : BaseFunctionTest
 {
     RadiansTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -118,7 +110,6 @@ struct RadiansTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct SignTest : BaseFunctionTest
 {
     SignTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -129,7 +120,6 @@ struct SignTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct SmoothstepTest : BaseFunctionTest
 {
     SmoothstepTest(cl_device_id device, cl_context context,
@@ -141,7 +131,6 @@ struct SmoothstepTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct StepTest : BaseFunctionTest
 {
     StepTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -152,7 +141,6 @@ struct StepTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct MixTest : BaseFunctionTest
 {
     MixTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -163,19 +151,71 @@ struct MixTest : BaseFunctionTest
     cl_int Run() override;
 };
 
+template <typename T> float UlpFn(const T &val, const double &r)
+{
+    if (std::is_same<T, half>::value)
+    {
+        return Ulp_Error_Half(val, r);
+    }
+    else if (std::is_same<T, float>::value)
+    {
+        return Ulp_Error(val, r);
+    }
+    else if (std::is_same<T, double>::value)
+    {
+        return Ulp_Error_Double(val, r);
+    }
+    else
+    {
+        log_error("UlpFn: unsupported data type\n");
+    }
+
+    return -1.f; // wrong val
+}
+
+template <typename T> inline double conv_to_dbl(const T &val)
+{
+    if (std::is_same<T, half>::value)
+        return (double)cl_half_to_float(val);
+    else
+        return (double)val;
+}
 
-template <typename... Args>
-std::string string_format(const std::string &format, Args... args)
+template <typename T> inline double conv_to_flt(const T &val)
 {
-    int sformat = std::snprintf(nullptr, 0, format.c_str(), args...) + 1;
-    if (sformat <= 0)
-        throw std::runtime_error("string_format: string processing error.");
-    auto format_size = static_cast<size_t>(sformat);
-    std::unique_ptr<char[]> buffer(new char[format_size]);
-    std::snprintf(buffer.get(), format_size, format.c_str(), args...);
-    return std::string(buffer.get(), buffer.get() + format_size - 1);
+    if (std::is_same<T, half>::value)
+        return (float)cl_half_to_float(val);
+    else
+        return (float)val;
 }
 
+template <typename T> inline half conv_to_half(const T &val)
+{
+    if (std::is_floating_point<T>::value)
+        return cl_half_from_float(val, BaseFunctionTest::halfRoundingMode);
+    return 0;
+}
+
+template <typename T> bool isfinite_fp(const T &v)
+{
+    if (std::is_same<T, half>::value)
+    {
+        // Split the raw FP16 bit pattern into exponent and mantissa fields
+        uint16_t h_exp = (((half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
+        uint16_t h_mant = ((half)v) & 0x3FF;
+
+        // NOTE(review): rejects only +/-Inf; a NaN is reported as finite
+        return !(h_exp == 0x1F && h_mant == 0);
+    }
+    else
+    {
+#if !defined(_WIN32)
+        return std::isfinite(v);
+#else
+        return isfinite(v);
+#endif
+    }
+}
 
 template <class T>
 int MakeAndRunTest(cl_device_id device, cl_context context,
diff --git a/test_conformance/commonfns/test_binary_fn.cpp b/test_conformance/commonfns/test_binary_fn.cpp
index 1eb12f730f..a6c75647d0 100644
--- a/test_conformance/commonfns/test_binary_fn.cpp
+++ b/test_conformance/commonfns/test_binary_fn.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -22,6 +22,7 @@
 
 #include "harness/deviceInfo.h"
 #include "harness/typeWrappers.h"
+#include "harness/stringHelpers.h"
 
 #include "procs.h"
 #include "test_base.h"
@@ -53,7 +54,6 @@ const char *binary_fn_code_pattern_v3_scalar =
 "    vstore3(%s(vload3(tid,x), y[tid] ), tid, dst);\n"
 "}\n";
 
-
 template <typename T>
 int test_binary_fn(cl_device_id device, cl_context context,
                    cl_command_queue queue, int n_elems,
@@ -105,6 +105,16 @@ int test_binary_fn(cl_device_id device, cl_context context,
             input_ptr[1][j] = get_random_double(-0x20000000, 0x20000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        const float fval = CL_HALF_MAX;
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (int j = 0; j < num_elements; j++)
+        {
+            input_ptr[0][j] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[1][j] = conv_to_half(get_random_float(-fval, fval, d));
+        }
+    }
 
     for (i = 0; i < 2; i++)
     {
@@ -125,22 +135,22 @@ int test_binary_fn(cl_device_id device, cl_context context,
             {
                 std::string str = binary_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), fnName.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), fnName.c_str());
             }
             else
             {
                 std::string str = binary_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), fnName.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), fnName.c_str());
             }
         }
         else
         {
             // do regular
             std::string str = binary_fn_code_pattern;
-            kernelSource = string_format(
+            kernelSource = str_sprintf(
                 str, pragma_str.c_str(), tname.c_str(), vecSizeNames[i],
                 tname.c_str(), vecSecParam ? vecSizeNames[i] : "",
                 tname.c_str(), vecSizeNames[i], fnName.c_str());
@@ -203,13 +213,20 @@ int max_verify(const T* const x, const T* const y, const T* const out,
         {
             int k = i * vecSize + j;
             int l = (k * vecParam + i * (1 - vecParam));
-            T v = (x[k] < y[l]) ? y[l] : x[k];
+            T v = (conv_to_dbl(x[k]) < conv_to_dbl(y[l])) ? y[l] : x[k];
             if (v != out[k])
             {
-                log_error(
-                    "x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. (index %d is "
-                    "vector %d, element %d, for vector size %d)\n",
-                    k, x[k], l, y[l], k, out[k], v, k, i, j, vecSize);
+                // Print all operands through conv_to_dbl so one call serves
+                // every T: float and double widen losslessly, and a half is
+                // decoded from its 16-bit pattern first. This includes the
+                // expected value v, which must not reach %g as a raw
+                // integer when T = half.
+                log_error("x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. "
+                          "(index %d is "
+                          "vector %d, element %d, for vector size %d)\n",
+                          k, conv_to_dbl(x[k]), l, conv_to_dbl(y[l]), k,
+                          conv_to_dbl(out[k]), conv_to_dbl(v), k, i, j,
+                          vecSize);
                 return -1;
             }
         }
@@ -227,13 +244,20 @@ int min_verify(const T* const x, const T* const y, const T* const out,
         {
             int k = i * vecSize + j;
             int l = (k * vecParam + i * (1 - vecParam));
-            T v = (x[k] > y[l]) ? y[l] : x[k];
+            T v = (conv_to_dbl(x[k]) > conv_to_dbl(y[l])) ? y[l] : x[k];
             if (v != out[k])
             {
-                log_error(
-                    "x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. (index %d is "
-                    "vector %d, element %d, for vector size %d)\n",
-                    k, x[k], l, y[l], k, out[k], v, k, i, j, vecSize);
+                // Print all operands through conv_to_dbl so one call serves
+                // every T: float and double widen losslessly, and a half is
+                // decoded from its 16-bit pattern first. This includes the
+                // expected value v, which must not reach %g as a raw
+                // integer when T = half.
+                log_error("x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. "
+                          "(index %d is "
+                          "vector %d, element %d, for vector size %d)\n",
+                          k, conv_to_dbl(x[k]), l, conv_to_dbl(y[l]), k,
+                          conv_to_dbl(out[k]), conv_to_dbl(v), k, i, j,
+                          vecSize);
                 return -1;
             }
         }
@@ -246,6 +270,13 @@ int min_verify(const T* const x, const T* const y, const T* const out,
 cl_int MaxTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_binary_fn<cl_half>(device, context, queue, num_elems,
+                                        fnName.c_str(), vecParam,
+                                        max_verify<cl_half>);
+        test_error(error, "MaxTest::Run<cl_half> failed");
+    }
 
     error = test_binary_fn<float>(device, context, queue, num_elems,
                                   fnName.c_str(), vecParam, max_verify<float>);
@@ -265,6 +296,13 @@ cl_int MaxTest::Run()
 cl_int MinTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_binary_fn<cl_half>(device, context, queue, num_elems,
+                                        fnName.c_str(), vecParam,
+                                        min_verify<cl_half>);
+        test_error(error, "MinTest::Run<cl_half> failed");
+    }
 
     error = test_binary_fn<float>(device, context, queue, num_elems,
                                   fnName.c_str(), vecParam, min_verify<float>);
diff --git a/test_conformance/commonfns/test_clamp.cpp b/test_conformance/commonfns/test_clamp.cpp
index 0e96fb6027..1bf4067705 100644
--- a/test_conformance/commonfns/test_clamp.cpp
+++ b/test_conformance/commonfns/test_clamp.cpp
@@ -26,12 +26,10 @@
 #include "procs.h"
 #include "test_base.h"
 
-
 #ifndef M_PI
 #define M_PI 3.14159265358979323846264338327950288
 #endif
 
-
 #define CLAMP_KERNEL(type)                                                     \
     const char *clamp_##type##_kernel_code = EMIT_PRAGMA_DIRECTIVE             \
         "__kernel void test_clamp(__global " #type " *x, __global " #type      \
@@ -64,6 +62,14 @@
         "vload3(tid,maxval)), tid, dst);\n"                                    \
         "}\n";
 
+#define EMIT_PRAGMA_DIRECTIVE "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
+CLAMP_KERNEL(half)
+CLAMP_KERNEL_V(half, 2)
+CLAMP_KERNEL_V(half, 4)
+CLAMP_KERNEL_V(half, 8)
+CLAMP_KERNEL_V(half, 16)
+CLAMP_KERNEL_V3(half, 3)
+#undef EMIT_PRAGMA_DIRECTIVE
 
 #define EMIT_PRAGMA_DIRECTIVE " "
 CLAMP_KERNEL(float)
@@ -83,6 +89,10 @@ CLAMP_KERNEL_V(double, 16)
 CLAMP_KERNEL_V3(double, 3)
 #undef EMIT_PRAGMA_DIRECTIVE
 
+const char *clamp_half_codes[] = {
+    clamp_half_kernel_code,  clamp_half2_kernel_code,  clamp_half4_kernel_code,
+    clamp_half8_kernel_code, clamp_half16_kernel_code, clamp_half3_kernel_code
+};
 const char *clamp_float_codes[] = {
     clamp_float_kernel_code,   clamp_float2_kernel_code,
     clamp_float4_kernel_code,  clamp_float8_kernel_code,
@@ -96,21 +106,42 @@ const char *clamp_double_codes[] = {
 
 namespace {
 
-
 template <typename T>
 int verify_clamp(const T *const x, const T *const minval, const T *const maxval,
                  const T *const outptr, int n)
 {
-    T t;
-    for (int i = 0; i < n; i++)
+    if (std::is_same<T, half>::value)
+    {
+        float t;
+        for (int i = 0; i < n; i++)
+        {
+            t = std::min(
+                std::max(cl_half_to_float(x[i]), cl_half_to_float(minval[i])),
+                cl_half_to_float(maxval[i]));
+            if (t != cl_half_to_float(outptr[i]))
+            {
+                log_error(
+                    "%d) verification error: clamp( %a, %a, %a) = *%a vs. %a\n",
+                    i, cl_half_to_float(x[i]), cl_half_to_float(minval[i]),
+                    cl_half_to_float(maxval[i]), t,
+                    cl_half_to_float(outptr[i]));
+                return -1;
+            }
+        }
+    }
+    else
     {
-        t = std::min(std::max(x[i], minval[i]), maxval[i]);
-        if (t != outptr[i])
+        T t;
+        for (int i = 0; i < n; i++)
         {
-            log_error(
-                "%d) verification error: clamp( %a, %a, %a) = *%a vs. %a\n", i,
-                x[i], minval[i], maxval[i], t, outptr[i]);
-            return -1;
+            t = std::min(std::max(x[i], minval[i]), maxval[i]);
+            if (t != outptr[i])
+            {
+                log_error(
+                    "%d) verification error: clamp( %a, %a, %a) = *%a vs. %a\n",
+                    i, x[i], minval[i], maxval[i], t, outptr[i]);
+                return -1;
+            }
         }
     }
 
@@ -118,7 +149,6 @@ int verify_clamp(const T *const x, const T *const minval, const T *const maxval,
 }
 }
 
-
 template <typename T>
 int test_clamp_fn(cl_device_id device, cl_context context,
                   cl_command_queue queue, int n_elems)
@@ -169,6 +199,17 @@ int test_clamp_fn(cl_device_id device, cl_context context,
             input_ptr[2][j] = get_random_double(input_ptr[1][j], 0x20000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        const float fval = CL_HALF_MAX;
+        for (j = 0; j < num_elements; j++)
+        {
+            input_ptr[0][j] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[1][j] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[2][j] = conv_to_half(
+                get_random_float(conv_to_flt(input_ptr[1][j]), fval, d));
+        }
+    }
 
     for (i = 0; i < 3; i++)
     {
@@ -194,9 +235,16 @@ int test_clamp_fn(cl_device_id device, cl_context context,
                 "test_clamp");
             test_error(err, "Unable to create kernel");
         }
+        else if (std::is_same<T, half>::value)
+        {
+            err = create_single_kernel_helper(
+                context, &programs[i], &kernels[i], 1, &clamp_half_codes[i],
+                "test_clamp");
+            test_error(err, "Unable to create kernel");
+        }
 
-        log_info("Just made a program for float, i=%d, size=%d, in slot %d\n",
-                 i, g_arrVecSizes[i], i);
+        log_info("Just made a program for %s, i=%d, size=%d, in slot %d\n",
+                 tname.c_str(), i, g_arrVecSizes[i], i);
         fflush(stdout);
 
         for (j = 0; j < 4; j++)
@@ -239,10 +287,14 @@ int test_clamp_fn(cl_device_id device, cl_context context,
     return err;
 }
 
-
 cl_int ClampTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_clamp_fn<cl_half>(device, context, queue, num_elems);
+        test_error(error, "ClampTest::Run<cl_half> failed");
+    }
 
     error = test_clamp_fn<float>(device, context, queue, num_elems);
     test_error(error, "ClampTest::Run<float> failed");
@@ -256,7 +308,6 @@ cl_int ClampTest::Run()
     return error;
 }
 
-
 int test_clamp(cl_device_id device, cl_context context, cl_command_queue queue,
                int n_elems)
 {
diff --git a/test_conformance/commonfns/test_mix.cpp b/test_conformance/commonfns/test_mix.cpp
index 92c1010050..2a06e43df6 100644
--- a/test_conformance/commonfns/test_mix.cpp
+++ b/test_conformance/commonfns/test_mix.cpp
@@ -18,6 +18,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include "harness/stringHelpers.h"
+
 #include "procs.h"
 #include "test_base.h"
 
@@ -52,33 +54,42 @@ const char *mix_fn_code_pattern_v3_scalar =
     "    vstore3(mix(vload3(tid, x), vload3(tid, y), a[tid]), tid, dst);\n"
     "}\n";
 
-
 #define MAX_ERR 1e-3
 
 namespace {
 
-
 template <typename T>
 int verify_mix(const T *const inptrX, const T *const inptrY,
                const T *const inptrA, const T *const outptr, const int n,
                const int veclen, const bool vecParam)
 {
-    T r;
-    float delta = 0.0f;
+    double r, o;
+    float delta = 0.f, max_delta = 0.f;
     int i;
 
     if (vecParam)
     {
         for (i = 0; i < n * veclen; i++)
         {
-            r = inptrX[i] + ((inptrY[i] - inptrX[i]) * inptrA[i]);
-            delta = fabs(double(r - outptr[i])) / r;
-            if (delta > MAX_ERR)
+            r = conv_to_dbl(inptrX[i])
+                + ((conv_to_dbl(inptrY[i]) - conv_to_dbl(inptrX[i]))
+                   * conv_to_dbl(inptrA[i]));
+
+            o = conv_to_dbl(outptr[i]);
+            delta = fabs(double(r - o)) / r;
+            if (!std::is_same<T, half>::value)
+            {
+                if (delta > MAX_ERR)
+                {
+                    log_error("%d) verification error: mix(%a, %a, %a) = *%a "
+                              "vs. %a\n",
+                              i, inptrX[i], inptrY[i], inptrA[i], r, outptr[i]);
+                    return -1;
+                }
+            }
+            else
             {
-                log_error(
-                    "%d) verification error: mix(%a, %a, %a) = *%a vs. %a\n", i,
-                    inptrX[i], inptrY[i], inptrA[i], r, outptr[i]);
-                return -1;
+                max_delta = std::max(max_delta, delta);
             }
         }
     }
@@ -90,25 +101,40 @@ int verify_mix(const T *const inptrX, const T *const inptrY,
             int vi = i * veclen;
             for (int j = 0; j < veclen; ++j, ++vi)
             {
-                r = inptrX[vi] + ((inptrY[vi] - inptrX[vi]) * inptrA[i]);
-                delta = fabs(double(r - outptr[vi])) / r;
-                if (delta > MAX_ERR)
+                r = conv_to_dbl(inptrX[vi])
+                    + ((conv_to_dbl(inptrY[vi]) - conv_to_dbl(inptrX[vi]))
+                       * conv_to_dbl(inptrA[i]));
+                delta = fabs(double(r - conv_to_dbl(outptr[vi]))) / r;
+                if (!std::is_same<T, half>::value)
                 {
-                    log_error("{%d, element %d}) verification error: mix(%a, "
-                              "%a, %a) = *%a vs. %a\n",
-                              ii, j, inptrX[vi], inptrY[vi], inptrA[i], r,
-                              outptr[vi]);
-                    return -1;
+                    if (delta > MAX_ERR)
+                    {
+                        log_error(
+                            "{%d, element %d}) verification error: mix(%a, "
+                            "%a, %a) = *%a vs. %a\n",
+                            ii, j, inptrX[vi], inptrY[vi], inptrA[i], r,
+                            outptr[vi]);
+                        return -1;
+                    }
+                }
+                else
+                {
+                    max_delta = std::max(max_delta, delta);
                 }
             }
         }
     }
 
+    // The accuracy of mix for cl_khr_fp16 is implementation-defined, so this
+    // test only reports the maximum error; it does not check the error
+    // against a threshold.
+    if (std::is_same<T, half>::value)
+        log_error("mix half verification result, max delta: %a\n", max_delta);
+
     return 0;
 }
 } // namespace
 
-
 template <typename T>
 int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
                 int n_elems, bool vecParam)
@@ -120,7 +146,7 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
     std::vector<clKernelWrapper> kernels;
 
     int err, i;
-    MTdataHolder d = MTdataHolder(gRandomSeed);
+    MTdataHolder d(gRandomSeed);
 
     assert(BaseFunctionTest::type2name.find(sizeof(T))
            != BaseFunctionTest::type2name.end());
@@ -142,19 +168,32 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
         test_error(err, "clCreateBuffer failed");
     }
 
-    for (i = 0; i < num_elements; i++)
-    {
-        input_ptr[0][i] = (T)genrand_real1(d);
-        input_ptr[1][i] = (T)genrand_real1(d);
-        input_ptr[2][i] = (T)genrand_real1(d);
-    }
-
     std::string pragma_str;
     if (std::is_same<T, double>::value)
     {
         pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
     }
 
+    if (std::is_same<T, half>::value)
+    {
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = conv_to_half((float)genrand_real1(d));
+            input_ptr[1][i] = conv_to_half((float)genrand_real1(d));
+            input_ptr[2][i] = conv_to_half((float)genrand_real1(d));
+        }
+    }
+    else
+    {
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = (T)genrand_real1(d);
+            input_ptr[1][i] = (T)genrand_real1(d);
+            input_ptr[2][i] = (T)genrand_real1(d);
+        }
+    }
+
     for (i = 0; i < 3; i++)
     {
         err = clEnqueueWriteBuffer(queue, streams[i], CL_TRUE, 0,
@@ -164,7 +203,6 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
     }
 
     char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
-
     for (i = 0; i < kTotalVecCount; i++)
     {
         std::string kernelSource;
@@ -174,15 +212,15 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
             {
                 std::string str = mix_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
             else
             {
                 std::string str = mix_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
         }
         else
@@ -190,10 +228,10 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
             // regular path
             std::string str = mix_fn_code_pattern;
             kernelSource =
-                string_format(str, pragma_str.c_str(), tname.c_str(),
-                              vecSizeNames[i], tname.c_str(), vecSizeNames[i],
-                              tname.c_str(), vecParam ? vecSizeNames[i] : "",
-                              tname.c_str(), vecSizeNames[i]);
+                str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                            vecSizeNames[i], tname.c_str(), vecSizeNames[i],
+                            tname.c_str(), vecParam ? vecSizeNames[i] : "",
+                            tname.c_str(), vecSizeNames[i]);
         }
         const char *programPtr = kernelSource.c_str();
         err =
@@ -242,10 +280,14 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
     return err;
 }
 
-
 cl_int MixTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_mix_fn<half>(device, context, queue, num_elems, vecParam);
+        test_error(error, "MixTest::Run<cl_half> failed");
+    }
 
     error = test_mix_fn<float>(device, context, queue, num_elems, vecParam);
     test_error(error, "MixTest::Run<float> failed");
@@ -260,7 +302,6 @@ cl_int MixTest::Run()
     return error;
 }
 
-
 int test_mix(cl_device_id device, cl_context context, cl_command_queue queue,
              int n_elems)
 {
@@ -268,7 +309,6 @@ int test_mix(cl_device_id device, cl_context context, cl_command_queue queue,
                                    true);
 }
 
-
 int test_mixf(cl_device_id device, cl_context context, cl_command_queue queue,
               int n_elems)
 {
diff --git a/test_conformance/commonfns/test_smoothstep.cpp b/test_conformance/commonfns/test_smoothstep.cpp
index 31948d3fe1..5afc2d0f22 100644
--- a/test_conformance/commonfns/test_smoothstep.cpp
+++ b/test_conformance/commonfns/test_smoothstep.cpp
@@ -18,10 +18,11 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include "harness/stringHelpers.h"
+
 #include "procs.h"
 #include "test_base.h"
 
-
 const char *smoothstep_fn_code_pattern =
     "%s\n" /* optional pragma */
     "__kernel void test_fn(__global %s%s *e0, __global %s%s *e1, __global %s%s "
@@ -53,38 +54,43 @@ const char *smoothstep_fn_code_pattern_v3_scalar =
     "    vstore3(smoothstep(e0[tid], e1[tid], vload3(tid,x)), tid, dst);\n"
     "}\n";
 
-
 #define MAX_ERR (1e-5f)
 
 namespace {
 
-
 template <typename T>
 int verify_smoothstep(const T *const edge0, const T *const edge1,
                       const T *const x, const T *const outptr, const int n,
                       const int veclen, const bool vecParam)
 {
-    T r, t;
-    float delta = 0;
+    double r, t;
+    float delta = 0, max_delta = 0;
 
     if (vecParam)
     {
         for (int i = 0; i < n * veclen; i++)
         {
-            t = (x[i] - edge0[i]) / (edge1[i] - edge0[i]);
-            if (t < 0.0f)
-                t = 0.0f;
-            else if (t > 1.0f)
-                t = 1.0f;
-            r = t * t * (3.0f - 2.0f * t);
-            delta = (float)fabs(r - outptr[i]);
-            if (delta > MAX_ERR)
+            t = (conv_to_dbl(x[i]) - conv_to_dbl(edge0[i]))
+                / (conv_to_dbl(edge1[i]) - conv_to_dbl(edge0[i]));
+            if (t < 0.0)
+                t = 0.0;
+            else if (t > 1.0)
+                t = 1.0;
+            r = t * t * (3.0 - 2.0 * t);
+            delta = (float)fabs(r - conv_to_dbl(outptr[i]));
+            if (!std::is_same<T, half>::value)
             {
-                log_error("%d) verification error: smoothstep(%a, %a, %a) = "
-                          "*%a vs. %a\n",
-                          i, x[i], edge0[i], edge1[i], r, outptr[i]);
-                return -1;
+                if (delta > MAX_ERR)
+                {
+                    log_error(
+                        "%d) verification error: smoothstep(%a, %a, %a) = "
+                        "*%a vs. %a\n",
+                        i, x[i], edge0[i], edge1[i], r, outptr[i]);
+                    return -1;
+                }
             }
+            else
+                max_delta = std::max(max_delta, delta);
         }
     }
     else
@@ -95,32 +101,48 @@ int verify_smoothstep(const T *const edge0, const T *const edge1,
             int vi = i * veclen;
             for (int j = 0; j < veclen; ++j, ++vi)
             {
-                t = (x[vi] - edge0[i]) / (edge1[i] - edge0[i]);
-                if (t < 0.0f)
-                    t = 0.0f;
-                else if (t > 1.0f)
-                    t = 1.0f;
-                r = t * t * (3.0f - 2.0f * t);
-                delta = (float)fabs(r - outptr[vi]);
-                if (delta > MAX_ERR)
+                t = (conv_to_dbl(x[vi]) - conv_to_dbl(edge0[i]))
+                    / (conv_to_dbl(edge1[i]) - conv_to_dbl(edge0[i]));
+                if (t < 0.0)
+                    t = 0.0;
+                else if (t > 1.0)
+                    t = 1.0;
+                r = t * t * (3.0 - 2.0 * t);
+                delta = (float)fabs(r - conv_to_dbl(outptr[vi]));
+
+                if (!std::is_same<T, half>::value)
                 {
-                    log_error("{%d, element %d}) verification error: "
-                              "smoothstep(%a, %a, %a) = *%a vs. %a\n",
-                              ii, j, x[vi], edge0[i], edge1[i], r, outptr[vi]);
-                    return -1;
+                    if (delta > MAX_ERR)
+                    {
+                        log_error("{%d, element %d}) verification error: "
+                                  "smoothstep(%a, %a, %a) = *%a vs. %a\n",
+                                  ii, j, x[vi], edge0[i], edge1[i], r,
+                                  outptr[vi]);
+                        return -1;
+                    }
                 }
+                else
+                    max_delta = std::max(max_delta, delta);
             }
         }
     }
+
+    // The accuracy of smoothstep for cl_khr_fp16 is implementation-defined,
+    // so this test only reports the maximum error; it does not check the
+    // error against a threshold.
+    if (std::is_same<T, half>::value)
+        log_error("smoothstep half verification result, max delta: %a\n",
+                  max_delta);
+
     return 0;
 }
 
 }
 
-
 template <typename T>
 int test_smoothstep_fn(cl_device_id device, cl_context context,
-                       cl_command_queue queue, int n_elems, bool vecParam)
+                       cl_command_queue queue, const int n_elems,
+                       const bool vecParam)
 {
     clMemWrapper streams[4];
     std::vector<T> input_ptr[3], output_ptr;
@@ -170,6 +192,17 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
             input_ptr[2][i] = get_random_double(-0x20000000, 0x20000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = conv_to_half(get_random_float(-65503, 65503, d));
+            input_ptr[1][i] = conv_to_half(
+                get_random_float(conv_to_flt(input_ptr[0][i]), 65503, d));
+            input_ptr[2][i] = conv_to_half(get_random_float(-65503, 65503, d));
+        }
+    }
 
     for (i = 0; i < 3; i++)
     {
@@ -179,7 +212,7 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
         test_error(err, "Unable to write input buffer");
     }
 
-    char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
+    const char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
 
     for (i = 0; i < kTotalVecCount; i++)
     {
@@ -190,15 +223,15 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
             {
                 std::string str = smoothstep_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
             else
             {
                 std::string str = smoothstep_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
         }
         else
@@ -206,11 +239,12 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
             // regular path
             std::string str = smoothstep_fn_code_pattern;
             kernelSource =
-                string_format(str, pragma_str.c_str(), tname.c_str(),
-                              vecParam ? vecSizeNames[i] : "", tname.c_str(),
-                              vecParam ? vecSizeNames[i] : "", tname.c_str(),
-                              vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
+                str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                            vecParam ? vecSizeNames[i] : "", tname.c_str(),
+                            vecParam ? vecSizeNames[i] : "", tname.c_str(),
+                            vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
         }
+
         const char *programPtr = kernelSource.c_str();
         err =
             create_single_kernel_helper(context, &programs[i], &kernels[i], 1,
@@ -259,10 +293,15 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
     return err;
 }
 
-
 cl_int SmoothstepTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_smoothstep_fn<half>(device, context, queue, num_elems,
+                                         vecParam);
+        test_error(error, "SmoothstepTest::Run<cl_half> failed");
+    }
 
     error =
         test_smoothstep_fn<float>(device, context, queue, num_elems, vecParam);
@@ -278,7 +317,6 @@ cl_int SmoothstepTest::Run()
     return error;
 }
 
-
 int test_smoothstep(cl_device_id device, cl_context context,
                     cl_command_queue queue, int n_elems)
 {
@@ -286,7 +324,6 @@ int test_smoothstep(cl_device_id device, cl_context context,
                                           "smoothstep", true);
 }
 
-
 int test_smoothstepf(cl_device_id device, cl_context context,
                      cl_command_queue queue, int n_elems)
 {
diff --git a/test_conformance/commonfns/test_step.cpp b/test_conformance/commonfns/test_step.cpp
index dc91766e90..1cfa96eabd 100644
--- a/test_conformance/commonfns/test_step.cpp
+++ b/test_conformance/commonfns/test_step.cpp
@@ -18,10 +18,11 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include "harness/stringHelpers.h"
+
 #include "procs.h"
 #include "test_base.h"
 
-
 const char *step_fn_code_pattern = "%s\n" /* optional pragma */
                                    "__kernel void test_fn(__global %s%s *edge, "
                                    "__global %s%s *x, __global %s%s *dst)\n"
@@ -48,7 +49,6 @@ const char *step_fn_code_pattern_v3_scalar =
     "    vstore3(step(edge[tid], vload3(tid,x)), tid, dst);\n"
     "}\n";
 
-
 namespace {
 
 template <typename T>
@@ -62,8 +62,8 @@ int verify_step(const T *const inptrA, const T *const inptrB,
     {
         for (int i = 0; i < n * veclen; i++)
         {
-            r = (inptrB[i] < inptrA[i]) ? 0.0 : 1.0;
-            if (r != outptr[i]) return -1;
+            r = (conv_to_dbl(inptrB[i]) < conv_to_dbl(inptrA[i])) ? 0.0 : 1.0;
+            if (r != conv_to_dbl(outptr[i])) return -1;
         }
     }
     else
@@ -73,24 +73,31 @@ int verify_step(const T *const inptrA, const T *const inptrB,
             int ii = i / veclen;
             for (int j = 0; j < veclen && i < n; ++j, ++i)
             {
-                r = (inptrB[i] < inptrA[ii]) ? 0.0f : 1.0f;
-                if (r != outptr[i])
+                r = (conv_to_dbl(inptrB[i]) < conv_to_dbl(inptrA[ii])) ? 0.0f
+                                                                       : 1.0f;
+                if (r != conv_to_dbl(outptr[i]))
                 {
-                    log_error("Failure @ {%d, element %d}: step(%a,%a) -> *%a "
-                              "vs %a\n",
-                              ii, j, inptrA[ii], inptrB[i], r, outptr[i]);
+                    if (std::is_same<T, half>::value)
+                        log_error(
+                            "Failure @ {%d, element %d}: step(%a,%a) -> *%a "
+                            "vs %a\n",
+                            ii, j, conv_to_flt(inptrA[ii]),
+                            conv_to_flt(inptrB[i]), r, conv_to_flt(outptr[i]));
+                    else
+                        log_error(
+                            "Failure @ {%d, element %d}: step(%a,%a) -> *%a "
+                            "vs %a\n",
+                            ii, j, inptrA[ii], inptrB[i], r, outptr[i]);
                     return -1;
                 }
             }
         }
     }
-
     return 0;
 }
 
 }
 
-
 template <typename T>
 int test_step_fn(cl_device_id device, cl_context context,
                  cl_command_queue queue, int n_elems, bool vecParam)
@@ -140,6 +147,16 @@ int test_step_fn(cl_device_id device, cl_context context,
             input_ptr[1][i] = get_random_double(-0x40000000, 0x40000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        const float fval = CL_HALF_MAX;
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[1][i] = conv_to_half(get_random_float(-fval, fval, d));
+        }
+    }
 
     for (i = 0; i < 2; i++)
     {
@@ -160,15 +177,15 @@ int test_step_fn(cl_device_id device, cl_context context,
             {
                 std::string str = step_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str());
             }
             else
             {
                 std::string str = step_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str());
             }
         }
         else
@@ -176,9 +193,9 @@ int test_step_fn(cl_device_id device, cl_context context,
             // regular path
             std::string str = step_fn_code_pattern;
             kernelSource =
-                string_format(str, pragma_str.c_str(), tname.c_str(),
-                              vecParam ? vecSizeNames[i] : "", tname.c_str(),
-                              vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
+                str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                            vecParam ? vecSizeNames[i] : "", tname.c_str(),
+                            vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
         }
         const char *programPtr = kernelSource.c_str();
         err =
@@ -229,10 +246,14 @@ int test_step_fn(cl_device_id device, cl_context context,
     return err;
 }
 
-
 cl_int StepTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_step_fn<half>(device, context, queue, num_elems, vecParam);
+        test_error(error, "StepTest::Run<cl_half> failed");
+    }
 
     error = test_step_fn<float>(device, context, queue, num_elems, vecParam);
     test_error(error, "StepTest::Run<float> failed");
@@ -247,7 +268,6 @@ cl_int StepTest::Run()
     return error;
 }
 
-
 int test_step(cl_device_id device, cl_context context, cl_command_queue queue,
               int n_elems)
 {
@@ -255,7 +275,6 @@ int test_step(cl_device_id device, cl_context context, cl_command_queue queue,
                                     true);
 }
 
-
 int test_stepf(cl_device_id device, cl_context context, cl_command_queue queue,
                int n_elems)
 {
diff --git a/test_conformance/commonfns/test_unary_fn.cpp b/test_conformance/commonfns/test_unary_fn.cpp
index fed4389d9c..91b5c215bf 100644
--- a/test_conformance/commonfns/test_unary_fn.cpp
+++ b/test_conformance/commonfns/test_unary_fn.cpp
@@ -21,6 +21,7 @@
 #include <vector>
 
 #include "harness/deviceInfo.h"
+#include "harness/stringHelpers.h"
 #include "harness/typeWrappers.h"
 
 #include "procs.h"
@@ -30,7 +31,6 @@
 #define M_PI 3.14159265358979323846264338327950288
 #endif
 
-
 // clang-format off
 const char *unary_fn_code_pattern =
 "%s\n" /* optional pragma */
@@ -51,23 +51,10 @@ const char *unary_fn_code_pattern_v3 =
 "}\n";
 // clang-format on
 
-
 #define MAX_ERR 2.0f
 
 namespace {
 
-
-template <typename T> float UlpFn(const T &val, const double &r)
-{
-    if (std::is_same<T, double>::value)
-        return Ulp_Error_Double(val, r);
-    else if (std::is_same<T, float>::value)
-        return Ulp_Error(val, r);
-    else if (std::is_same<T, half>::value)
-        return Ulp_Error(val, r);
-}
-
-
 template <typename T>
 int verify_degrees(const T *const inptr, const T *const outptr, int n)
 {
@@ -77,7 +64,11 @@ int verify_degrees(const T *const inptr, const T *const outptr, int n)
 
     for (int i = 0, j = 0; i < n; i++, j++)
     {
-        r = (180.0 / M_PI) * inptr[i];
+        r = (180.0 / M_PI) * conv_to_dbl(inptr[i]);
+
+        if (std::is_same<T, half>::value)
+            if (!isfinite_fp(conv_to_half(r)) && !isfinite_fp(outptr[i]))
+                continue;
 
         error = UlpFn(outptr[i], r);
 
@@ -88,21 +79,32 @@ int verify_degrees(const T *const inptr, const T *const outptr, int n)
             max_val = r;
             if (fabsf(error) > MAX_ERR)
             {
-                log_error("%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n",
-                          i, inptr[i], r, outptr[i], r, outptr[i], error);
+                if (std::is_same<T, half>::value)
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        conv_to_flt(inptr[i]), r, conv_to_flt(outptr[i]), r,
+                        conv_to_flt(outptr[i]), error);
+                else
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        inptr[i], r, outptr[i], r, outptr[i], error);
                 return 1;
             }
         }
     }
 
-    log_info("degrees: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
-             max_error, max_index, max_val, outptr[max_index], max_val,
-             outptr[max_index]);
+    if (std::is_same<T, half>::value)
+        log_info("degrees: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, conv_to_flt(outptr[max_index]),
+                 max_val, conv_to_flt(outptr[max_index]));
+    else
+        log_info("degrees: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, outptr[max_index], max_val,
+                 outptr[max_index]);
 
     return 0;
 }
 
-
 template <typename T>
 int verify_radians(const T *const inptr, const T *const outptr, int n)
 {
@@ -112,8 +114,14 @@ int verify_radians(const T *const inptr, const T *const outptr, int n)
 
     for (int i = 0, j = 0; i < n; i++, j++)
     {
-        r = (M_PI / 180.0) * inptr[i];
-        error = Ulp_Error(outptr[i], r);
+        r = (M_PI / 180.0) * conv_to_dbl(inptr[i]);
+
+        if (std::is_same<T, half>::value)
+            if (!isfinite_fp(conv_to_half(r)) && !isfinite_fp(outptr[i]))
+                continue;
+
+        error = UlpFn(outptr[i], r);
+
         if (fabsf(error) > max_error)
         {
             max_error = error;
@@ -121,41 +129,51 @@ int verify_radians(const T *const inptr, const T *const outptr, int n)
             max_val = r;
             if (fabsf(error) > MAX_ERR)
             {
-                log_error("%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n",
-                          i, inptr[i], r, outptr[i], r, outptr[i], error);
+                if (std::is_same<T, half>::value)
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        conv_to_flt(inptr[i]), r, conv_to_flt(outptr[i]), r,
+                        conv_to_flt(outptr[i]), error);
+                else
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        inptr[i], r, outptr[i], r, outptr[i], error);
                 return 1;
             }
         }
     }
 
-    log_info("radians: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
-             max_error, max_index, max_val, outptr[max_index], max_val,
-             outptr[max_index]);
+    if (std::is_same<T, half>::value)
+        log_info("radians: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, conv_to_flt(outptr[max_index]),
+                 max_val, conv_to_flt(outptr[max_index]));
+    else
+        log_info("radians: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, outptr[max_index], max_val,
+                 outptr[max_index]);
 
     return 0;
 }
 
-
 template <typename T>
 int verify_sign(const T *const inptr, const T *const outptr, int n)
 {
-    T r = 0;
+    double r = 0;
     for (int i = 0; i < n; i++)
     {
-        if (inptr[i] > 0.0f)
+        if (conv_to_dbl(inptr[i]) > 0.0f)
             r = 1.0;
-        else if (inptr[i] < 0.0f)
+        else if (conv_to_dbl(inptr[i]) < 0.0f)
             r = -1.0;
         else
             r = 0.0;
-        if (r != outptr[i]) return -1;
+        if (r != conv_to_dbl(outptr[i])) return -1;
     }
     return 0;
 }
 
 }
 
-
 template <typename T>
 int test_unary_fn(cl_device_id device, cl_context context,
                   cl_command_queue queue, int n_elems,
@@ -207,33 +225,38 @@ int test_unary_fn(cl_device_id device, cl_context context,
                 get_random_double(-100000.0 * M_PI, 100000.0 * M_PI, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (int j = 0; j < num_elements; j++)
+        {
+            input_ptr[j] = conv_to_half(get_random_float(
+                (float)(-10000.f * M_PI), (float)(10000.f * M_PI), d));
+        }
+    }
 
     err = clEnqueueWriteBuffer(queue, streams[0], true, 0,
                                sizeof(T) * num_elements, &input_ptr.front(), 0,
                                NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueWriteBuffer failed\n");
-        return -1;
-    }
+    test_error(err, "clEnqueueWriteBuffer failed\n");
 
     for (i = 0; i < kTotalVecCount; i++)
     {
         std::string kernelSource;
-        char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
+        const char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
 
         if (i >= kVectorSizeCount)
         {
             std::string str = unary_fn_code_pattern_v3;
-            kernelSource = string_format(str, pragma_str.c_str(), tname.c_str(),
-                                         tname.c_str(), fnName.c_str());
+            kernelSource = str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                       tname.c_str(), fnName.c_str());
         }
         else
         {
             std::string str = unary_fn_code_pattern;
-            kernelSource = string_format(str, pragma_str.c_str(), tname.c_str(),
-                                         vecSizeNames[i], tname.c_str(),
-                                         vecSizeNames[i], fnName.c_str());
+            kernelSource = str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                       vecSizeNames[i], tname.c_str(),
+                                       vecSizeNames[i], fnName.c_str());
         }
 
         /* Create kernels */
@@ -290,11 +313,18 @@ int test_unary_fn(cl_device_id device, cl_context context,
     return err;
 }
 
-
 cl_int DegreesTest::Run()
 {
-    cl_int error = test_unary_fn<float>(device, context, queue, num_elems,
-                                        fnName.c_str(), verify_degrees<float>);
+    cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_unary_fn<half>(device, context, queue, num_elems,
+                                    fnName.c_str(), verify_degrees<half>);
+        test_error(error, "DegreesTest::Run<cl_half> failed");
+    }
+
+    error = test_unary_fn<float>(device, context, queue, num_elems,
+                                 fnName.c_str(), verify_degrees<float>);
     test_error(error, "DegreesTest::Run<float> failed");
 
     if (is_extension_available(device, "cl_khr_fp64"))
@@ -307,11 +337,18 @@ cl_int DegreesTest::Run()
     return error;
 }
 
-
 cl_int RadiansTest::Run()
 {
-    cl_int error = test_unary_fn<float>(device, context, queue, num_elems,
-                                        fnName.c_str(), verify_radians<float>);
+    cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_unary_fn<half>(device, context, queue, num_elems,
+                                    fnName.c_str(), verify_radians<half>);
+        test_error(error, "RadiansTest::Run<cl_half> failed");
+    }
+
+    error = test_unary_fn<float>(device, context, queue, num_elems,
+                                 fnName.c_str(), verify_radians<float>);
     test_error(error, "RadiansTest::Run<float> failed");
 
     if (is_extension_available(device, "cl_khr_fp64"))
@@ -324,11 +361,18 @@ cl_int RadiansTest::Run()
     return error;
 }
 
-
 cl_int SignTest::Run()
 {
-    cl_int error = test_unary_fn<float>(device, context, queue, num_elems,
-                                        fnName.c_str(), verify_sign<float>);
+    cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_unary_fn<half>(device, context, queue, num_elems,
+                                    fnName.c_str(), verify_sign<half>);
+        test_error(error, "SignTest::Run<cl_half> failed");
+    }
+
+    error = test_unary_fn<float>(device, context, queue, num_elems,
+                                 fnName.c_str(), verify_sign<float>);
     test_error(error, "SignTest::Run<float> failed");
 
     if (is_extension_available(device, "cl_khr_fp64"))
@@ -341,7 +385,6 @@ cl_int SignTest::Run()
     return error;
 }
 
-
 int test_degrees(cl_device_id device, cl_context context,
                  cl_command_queue queue, int n_elems)
 {
@@ -349,7 +392,6 @@ int test_degrees(cl_device_id device, cl_context context,
                                        "degrees");
 }
 
-
 int test_radians(cl_device_id device, cl_context context,
                  cl_command_queue queue, int n_elems)
 {
@@ -357,7 +399,6 @@ int test_radians(cl_device_id device, cl_context context,
                                        "radians");
 }
 
-
 int test_sign(cl_device_id device, cl_context context, cl_command_queue queue,
               int n_elems)
 {
diff --git a/test_conformance/relationals/test_comparisons_fp.cpp b/test_conformance/relationals/test_comparisons_fp.cpp
index c3d8f67a37..73ff3dd9ed 100644
--- a/test_conformance/relationals/test_comparisons_fp.cpp
+++ b/test_conformance/relationals/test_comparisons_fp.cpp
@@ -22,6 +22,8 @@
 #include <stdexcept>
 #include <vector>
 
+#include "harness/stringHelpers.h"
+
 #include <CL/cl_half.h>
 
 #include "test_comparisons_fp.h"
@@ -83,29 +85,6 @@ extension,
 // clang-format on
 
 
-std::string concat_kernel(const char* sstr[], int num)
-{
-    std::string res;
-    for (int i = 0; i < num; i++) res += std::string(sstr[i]);
-    return res;
-}
-
-template <typename... Args>
-std::string string_format(const std::string& format, Args... args)
-{
-    int size_s = std::snprintf(nullptr, 0, format.c_str(), args...)
-        + 1; // Extra space for '\0'
-    if (size_s <= 0)
-    {
-        throw std::runtime_error("Error during formatting.");
-    }
-    auto size = static_cast<size_t>(size_s);
-    std::unique_ptr<char[]> buf(new char[size]);
-    std::snprintf(buf.get(), size, format.c_str(), args...);
-    return std::string(buf.get(),
-                       buf.get() + size - 1); // We don't want the '\0' inside
-}
-
 template <typename T, typename F> bool verify(const T& A, const T& B)
 {
     return F()(A, B);
@@ -226,14 +205,14 @@ int RelationalsFPTest::test_equiv_kernel(unsigned int vecSize,
             auto str =
                 concat_kernel(equivTestKerPat_3,
                               sizeof(equivTestKerPat_3) / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str(), opName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str(), opName.c_str());
         }
         else
         {
             auto str = concat_kernel(equivTestKerPatLessGreater_3,
                                      sizeof(equivTestKerPatLessGreater_3)
                                          / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str());
         }
     }
     else
@@ -243,14 +222,14 @@ int RelationalsFPTest::test_equiv_kernel(unsigned int vecSize,
             auto str =
                 concat_kernel(equivTestKernPat,
                               sizeof(equivTestKernPat) / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str(), opName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str(), opName.c_str());
         }
         else
         {
             auto str = concat_kernel(equivTestKernPatLessGreater,
                                      sizeof(equivTestKernPatLessGreater)
                                          / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str());
         }
     }
 

From 43c244f01de00e4d3beb63c4b9167eccfbdeaf77 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 27 Jun 2023 17:42:56 +0200
Subject: [PATCH 12/20] Added cl_khr_fp16 extension support for test
 vector_times_scalar from spirv_new (#1757)

* Added cl_khr_fp16 support for vector_times_scalar from spirv_new (issue #142, spirv_new)

* Logging correction
---
 .../vector_times_scalar_half.spvasm32         | 46 +++++++++++++++++
 .../vector_times_scalar_half.spvasm64         | 50 +++++++++++++++++++
 .../spirv_new/test_op_vector_times_scalar.cpp | 14 ++++++
 test_conformance/spirv_new/types.hpp          |  4 ++
 4 files changed, 114 insertions(+)
 create mode 100644 test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64

diff --git a/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32 b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32
new file mode 100644
index 0000000000..6fda7d8f18
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32
@@ -0,0 +1,46 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 25
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "vector_times_scalar" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpDecorate %5 FuncParamAttr NoCapture
+          %5 = OpDecorationGroup
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpGroupDecorate %5 %res %lhs %rhs
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+     %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+   %v4half = OpTypeVector %half 4
+%_ptr_CrossWorkgroup_v4half = OpTypePointer CrossWorkgroup %v4half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %15
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_v4half
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_v4half
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %16 = OpLabel
+         %17 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %18 = OpCompositeExtract %uint %17 0
+         %19 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %lhs %18
+         %20 = OpLoad %v4half %19 Aligned 8
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %18
+         %22 = OpLoad %half %21 Aligned 2
+         %23 = OpVectorTimesScalar %v4half %20 %22
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %res %18
+               OpStore %24 %23 Aligned 8
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64 b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64
new file mode 100644
index 0000000000..fa2d522103
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64
@@ -0,0 +1,50 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 28
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Float16
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "vector_times_scalar" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpDecorate %5 FuncParamAttr NoCapture
+          %5 = OpDecorationGroup
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpGroupDecorate %5 %res %lhs %rhs
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+   %ulong_32 = OpConstant %ulong 32
+       %void = OpTypeVoid
+     %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+   %v4half = OpTypeVector %half 4
+%_ptr_CrossWorkgroup_v4half = OpTypePointer CrossWorkgroup %v4half
+         %16 = OpTypeFunction %void %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %16
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_v4half
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_v4half
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %17 = OpLabel
+         %18 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0
+         %19 = OpCompositeExtract %ulong %18 0
+         %20 = OpShiftLeftLogical %ulong %19 %ulong_32
+         %21 = OpShiftRightArithmetic %ulong %20 %ulong_32
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %lhs %21
+         %23 = OpLoad %v4half %22 Aligned 8
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %21
+         %25 = OpLoad %half %24 Aligned 2
+         %26 = OpVectorTimesScalar %v4half %23 %25
+         %27 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %res %21
+               OpStore %27 %26 Aligned 8
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
index 0859668cbb..0be4e8b71c 100644
--- a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
+++ b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
@@ -17,6 +17,8 @@ or Khronos Conformance Test Source License Agreement as executed between Khronos
 #include <sstream>
 #include <string>
 
+using half = cl_half;
+
 template<typename Tv, typename Ts>
 int test_vector_times_scalar(cl_device_id deviceID,
                              cl_context context,
@@ -32,6 +34,16 @@ int test_vector_times_scalar(cl_device_id deviceID,
         }
     }
 
+    if (std::string(Tname).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info("Extension cl_khr_fp16 not supported; skipping half "
+                     "tests.\n");
+            return 0;
+        }
+    }
+
     cl_int err = CL_SUCCESS;
     int num = (int)h_lhs.size();
     size_t lhs_bytes = num * sizeof(Tv);
@@ -171,5 +183,7 @@ int test_vector_times_scalar(cl_device_id deviceID,
                                                 lhs, rhs);      \
     }
 
+
 TEST_VECTOR_TIMES_SCALAR(float, 4)
 TEST_VECTOR_TIMES_SCALAR(double, 4)
+TEST_VECTOR_TIMES_SCALAR(half, 4)
diff --git a/test_conformance/spirv_new/types.hpp b/test_conformance/spirv_new/types.hpp
index e7fceba0cd..27a45c5b06 100644
--- a/test_conformance/spirv_new/types.hpp
+++ b/test_conformance/spirv_new/types.hpp
@@ -43,6 +43,8 @@ VEC_NOT_EQ_FUNC(cl_float, 2)
 VEC_NOT_EQ_FUNC(cl_float, 4)
 VEC_NOT_EQ_FUNC(cl_double, 2)
 VEC_NOT_EQ_FUNC(cl_double, 4)
+VEC_NOT_EQ_FUNC(cl_half, 2)
+VEC_NOT_EQ_FUNC(cl_half, 4)
 
 template<typename T>
 bool isNotEqual(const T &lhs, const T &rhs)
@@ -109,6 +111,8 @@ GENRAND_REAL_FUNC(cl_float, 2)
 GENRAND_REAL_FUNC(cl_float, 4)
 GENRAND_REAL_FUNC(cl_double, 2)
 GENRAND_REAL_FUNC(cl_double, 4)
+GENRAND_REAL_FUNC(cl_half, 2)
+GENRAND_REAL_FUNC(cl_half, 4)
 
 template<> inline cl_half genrandReal<cl_half>(RandomSeed &seed)
 {

From 73ead9da04c2983288799effaaa12dbd02ae321d Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 27 Jun 2023 17:43:44 +0200
Subject: [PATCH 13/20] Added cl_khr_fp16 extension support for test_op_negate
 from spirv_new (#1762)

* Added cl_khr_fp16 extension support for test_op_negate from spirv_new (issue #142)

* Added clang format fix
---
 .../spirv_new/spirv_asm/op_neg_half.spvasm32  | 35 ++++++++++++++++
 .../spirv_new/spirv_asm/op_neg_half.spvasm64  | 39 ++++++++++++++++++
 test_conformance/spirv_new/test_op_negate.cpp | 40 +++++++++++--------
 test_conformance/spirv_new/types.hpp          |  2 +
 4 files changed, 100 insertions(+), 16 deletions(-)
 create mode 100644 test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64

diff --git a/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32 b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32
new file mode 100644
index 0000000000..4912718745
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32
@@ -0,0 +1,35 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 17
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "op_neg_half" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %10 = OpTypeFunction %void %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %10
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %11 = OpLabel
+         %12 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %13 = OpCompositeExtract %uint %12 0
+         %14 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %13
+         %15 = OpLoad %half %14
+         %16 = OpFNegate %half %15
+               OpStore %14 %16
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64 b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64
new file mode 100644
index 0000000000..9c7e3d6df4
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64
@@ -0,0 +1,39 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 20
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Float16
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "op_neg_half" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %10 = OpTypeFunction %void %_ptr_CrossWorkgroup_half
+   %ulong_32 = OpConstant %ulong 32
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %10
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %12 = OpLabel
+         %13 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0
+         %14 = OpCompositeExtract %ulong %13 0
+         %15 = OpShiftLeftLogical %ulong %14 %ulong_32
+         %16 = OpShiftRightArithmetic %ulong %15 %ulong_32
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %16
+         %18 = OpLoad %half %17
+         %19 = OpFNegate %half %18
+               OpStore %17 %19
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/test_op_negate.cpp b/test_conformance/spirv_new/test_op_negate.cpp
index e3dc1f349f..5009be9316 100644
--- a/test_conformance/spirv_new/test_op_negate.cpp
+++ b/test_conformance/spirv_new/test_op_negate.cpp
@@ -32,6 +32,15 @@ int test_negation(cl_device_id deviceID,
             return 0;
         }
     }
+    if (std::string(Tname).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info(
+                "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+            return 0;
+        }
+    }
 
     cl_int err = CL_SUCCESS;
     int num = (int)h_in.size();
@@ -73,29 +82,28 @@ int test_negation(cl_device_id deviceID,
     return 0;
 }
 
-#define TEST_NEGATION(TYPE, Tv, OP, FUNC)       \
-    TEST_SPIRV_FUNC(OP##_##TYPE)                \
-    {                                           \
-        int num = 1 << 20;                      \
-        std::vector<Tv> in(num);                \
-        RandomSeed seed(gRandomSeed);           \
-        for (int i = 0; i < num; i++) {         \
-            in[i] = genrand<Tv>(seed);          \
-        }                                       \
-        return test_negation<Tv>(deviceID,      \
-                                 context,       \
-                                 queue,         \
-                                 #TYPE,         \
-                                 #OP,           \
-                                 in, FUNC);     \
-    }                                           \
+#define TEST_NEGATION(TYPE, Tv, OP, FUNC)                                      \
+    TEST_SPIRV_FUNC(OP##_##TYPE)                                               \
+    {                                                                          \
+        int num = 1 << 20;                                                     \
+        std::vector<Tv> in(num);                                               \
+        RandomSeed seed(gRandomSeed);                                          \
+        for (int i = 0; i < num; i++)                                          \
+        {                                                                      \
+            in[i] = genrand<Tv>(seed);                                         \
+        }                                                                      \
+        return test_negation<Tv>(deviceID, context, queue, #TYPE, #OP, in,     \
+                                 FUNC);                                        \
+    }
 
 
+#define TEST_NEG_HALF TEST_NEGATION(half, cl_half, op_neg, negOpHalf)
 #define TEST_NEG(TYPE)        TEST_NEGATION(TYPE, cl_##TYPE, op_neg, negOp<cl_##TYPE>)
 #define TEST_NOT(TYPE)        TEST_NEGATION(TYPE, cl_##TYPE, op_not, notOp<cl_##TYPE>)
 #define TEST_NEG_VEC(TYPE, N) TEST_NEGATION(TYPE##N, cl_##TYPE##N, op_neg, (negOpVec<cl_##TYPE##N, N>))
 #define TEST_NOT_VEC(TYPE, N) TEST_NEGATION(TYPE##N, cl_##TYPE##N, op_not, (notOpVec<cl_##TYPE##N, N>))
 
+TEST_NEG_HALF
 TEST_NEG(float)
 TEST_NEG(double)
 TEST_NEG(int)
diff --git a/test_conformance/spirv_new/types.hpp b/test_conformance/spirv_new/types.hpp
index 27a45c5b06..939e6fa8c0 100644
--- a/test_conformance/spirv_new/types.hpp
+++ b/test_conformance/spirv_new/types.hpp
@@ -161,6 +161,8 @@ Tv negOp(Tv in)
     return -in;
 }
 
+inline cl_half negOpHalf(cl_half v) { return v ^ 0x8000; }
+
 template<typename Tv>
 Tv notOp(Tv in)
 {

From fee6d6bb6643f7f5e2b6dab46486c903e2a71680 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Jastrz=C4=99bski?= <p.k.jastrzebski@gmail.com>
Date: Tue, 27 Jun 2023 17:47:24 +0200
Subject: [PATCH 14/20] Command buffer re-enqueue testing. (#1738)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Command buffer re-enqueue testing.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Remove reenqueue tests and add reenqueue to existing tests.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Add re-enqueue for copy and barrier tests.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix review comments.

Applied review comments for command buffer re-enqueue testing:
- Add second clEnqueueCommandBufferKHR for all tests
- Reinitialise memory before second enqueue of command buffers
- Add different patterns for second enqueue of command buffers

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix verification patterns for second enqueue tests.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Reinitialise output memory for second command buffer re-enqueue.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix cast for conversion from 'const cl_char' to 'const cl_uint'.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix compilation error for MSVC.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Reinitialise in_mem and out_mem with zero.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix cast for conversion from 'const cl_int' to 'const cl_uint'.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

---------

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>
---
 .../basic_command_buffer.cpp                  |  25 ++-
 .../command_buffer_test_barrier.cpp           |  36 +++-
 .../command_buffer_test_copy.cpp              | 196 +++++++++++++++---
 .../command_buffer_test_fill.cpp              |  68 ++++--
 4 files changed, 277 insertions(+), 48 deletions(-)

diff --git a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
index 43734da0a5..6c02f9f788 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
@@ -201,14 +201,33 @@ struct BasicEnqueueTest : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_int> output_data(num_elements);
+        std::vector<cl_int> output_data_1(num_elements);
         error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < num_elements; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern, output_data_1[i], i);
+        }
+
+        const cl_int new_pattern = 12;
+        error = clEnqueueFillBuffer(queue, in_mem, &new_pattern, sizeof(cl_int),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_int> output_data_2(num_elements);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
+                                    output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(new_pattern, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp
index d73fc9ce7e..82ff16f0ec 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp
@@ -70,15 +70,42 @@ struct BarrierWithWaitListKHR : public BasicCommandBufferTest
             0, nullptr, out_of_order_command_buffer, 0, nullptr, &event);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_int> output_data(num_elements);
+        std::vector<cl_int> output_data_1(num_elements);
         error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
-                                    data_size(), output_data.data(), 1, &event,
-                                    nullptr);
+                                    data_size(), output_data_1.data(), 1,
+                                    &event, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < num_elements; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error =
+            clEnqueueFillBuffer(queue, in_mem, &zero_pattern, sizeof(cl_int), 0,
+                                data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBufferKHR failed");
+
+        error =
+            clEnqueueFillBuffer(queue, out_mem, &zero_pattern, sizeof(cl_int),
+                                0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 0, nullptr, &event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_int> output_data_2(num_elements);
+        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
+                                    data_size(), output_data_2.data(), 1,
+                                    &event, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -106,6 +133,7 @@ struct BarrierWithWaitListKHR : public BasicCommandBufferTest
     }
 
     const cl_int pattern = 0x16;
+    const cl_int zero_pattern = 0x0;
     clCommandQueueWrapper out_of_order_queue;
     clCommandBufferWrapper out_of_order_command_buffer;
     clEventWrapper event;
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp
index 102ae761e6..7a1f0e6d54 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp
@@ -38,7 +38,7 @@ struct CopyImageKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillImageKHR(command_buffer, nullptr, src_image,
-                                             fill_color, origin, region, 0,
+                                             fill_color_1, origin, region, 0,
                                              nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillImageKHR failed");
@@ -56,13 +56,38 @@ struct CopyImageKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
-        error = clEnqueueReadImage(queue, dst_image, CL_TRUE, origin, region, 0,
-                                   0, output_data.data(), 0, nullptr, nullptr);
+        std::vector<cl_char> output_data_1(data_size);
+        error =
+            clEnqueueReadImage(queue, dst_image, CL_TRUE, origin, region, 0, 0,
+                               output_data_1.data(), 0, nullptr, nullptr);
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillImage(queue, src_image, fill_color_2, origin,
+                                   region, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImageKHR failed");
+
+        error = clEnqueueFillImage(queue, dst_image, fill_color_2, origin,
+                                   region, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImageKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+        error =
+            clEnqueueReadImage(queue, dst_image, CL_TRUE, origin, region, 0, 0,
+                               output_data_2.data(), 0, nullptr, nullptr);
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -97,8 +122,12 @@ struct CopyImageKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_uint pattern = 0x05;
-    const cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+    const cl_uint pattern_1 = 0x05;
+    const cl_uint fill_color_1[4] = { pattern_1, pattern_1, pattern_1,
+                                      pattern_1 };
+    const cl_uint pattern_2 = 0x1;
+    const cl_uint fill_color_2[4] = { pattern_2, pattern_2, pattern_2,
+                                      pattern_2 };
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
     clMemWrapper src_image;
     clMemWrapper dst_image;
@@ -111,7 +140,7 @@ struct CopyBufferKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, in_mem, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, in_mem, &pattern_1, sizeof(cl_char), 0,
             data_size(), 0, nullptr, nullptr, nullptr);
         test_error(error, "clCommandFillBufferKHR failed");
 
@@ -127,20 +156,45 @@ struct CopyBufferKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size());
+        std::vector<cl_char> output_data_1(data_size());
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
+                                    output_data_1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBufferKHR failed");
+
+        error = clEnqueueFillBuffer(queue, out_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size());
         error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_2.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size(); i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
     }
 
-    const cl_char pattern = 0x14;
+    const cl_char pattern_1 = 0x14;
+    const cl_char pattern_2 = 0x28;
 };
 
 struct CopyBufferToImageKHR : public BasicCommandBufferTest
@@ -150,7 +204,7 @@ struct CopyBufferToImageKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, buffer, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, buffer, &pattern_1, sizeof(cl_char), 0,
             data_size, 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillBufferKHR failed");
@@ -168,15 +222,40 @@ struct CopyBufferToImageKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
 
         error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
-                                   output_data.data(), 0, nullptr, nullptr);
+                                   output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadImage failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillBuffer(queue, buffer, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueFillImage(queue, image, &fill_color_2, origin, region,
+                                   0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+
+        error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
+                                   output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -211,7 +290,14 @@ struct CopyBufferToImageKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_char pattern = 0x11;
+    const cl_char pattern_1 = 0x11;
+    const cl_char pattern_2 = 0x22;
+
+    const cl_uint fill_color_2[4] = { static_cast<cl_uint>(pattern_2),
+                                      static_cast<cl_uint>(pattern_2),
+                                      static_cast<cl_uint>(pattern_2),
+                                      static_cast<cl_uint>(pattern_2) };
+
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
 
     clMemWrapper buffer;
@@ -225,7 +311,7 @@ struct CopyImageToBufferKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error =
-            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color,
+            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color_1,
                                   origin, region, 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillImageKHR failed");
@@ -243,16 +329,39 @@ struct CopyImageToBufferKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
 
         error = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, data_size,
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern),
-                                     output_data[i], i);
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_1[i], i);
+        }
+
+        error = clEnqueueFillImage(queue, image, fill_color_2, origin, region,
+                                   0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueFillBuffer(queue, buffer, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+
+        error = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, data_size,
+                                    output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -287,8 +396,12 @@ struct CopyImageToBufferKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_uint pattern = 0x12;
-    const cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+    const cl_uint pattern_1 = 0x12;
+    const cl_uint fill_color_1[4] = { pattern_1, pattern_1, pattern_1,
+                                      pattern_1 };
+    const cl_uint pattern_2 = 0x24;
+    const cl_uint fill_color_2[4] = { pattern_2, pattern_2, pattern_2,
+                                      pattern_2 };
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
 
     clMemWrapper image;
@@ -302,7 +415,7 @@ struct CopyBufferRectKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, in_mem, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, in_mem, &pattern_1, sizeof(cl_char), 0,
             data_size, 0, nullptr, nullptr, nullptr);
         test_error(error, "clCommandFillBufferKHR failed");
 
@@ -319,14 +432,38 @@ struct CopyBufferRectKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size,
+                                    output_data_1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueFillBuffer(queue, out_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
         error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size,
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_2.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -353,7 +490,8 @@ struct CopyBufferRectKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_char pattern = 0x13;
+    const cl_char pattern_1 = 0x13;
+    const cl_char pattern_2 = 0x26;
 
     clMemWrapper in_mem;
     clMemWrapper out_mem;
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp
index 88e97a2715..0ba8055a14 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp
@@ -35,7 +35,7 @@ struct FillImageKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error =
-            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color,
+            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color_1,
                                   origin, region, 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillImageKHR failed");
@@ -47,14 +47,34 @@ struct FillImageKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
         error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
-                                   output_data.data(), 0, nullptr, nullptr);
+                                   output_data_1.data(), 0, nullptr, nullptr);
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern),
-                                     output_data[i], i);
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillImage(queue, image, fill_color_2, origin, region,
+                                   0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+        error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
+                                   output_data_2.data(), 0, nullptr, nullptr);
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -85,8 +105,12 @@ struct FillImageKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_uint pattern = 0x10;
-    const cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+    const cl_uint pattern_1 = 0x10;
+    const cl_uint fill_color_1[4] = { pattern_1, pattern_1, pattern_1,
+                                      pattern_1 };
+    const cl_uint pattern_2 = 0x20;
+    const cl_uint fill_color_2[4] = { pattern_2, pattern_2, pattern_2,
+                                      pattern_2 };
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
 
     clMemWrapper image;
@@ -99,7 +123,7 @@ struct FillBufferKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, in_mem, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, in_mem, &pattern_1, sizeof(cl_char), 0,
             data_size(), 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillBufferKHR failed");
@@ -111,20 +135,40 @@ struct FillBufferKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size());
+        std::vector<cl_char> output_data_1(data_size());
+        error = clEnqueueReadBuffer(queue, in_mem, CL_TRUE, 0, data_size(),
+                                    output_data_1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        clEnqueueFillBuffer(queue, in_mem, &pattern_2, sizeof(cl_char), 0,
+                            data_size(), 0, nullptr, nullptr);
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size());
         error = clEnqueueReadBuffer(queue, in_mem, CL_TRUE, 0, data_size(),
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_2.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size(); i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
     }
 
-    const char pattern = 0x15;
+    const char pattern_1 = 0x15;
+    const char pattern_2 = 0x30;
 };
 
 };

From 56974a58585b8c66d9beddccd984990e45ca0ad7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Jastrz=C4=99bski?= <p.k.jastrzebski@gmail.com>
Date: Tue, 27 Jun 2023 17:54:14 +0200
Subject: [PATCH 15/20] Add global offset tests for
 cl_khr_command_buffer_mutable_dispatch. (#1743)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add global offset tests for cl_khr_command_buffer_mutable_dispatch.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Add kernel with observable output.

We should check that there's some observable output from the kernel
as a result of the change to global work offset, not just that
clGetMutableCommandInfoKHR has been updated. E.g. we could call
get_global_offset() inside of the kernel, write something to a
buffer based on that, and read the buffer after the command-buffer
enqueue has finished.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix review comments.

Applied review comments for mutable dispatch global offset test:
- clFinish to ensure the command-buffer has finished executing before calling clUpdateMutableCommandsKHR
- Change variable and constant names for global offset
- Remove redundant return CL_SUCCESS

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix review comments.

Changes made:
- Fix skip conditions
- Remove obsolete variable
- Replace a variable with a constant

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix review comments.

Changes made:
- Remove explicit base class call
- Fix constant magic number

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix constant magic number.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix clang-format.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

* Fix condition for result check.

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>

---------

Signed-off-by: Paweł Jastrzębski <p.k.jastrzebski@gmail.com>
---
 .../CMakeLists.txt                            |   1 +
 .../main.cpp                                  |   1 +
 .../mutable_command_basic.h                   |  63 ++++--
 .../mutable_command_global_offset.cpp         | 179 ++++++++++++++++++
 .../mutable_command_info.cpp                  |  60 +++---
 .../procs.h                                   |   6 +-
 6 files changed, 267 insertions(+), 43 deletions(-)
 create mode 100644 test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp

diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt
index e06258335a..8021460947 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt
@@ -3,6 +3,7 @@ set(MODULE_NAME CL_KHR_MUTABLE_DISPATCH)
 set(${MODULE_NAME}_SOURCES
     main.cpp
     mutable_command_info.cpp
+    mutable_command_global_offset.cpp
     ../basic_command_buffer.cpp
 )
 
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp
index 97075792bc..b53914dc56 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp
@@ -26,6 +26,7 @@ test_definition test_list[] = {
     ADD_TEST(mutable_command_info_global_work_offset),
     ADD_TEST(mutable_command_info_local_work_size),
     ADD_TEST(mutable_command_info_global_work_size),
+    ADD_TEST(mutable_dispatch_global_offset),
 };
 
 int main(int argc, const char *argv[])
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
index 966695834b..c88c14d1c7 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
@@ -19,6 +19,17 @@
 #include "../basic_command_buffer.h"
 #include "../command_buffer_test_base.h"
 
+// Look up extension function FUNC on `platform`; return TEST_FAIL if missing.
+#define GET_EXTENSION_ADDRESS(FUNC)                                            \
+    FUNC = reinterpret_cast<FUNC##_fn>(                                        \
+        clGetExtensionFunctionAddressForPlatform(platform, #FUNC));            \
+    if (FUNC == nullptr)                                                       \
+    {                                                                          \
+        log_error("ERROR: clGetExtensionFunctionAddressForPlatform failed"     \
+                  " with " #FUNC "\n");                                        \
+        return TEST_FAIL;                                                      \
+    }
+
 struct BasicMutableCommandBufferTest : BasicCommandBufferTest
 {
     BasicMutableCommandBufferTest(cl_device_id device, cl_context context,
@@ -84,24 +95,52 @@ struct BasicMutableCommandBufferTest : BasicCommandBufferTest
                             &platform, nullptr);
         test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
 
-        // If it is supported get the addresses of all the APIs here.
-#define GET_EXTENSION_ADDRESS(FUNC)                                            \
-    FUNC = reinterpret_cast<FUNC##_fn>(                                        \
-        clGetExtensionFunctionAddressForPlatform(platform, #FUNC));            \
-    if (FUNC == nullptr)                                                       \
-    {                                                                          \
-        log_error("ERROR: clGetExtensionFunctionAddressForPlatform failed"     \
-                  " with " #FUNC "\n");                                        \
-        return TEST_FAIL;                                                      \
+        GET_EXTENSION_ADDRESS(clUpdateMutableCommandsKHR);
+
+        return CL_SUCCESS;
     }
+
+    clUpdateMutableCommandsKHR_fn clUpdateMutableCommandsKHR = nullptr;
+
+    const char* kernelString = "__kernel void empty() {}";
+    const size_t global_work_size = 4 * 16;
+};
+
+struct InfoMutableCommandBufferTest : BasicMutableCommandBufferTest
+{
+    InfoMutableCommandBufferTest(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    virtual cl_int SetUp(int elements) override
+    {
+        BasicMutableCommandBufferTest::SetUp(elements);
+
+        cl_int error = init_extension_functions();
+        test_error(error, "Unable to initialise extension functions");
+
+        return CL_SUCCESS;
+    }
+
+    cl_int init_extension_functions()
+    {
+        BasicCommandBufferTest::init_extension_functions();
+
+        cl_platform_id platform;
+        cl_int error =
+            clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
+                            &platform, nullptr);
+        test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
+
         GET_EXTENSION_ADDRESS(clGetMutableCommandInfoKHR);
 
         return CL_SUCCESS;
     }
 
     clGetMutableCommandInfoKHR_fn clGetMutableCommandInfoKHR = nullptr;
-    const char* kernelString = "__kernel void empty() {}";
-    const size_t global_work_size = 4 * sizeof(cl_int);
 };
 
-#endif // CL_KHR_MUTABLE_COMMAND_BASIC_H
+#undef GET_EXTENSION_ADDRESS
+
+#endif //_CL_KHR_MUTABLE_COMMAND_BASIC_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp
new file mode 100644
index 0000000000..70e1d9b163
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp
@@ -0,0 +1,179 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <extensionHelpers.h>
+#include "typeWrappers.h"
+#include "procs.h"
+#include "testHarness.h"
+#include "imageHelpers.h"
+#include <vector>
+#include <iostream>
+#include <random>
+#include <cstring>
+#include <algorithm>
+#include <memory>
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// Mutable dispatch tests which handle the following cases:
+//
+// CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR
+
+struct MutableDispatchGlobalOffset : InfoMutableCommandBufferTest
+{
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
+
+    MutableDispatchGlobalOffset(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Skip when the device cannot mutate the global offset or the base skips.
+    bool Skip() override
+    {
+        cl_mutable_dispatch_fields_khr mutable_capabilities = 0;
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR;
+
+        return !mutable_support || InfoMutableCommandBufferTest::Skip();
+    }
+
+    // Runs the dispatch with no global offset, updates the offset, reruns it,
+    // then checks the offset reported for the command and the stored values.
+    cl_int Run() override
+    {
+        const char *global_offset_kernel =
+            R"(
+                __kernel void sample_test(__global int *dst)
+            {
+                size_t tid = get_global_id(0);
+                dst[tid] = get_global_offset(0);
+            })";
+
+        cl_int error =
+            create_single_kernel_helper(context, &program, &kernel, 1,
+                                        &global_offset_kernel, "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clMemWrapper stream;
+        stream = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate,
+                                nullptr, &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &stream);
+        test_error(error, "Unable to set indexed kernel arguments");
+        // Record with a null global offset; `command` receives the handle.
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &global_work_size, nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+        // Only the global work offset of `command` is changed by this update.
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            0 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            nullptr /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            &update_global_offset /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clGetMutableCommandInfoKHR(
+            command, CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR,
+            sizeof(info_global_offset), &info_global_offset, nullptr);
+        test_error(error, "clGetMutableCommandInfoKHR failed");
+
+        if (info_global_offset != update_global_offset)
+        {
+            log_error("ERROR: Wrong global work offset returned from "
+                      "clGetMutableCommandInfoKHR.\n");
+            return TEST_FAIL;
+        }
+
+        std::vector<cl_int> resultData(num_elements);
+
+        error = clEnqueueReadBuffer(queue, stream, CL_TRUE, 0, sizeToAllocate,
+                                    resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // The first run stored 0 at ids below global_work_size; the updated run
+        // stored the new offset at every id from update_global_offset upwards.
+        const cl_int new_offset = static_cast<cl_int>(update_global_offset);
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            const cl_int expected = i < update_global_offset ? 0 : new_offset;
+            if (resultData[i] != expected)
+            {
+                log_error("Data failed to verify: expected %d, got "
+                          "resultData[%zu]=%d\n",
+                          expected, i, resultData[i]);
+                return TEST_FAIL;
+            }
+        }
+        return CL_SUCCESS;
+    }
+
+    size_t info_global_offset = 0;
+    const size_t update_global_offset = 3;
+    // Sized so the ids shifted by the offset still land inside the buffer.
+    const size_t sizeToAllocate =
+        (global_work_size + update_global_offset) * sizeof(cl_int);
+    const size_t num_elements = sizeToAllocate / sizeof(cl_int);
+    cl_mutable_command_khr command = nullptr;
+};
+
+int test_mutable_dispatch_global_offset(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
+{
+    // Run the MutableDispatchGlobalOffset fixture through MakeAndRunTest.
+    return MakeAndRunTest<MutableDispatchGlobalOffset>(device, context, queue,
+                                                       num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
index cc425a4d68..a8ed325ac7 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
@@ -42,13 +42,13 @@
 // CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR
 // CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR
 
-struct InfoDeviceQuery : public BasicMutableCommandBufferTest
+struct InfoDeviceQuery : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoDeviceQuery(cl_device_id device, cl_context context,
                     cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -71,12 +71,12 @@ struct InfoDeviceQuery : public BasicMutableCommandBufferTest
     }
 };
 
-struct InfoBuffer : public BasicMutableCommandBufferTest
+struct InfoBuffer : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoBuffer(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -108,13 +108,13 @@ struct InfoBuffer : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct PropertiesArray : public BasicMutableCommandBufferTest
+struct PropertiesArray : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     PropertiesArray(cl_device_id device, cl_context context,
                     cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -154,12 +154,12 @@ struct PropertiesArray : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct Kernel : public BasicMutableCommandBufferTest
+struct Kernel : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     Kernel(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -195,12 +195,12 @@ struct Kernel : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct Dimensions : public BasicMutableCommandBufferTest
+struct Dimensions : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     Dimensions(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -234,12 +234,12 @@ struct Dimensions : public BasicMutableCommandBufferTest
     const size_t dimensions = 3;
 };
 
-struct InfoType : public BasicMutableCommandBufferTest
+struct InfoType : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoType(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -271,12 +271,12 @@ struct InfoType : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct InfoQueue : public BasicMutableCommandBufferTest
+struct InfoQueue : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoQueue(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -308,13 +308,13 @@ struct InfoQueue : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct InfoGlobalWorkOffset : public BasicMutableCommandBufferTest
+struct InfoGlobalWorkOffset : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoGlobalWorkOffset(cl_device_id device, cl_context context,
                          cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -346,13 +346,13 @@ struct InfoGlobalWorkOffset : public BasicMutableCommandBufferTest
     size_t test_global_work_offset = 0;
 };
 
-struct InfoGlobalWorkSize : public BasicMutableCommandBufferTest
+struct InfoGlobalWorkSize : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoGlobalWorkSize(cl_device_id device, cl_context context,
                        cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -383,13 +383,13 @@ struct InfoGlobalWorkSize : public BasicMutableCommandBufferTest
     size_t test_global_work_size = 0;
 };
 
-struct InfoLocalWorkSize : public BasicMutableCommandBufferTest
+struct InfoLocalWorkSize : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoLocalWorkSize(cl_device_id device, cl_context context,
                       cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
index 4b6dacb699..588bdc817e 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
@@ -59,4 +59,8 @@ extern int test_mutable_command_info_global_work_size(cl_device_id device,
                                                       cl_context context,
                                                       cl_command_queue queue,
                                                       int num_elements);
-#endif // CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
+extern int test_mutable_dispatch_global_offset(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements);
+#endif /*_CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H*/

From 75aca34e600a9ac0fbee524404a2ac7cf4d37801 Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Wed, 28 Jun 2023 08:13:15 +0100
Subject: [PATCH 16/20] Test CL_COMMAND_BUFFER_CONTEXT_KHR (#1697)

Test coverage for spec PR https://github.com/KhronosGroup/OpenCL-Docs/pull/899
which introduces a new cl_khr_command_buffer query for the cl_context
---
 ...command_buffer_get_command_buffer_info.cpp | 54 +++++++++++++++++++
 .../extensions/cl_khr_command_buffer/main.cpp |  1 +
 .../extensions/cl_khr_command_buffer/procs.h  |  2 +
 3 files changed, 57 insertions(+)

diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp
index d46b288877..1ada904d6b 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp
@@ -26,6 +26,7 @@ enum class CombufInfoTestMode
     CITM_REF_COUNT,
     CITM_STATE,
     CITM_PROP_ARRAY,
+    CITM_CONTEXT,
 };
 
 namespace {
@@ -38,6 +39,7 @@ namespace {
 // -test case for CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR query
 // -test case for CL_COMMAND_BUFFER_STATE_KHR query
 // -test case for CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR query
+// -test case for CL_COMMAND_BUFFER_CONTEXT_KHR query
 
 template <CombufInfoTestMode test_mode>
 struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
@@ -70,6 +72,10 @@ struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
                 error = RunPropArrayInfoTest();
                 test_error(error, "RunPropArrayInfoTest failed");
                 break;
+            case CombufInfoTestMode::CITM_CONTEXT:
+                error = RunContextInfoTest();
+                test_error(error, "RunContextInfoTest failed");
+                break;
         }
 
         return CL_SUCCESS;
@@ -323,6 +329,46 @@ struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
         return TEST_FAIL;
     }
 
+    cl_int RunContextInfoTest()
+    {
+        cl_int error = TEST_PASS;
+        // Exercises the CL_COMMAND_BUFFER_CONTEXT_KHR query: size, then value.
+        // Record commands via RecordCommandBuffer() before querying.
+        error = RecordCommandBuffer();
+        test_error(error, "RecordCommandBuffer failed");
+        // Size-only query: no output buffer, only the required byte count.
+        size_t ret_value_size = 0;
+        error = clGetCommandBufferInfoKHR(command_buffer,
+                                          CL_COMMAND_BUFFER_CONTEXT_KHR, 0,
+                                          nullptr, &ret_value_size);
+        test_error(error, "clGetCommandBufferInfoKHR failed");
+
+        test_assert_error(
+            ret_value_size == sizeof(cl_context),
+            "Unexpected result of CL_COMMAND_BUFFER_CONTEXT_KHR query!");
+        // Value query: fetch the context handle and require it to be set.
+        cl_context ret_context = nullptr;
+        error = clGetCommandBufferInfoKHR(
+            command_buffer, CL_COMMAND_BUFFER_CONTEXT_KHR, sizeof(cl_context),
+            &ret_context, nullptr);
+        test_error(error, "clGetCommandBufferInfoKHR failed");
+        test_assert_error(
+            ret_context != nullptr,
+            "Unexpected result of CL_COMMAND_BUFFER_CONTEXT_KHR query!");
+        // Reference value: the context that `queue` reports for itself.
+        cl_context expected_context = nullptr;
+        error =
+            clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context),
+                                  &expected_context, nullptr);
+        test_error(error, "clGetCommandQueueInfo failed");
+        // The command buffer and the queue must report the same context.
+        test_assert_error(
+            ret_context == expected_context,
+            "Unexpected result of CL_COMMAND_BUFFER_CONTEXT_KHR query!");
+
+        return TEST_PASS;
+    }
+
     const cl_int pattern = 0xE;
 };
 
@@ -360,3 +406,11 @@ int test_info_prop_array(cl_device_id device, cl_context context,
         CommandBufferGetCommandBufferInfo<CombufInfoTestMode::CITM_PROP_ARRAY>>(
         device, context, queue, num_elements);
 }
+// Runs the CombufInfoTestMode::CITM_CONTEXT variant through MakeAndRunTest.
+int test_info_context(cl_device_id device, cl_context context,
+                      cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<
+        CommandBufferGetCommandBufferInfo<CombufInfoTestMode::CITM_CONTEXT>>(
+        device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/main.cpp b/test_conformance/extensions/cl_khr_command_buffer/main.cpp
index 4eefc8ab1f..3562282746 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/main.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/main.cpp
@@ -26,6 +26,7 @@ test_definition test_list[] = {
     ADD_TEST(info_ref_count),
     ADD_TEST(info_state),
     ADD_TEST(info_prop_array),
+    ADD_TEST(info_context),
     ADD_TEST(basic_profiling),
     ADD_TEST(simultaneous_profiling),
     ADD_TEST(regular_wait_for_command_buffer),
diff --git a/test_conformance/extensions/cl_khr_command_buffer/procs.h b/test_conformance/extensions/cl_khr_command_buffer/procs.h
index 53a7d93490..5c4e67fe35 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/procs.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/procs.h
@@ -41,6 +41,8 @@ extern int test_info_state(cl_device_id device, cl_context context,
                            cl_command_queue queue, int num_elements);
 extern int test_info_prop_array(cl_device_id device, cl_context context,
                                 cl_command_queue queue, int num_elements);
+extern int test_info_context(cl_device_id device, cl_context context,
+                             cl_command_queue queue, int num_elements);
 extern int test_basic_set_kernel_arg(cl_device_id device, cl_context context,
                                      cl_command_queue queue, int num_elements);
 extern int test_pending_set_kernel_arg(cl_device_id device, cl_context context,

From 729cd8b7a94de09589d7703e59d266ab3eed8cdd Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Wed, 28 Jun 2023 09:34:07 +0100
Subject: [PATCH 17/20] [NFC] device_execution: use raw string literals for
 block kernels (#1767)

Modernize by using raw string literals, which makes the kernel sources
easier to read/extract.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 .../device_execution/enqueue_block.cpp        | 1061 ++++++++---------
 1 file changed, 519 insertions(+), 542 deletions(-)

diff --git a/test_conformance/device_execution/enqueue_block.cpp b/test_conformance/device_execution/enqueue_block.cpp
index 29a6cec15b..4ddd1db7f8 100644
--- a/test_conformance/device_execution/enqueue_block.cpp
+++ b/test_conformance/device_execution/enqueue_block.cpp
@@ -27,561 +27,538 @@
 
 #ifdef CL_VERSION_2_0
 extern int gWimpyMode;
-static const char* enqueue_simple_block[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  res[tid] = mul * 7 - 21;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_simple_block(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "  res[tid] = -1;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
 
-static const char* enqueue_block_with_local_arg1[] =
-{
-    NL, "#define LOCAL_MEM_SIZE 10"
-    NL, ""
-    NL, "void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp)"
-    NL, "{"
-    NL, "  for(int i = 0; i < LOCAL_MEM_SIZE; i++)"
-    NL, "  {"
-    NL, "    tmp[i] = mul * 7 - 21;"
-    NL, "    res[tid] += tmp[i];"
-    NL, "  }"
-    NL, "  res[tid] += 2;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_local_arg1(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  void (^kernelBlock)(__local void*) = ^(__local void* buf){ block_fn_local_arg1(tid, multiplier, res, (local int*)buf); };"
-    NL, ""
-    NL, "  res[tid] = -2;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)));"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
+// clang-format off
+static const char* enqueue_simple_block[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      res[tid] = mul * 7 - 21;
+    }
 
-static const char* enqueue_block_with_local_arg2[] =
-{
-    NL, "#define LOCAL_MEM_SIZE 10"
-    NL, ""
-    NL, "void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp1, __local float4* tmp2)"
-    NL, "{"
-    NL, "  for(int i = 0; i < LOCAL_MEM_SIZE; i++)"
-    NL, "  {"
-    NL, "    tmp1[i]   = mul * 7 - 21;"
-    NL, "    tmp2[i].x = (float)(mul * 7 - 21);"
-    NL, "    tmp2[i].y = (float)(mul * 7 - 21);"
-    NL, "    tmp2[i].z = (float)(mul * 7 - 21);"
-    NL, "    tmp2[i].w = (float)(mul * 7 - 21);"
-    NL, ""
-    NL, "    res[tid] += tmp1[i];"
-    NL, "    res[tid] += (int)(tmp2[i].x+tmp2[i].y+tmp2[i].z+tmp2[i].w);"
-    NL, "  }"
-    NL, "  res[tid] += 2;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_local_arg2(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  void (^kernelBlock)(__local void*, __local void*) = ^(__local void* buf1, __local void* buf2)"
-    NL, "    { block_fn_local_arg1(tid, multiplier, res, (local int*)buf1, (local float4*)buf2); };"
-    NL, ""
-    NL, "  res[tid] = -2;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)), (uint)(LOCAL_MEM_SIZE*sizeof(float4)));"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
+    kernel void enqueue_simple_block(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
 
-static const char* enqueue_block_with_wait_list[] =
-{
-    NL, "#define BLOCK_SUBMITTED 1"
-    NL, "#define BLOCK_COMPLETED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_block_with_wait_list(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  clk_event_t block_evt;"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt,"
-    NL, "  ^{"
-    NL, "      res[tid] = BLOCK_COMPLETED;"
-    NL, "   });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  retain_event(block_evt);"
-    NL, "  release_event(block_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(user_evt);"
-    NL, "  release_event(block_evt);"
-    NL, "}"
-    NL
-};
+      void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
 
-static const char* enqueue_block_with_wait_list_and_local_arg[] =
-{
-    NL, "#define LOCAL_MEM_SIZE 10"
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define BLOCK_STARTED   3"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "void block_fn_local_arg(size_t tid, int mul, __global int* res, __local int* tmp)"
-    NL, "{"
-    NL, "  res[tid] = BLOCK_STARTED;"
-    NL, "  for(int i = 0; i < LOCAL_MEM_SIZE; i++)"
-    NL, "  {"
-    NL, "    tmp[i] = mul * 7 - 21;"
-    NL, "    res[tid] += tmp[i];"
-    NL, "  }"
-    NL, "  if(res[tid] == BLOCK_STARTED) res[tid] = BLOCK_COMPLETED;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_wait_list_and_local_arg(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  clk_event_t block_evt;"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt, "
-    NL, "    ^(__local void* buf) {"
-    NL, "       block_fn_local_arg(tid, multiplier, res, (__local int*)buf);"
-    NL, "     }, LOCAL_MEM_SIZE*sizeof(int));"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  retain_event(block_evt);"
-    NL, "  release_event(block_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(user_evt);"
-    NL, "  release_event(block_evt);"
-    NL, "}"
-    NL
-};
+      res[tid] = -1;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
 
-static const char* enqueue_block_get_kernel_work_group_size[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  res[tid] = mul * 7 - 21;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_get_kernel_work_group_size(__global int* res)"
-    NL, "{"
-    NL, "    int multiplier = 3;"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "    void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "    size_t local_work_size = get_kernel_work_group_size(kernelBlock);"
-    NL, "    if (local_work_size <= 0){ res[tid] = -1; return; }"
-    NL, "    size_t global_work_size = local_work_size * 4;"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t q1 = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-};
+static const char* enqueue_block_with_local_arg1[] = { R"(
+    #define LOCAL_MEM_SIZE 10
 
-static const char* enqueue_block_get_kernel_preferred_work_group_size_multiple[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  res[tid] = mul * 7 - 21;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_get_kernel_preferred_work_group_size_multiple(__global int* res)"
-    NL, "{"
-    NL, "    int multiplier = 3;"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "    void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "    size_t local_work_size = get_kernel_preferred_work_group_size_multiple(kernelBlock);"
-    NL, "    if (local_work_size <= 0){ res[tid] = -1; return; }"
-    NL, "    size_t global_work_size = local_work_size * 4;"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t q1 = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-};
+    void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp)
+    {
+      for (int i = 0; i < LOCAL_MEM_SIZE; i++)
+      {
+        tmp[i] = mul * 7 - 21;
+        res[tid] += tmp[i];
+      }
+      res[tid] += 2;
+    }
 
-static const char* enqueue_block_capture_event_profiling_info_after_execution[] =
-{
-    NL, "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS)
-    NL, ""
-    NL, "__global ulong value[MAX_GWS*2] = {0};"
-    NL, ""
-    NL, "void block_fn(size_t tid, __global int* res)"
-    NL, "{"
-    NL, "    res[tid] = -2;"
-    NL, "}"
-    NL, ""
-    NL, "void check_res(size_t tid, const clk_event_t evt, __global int* res)"
-    NL, "{"
-    NL, "    capture_event_profiling_info (evt, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);"
-    NL, ""
-    NL, "    if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;"
-    NL, "    else                                        res[tid] = -4;"
-    NL, "    release_event(evt);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_capture_event_profiling_info_after_execution(__global int* res)"
-    NL, "{"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t def_q = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(1);"
-    NL, "    clk_event_t block_evt1;"
-    NL, ""
-    NL, "    void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 0, NULL, &block_evt1, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "    void (^checkBlock) (void)  = ^{ check_res(tid, block_evt1, res);      };"
-    NL, ""
-    NL, "    enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, NULL, checkBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, "}"
-    NL
-};
+    kernel void enqueue_block_with_local_arg1(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
 
-static const char* enqueue_block_capture_event_profiling_info_before_execution[] =
-{
-    NL, "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS)
-    NL, ""
-    NL, "__global ulong value[MAX_GWS*2] = {0};"
-    NL, ""
-    NL, "void block_fn(size_t tid, __global int* res)"
-    NL, "{"
-    NL, "    res[tid] = -2;"
-    NL, "}"
-    NL, ""
-    NL, "void check_res(size_t tid, const ulong *value, __global int* res)"
-    NL, "{"
-    NL, "    if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;"
-    NL, "    else                                        res[tid] = -4;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_capture_event_profiling_info_before_execution(__global int* res)"
-    NL, "{"
-    NL, "    int multiplier = 3;"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, "    clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t def_q = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(1);"
-    NL, "    clk_event_t block_evt1;"
-    NL, "    clk_event_t block_evt2;"
-    NL, ""
-    NL, "    void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "    capture_event_profiling_info (block_evt1, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);"
-    NL, ""
-    NL, "    set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "    void (^checkBlock) (void)  = ^{ check_res(tid, &value, res);      };"
-    NL, ""
-    NL, "    enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, &block_evt2, checkBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "    release_event(user_evt);"
-    NL, "    release_event(block_evt1);"
-    NL, "    release_event(block_evt2);"
-    NL, "}"
-    NL
-};
+      void (^kernelBlock)(__local void*) = ^(__local void* buf){ block_fn_local_arg1(tid, multiplier, res, (local int*)buf); };
 
-static const char* enqueue_block_with_barrier[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  if(mul > 0) barrier(CLK_GLOBAL_MEM_FENCE);"
-    NL, "  res[tid] = mul * 7 -21;"
-    NL, "}"
-    NL, ""
-    NL, "void loop_fn(size_t tid, int n, __global int* res)"
-    NL, "{"
-    NL, "  while(n > 0)"
-    NL, "  {"
-    NL, "    barrier(CLK_GLOBAL_MEM_FENCE);"
-    NL, "    res[tid] = 0;"
-    NL, "    --n;"
-    NL, "  }"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_barrier(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  res[tid] = -1;"
-    NL, "  size_t n = 256;"
-    NL, ""
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "  ndrange_t ndrange = ndrange_1D(n);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  void (^loopBlock)(void) = ^{ loop_fn(tid, n, res); };"
-    NL, ""
-    NL, "  enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, loopBlock);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
+      res[tid] = -2;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)));
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
 
-static const char* enqueue_marker_with_block_event[] =
-{
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_marker_with_block_event(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, ""
-    NL, "  clk_event_t block_evt1;"
-    NL, "  clk_event_t marker_evt;"
-    NL, ""
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1,"
-    NL, "  ^{"
-    NL, "     res[tid] = BLOCK_COMPLETED;"
-    NL, "   });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -2; return; }"
-    NL, ""
-    NL, "  enq_res = enqueue_marker(def_q, 1, &block_evt1, &marker_evt);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "  retain_event(marker_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(block_evt1);"
-    NL, "  release_event(marker_evt);"
-    NL, "  release_event(user_evt);"
-    NL, "}"
-    NL
-};
+static const char* enqueue_block_with_local_arg2[] = { R"(
+    #define LOCAL_MEM_SIZE 10
 
-static const char* enqueue_marker_with_user_event[] =
-{
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_marker_with_user_event(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  uint multiplier = 7;"
-    NL, ""
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, ""
-    NL, "  clk_event_t marker_evt;"
-    NL, "  clk_event_t block_evt;"
-    NL, ""
-    NL, "  int enq_res = enqueue_marker(def_q, 1, &user_evt, &marker_evt);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  retain_event(marker_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, ""
-    NL, "  enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &block_evt, "
-    NL, "  ^{"
-    NL, "     if(res[tid] == BLOCK_SUBMITTED) res[tid] = CHECK_SUCCESS;"
-    NL, "   });"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] != BLOCK_SUBMITTED)  { res[tid] = -2; return; }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(block_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, "  release_event(user_evt);"
-    NL, "}"
-    NL
-};
+    void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp1, __local float4* tmp2)
+    {
+      for (int i = 0; i < LOCAL_MEM_SIZE; i++)
+      {
+        tmp1[i]   = mul * 7 - 21;
+        tmp2[i].x = (float)(mul * 7 - 21);
+        tmp2[i].y = (float)(mul * 7 - 21);
+        tmp2[i].z = (float)(mul * 7 - 21);
+        tmp2[i].w = (float)(mul * 7 - 21);
+
+        res[tid] += tmp1[i];
+        res[tid] += (int)(tmp2[i].x+tmp2[i].y+tmp2[i].z+tmp2[i].w);
+      }
+      res[tid] += 2;
+    }
 
-static const char* enqueue_marker_with_mixed_events[] =
-{
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_marker_with_mixed_events(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  clk_event_t mix_ev[2];"
-    NL, "  mix_ev[0] = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, ""
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1],"
-    NL, "  ^{"
-    NL, "     res[tid] = BLOCK_COMPLETED;"
-    NL, "   });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -2; return; }"
-    NL, ""
-    NL, "  clk_event_t marker_evt;"
-    NL, ""
-    NL, "  enq_res = enqueue_marker(def_q, 2, mix_ev, &marker_evt);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "  retain_event(marker_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(mix_ev[0], CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(mix_ev[1]);"
-    NL, "  release_event(marker_evt);"
-    NL, "  release_event(mix_ev[0]);"
-    NL, "}"
-    NL
-};
+    kernel void enqueue_block_with_local_arg2(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
 
-static const char* enqueue_block_with_mixed_events[] =
-{
-    NL, "kernel void enqueue_block_with_mixed_events(__global int* res)"
-    NL, "{"
-    NL, "  int enq_res;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  clk_event_t mix_ev[3];"
-    NL, "  mix_ev[0] = create_user_event();"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  res[tid] = -2;"
-    NL, ""
-    NL, "  enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1], ^{ res[tid]++; });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  enq_res = enqueue_marker(def_q, 1, &mix_ev[1], &mix_ev[2]);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "  enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, sizeof(mix_ev)/sizeof(mix_ev[0]), mix_ev, NULL, ^{ res[tid]++; });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -4; return; }"
-    NL, ""
-    NL, "  set_user_event_status(mix_ev[0], CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(mix_ev[0]);"
-    NL, "  release_event(mix_ev[1]);"
-    NL, "  release_event(mix_ev[2]);"
-    NL, "}"
-    NL
-};
+      void (^kernelBlock)(__local void*, __local void*) = ^(__local void* buf1, __local void* buf2)
+        { block_fn_local_arg1(tid, multiplier, res, (local int*)buf1, (local float4*)buf2); };
+
+      res[tid] = -2;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)), (uint)(LOCAL_MEM_SIZE*sizeof(float4)));
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+
+static const char* enqueue_block_with_wait_list[] = { R"(
+    #define BLOCK_SUBMITTED 1
+    #define BLOCK_COMPLETED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_block_with_wait_list(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      clk_event_t block_evt;
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt,
+      ^{
+          res[tid] = BLOCK_COMPLETED;
+       });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      retain_event(block_evt);
+      release_event(block_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(user_evt);
+      release_event(block_evt);
+    }
+)" };
+
+static const char* enqueue_block_with_wait_list_and_local_arg[] = { R"(
+    #define LOCAL_MEM_SIZE 10
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define BLOCK_STARTED   3
+    #define CHECK_SUCCESS   0
+
+    void block_fn_local_arg(size_t tid, int mul, __global int* res, __local int* tmp)
+    {
+      res[tid] = BLOCK_STARTED;
+      for (int i = 0; i < LOCAL_MEM_SIZE; i++)
+      {
+        tmp[i] = mul * 7 - 21;
+        res[tid] += tmp[i];
+      }
+      if (res[tid] == BLOCK_STARTED) res[tid] = BLOCK_COMPLETED;
+    }
+
+    kernel void enqueue_block_with_wait_list_and_local_arg(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      clk_event_t block_evt;
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt,
+        ^(__local void* buf) {
+           block_fn_local_arg(tid, multiplier, res, (__local int*)buf);
+         }, LOCAL_MEM_SIZE*sizeof(int));
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      retain_event(block_evt);
+      release_event(block_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(user_evt);
+      release_event(block_evt);
+    }
+)" };
+
+static const char* enqueue_block_get_kernel_work_group_size[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      res[tid] = mul * 7 - 21;
+    }
+
+    kernel void enqueue_block_get_kernel_work_group_size(__global int* res)
+    {
+        int multiplier = 3;
+        size_t tid = get_global_id(0);
+
+        void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
+
+        size_t local_work_size = get_kernel_work_group_size(kernelBlock);
+        if (local_work_size <= 0){ res[tid] = -1; return; }
+        size_t global_work_size = local_work_size * 4;
+
+        res[tid] = -1;
+        queue_t q1 = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);
+
+        int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+
+static const char* enqueue_block_get_kernel_preferred_work_group_size_multiple[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      res[tid] = mul * 7 - 21;
+    }
+
+    kernel void enqueue_block_get_kernel_preferred_work_group_size_multiple(__global int* res)
+    {
+        int multiplier = 3;
+        size_t tid = get_global_id(0);
+
+        void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
+
+        size_t local_work_size = get_kernel_preferred_work_group_size_multiple(kernelBlock);
+        if (local_work_size <= 0){ res[tid] = -1; return; }
+        size_t global_work_size = local_work_size * 4;
+
+        res[tid] = -1;
+        queue_t q1 = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);
+
+        int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+
+static const char* enqueue_block_capture_event_profiling_info_after_execution[] = {
+    "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS) "\n"
+    , R"(
+    __global ulong value[MAX_GWS*2] = {0};
+
+    void block_fn(size_t tid, __global int* res)
+    {
+        res[tid] = -2;
+    }
+
+    void check_res(size_t tid, const clk_event_t evt, __global int* res)
+    {
+        capture_event_profiling_info (evt, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);
+
+        if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;
+        else                                        res[tid] = -4;
+        release_event(evt);
+    }
+
+    kernel void enqueue_block_capture_event_profiling_info_after_execution(__global int* res)
+    {
+        size_t tid = get_global_id(0);
+
+        res[tid] = -1;
+        queue_t def_q = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(1);
+        clk_event_t block_evt1;
+
+        void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };
+
+        int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 0, NULL, &block_evt1, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+        void (^checkBlock) (void)  = ^{ check_res(tid, block_evt1, res);      };
+
+        enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, NULL, checkBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+    }
+)" };
+
+static const char* enqueue_block_capture_event_profiling_info_before_execution[] = {
+    "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS) "\n"
+    , R"(
+    __global ulong value[MAX_GWS*2] = {0};
+
+    void block_fn(size_t tid, __global int* res)
+    {
+        res[tid] = -2;
+    }
+
+    void check_res(size_t tid, const ulong *value, __global int* res)
+    {
+        if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;
+        else                                        res[tid] = -4;
+    }
+
+    kernel void enqueue_block_capture_event_profiling_info_before_execution(__global int* res)
+    {
+        int multiplier = 3;
+        size_t tid = get_global_id(0);
+        clk_event_t user_evt = create_user_event();
+
+        res[tid] = -1;
+        queue_t def_q = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(1);
+        clk_event_t block_evt1;
+        clk_event_t block_evt2;
+
+        void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };
+
+        int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+        capture_event_profiling_info (block_evt1, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);
+
+        set_user_event_status(user_evt, CL_COMPLETE);
+
+        void (^checkBlock) (void)  = ^{ check_res(tid, &value, res);      };
+
+        enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, &block_evt2, checkBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+        release_event(user_evt);
+        release_event(block_evt1);
+        release_event(block_evt2);
+    }
+)" };
+
+static const char* enqueue_block_with_barrier[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      if (mul > 0) barrier(CLK_GLOBAL_MEM_FENCE);
+      res[tid] = mul * 7 -21;
+    }
+
+    void loop_fn(size_t tid, int n, __global int* res)
+    {
+      while (n > 0)
+      {
+        barrier(CLK_GLOBAL_MEM_FENCE);
+        res[tid] = 0;
+        --n;
+      }
+    }
+
+    kernel void enqueue_block_with_barrier(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
+      queue_t def_q = get_default_queue();
+      res[tid] = -1;
+      size_t n = 256;
+
+      void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
+
+      ndrange_t ndrange = ndrange_1D(n);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      void (^loopBlock)(void) = ^{ loop_fn(tid, n, res); };
+
+      enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, loopBlock);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+
+static const char* enqueue_marker_with_block_event[] = { R"(
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_marker_with_block_event(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+
+      clk_event_t block_evt1;
+      clk_event_t marker_evt;
+
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1,
+      ^{
+         res[tid] = BLOCK_COMPLETED;
+       });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -2; return; }
+
+      enq_res = enqueue_marker(def_q, 1, &block_evt1, &marker_evt);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+      retain_event(marker_evt);
+      release_event(marker_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(block_evt1);
+      release_event(marker_evt);
+      release_event(user_evt);
+    }
+)" };
+
+static const char* enqueue_marker_with_user_event[] = { R"(
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_marker_with_user_event(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+      uint multiplier = 7;
+
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+
+      clk_event_t marker_evt;
+      clk_event_t block_evt;
+
+      int enq_res = enqueue_marker(def_q, 1, &user_evt, &marker_evt);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      retain_event(marker_evt);
+      release_event(marker_evt);
+
+      enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &block_evt,
+      ^{
+         if (res[tid] == BLOCK_SUBMITTED) res[tid] = CHECK_SUCCESS;
+       });
+
+      //check block is not started
+      if (res[tid] != BLOCK_SUBMITTED)  { res[tid] = -2; return; }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(block_evt);
+      release_event(marker_evt);
+      release_event(user_evt);
+    }
+)" };
+
+static const char* enqueue_marker_with_mixed_events[] = { R"(
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_marker_with_mixed_events(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+
+      clk_event_t mix_ev[2];
+      mix_ev[0] = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1],
+      ^{
+         res[tid] = BLOCK_COMPLETED;
+       });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -2; return; }
+
+      clk_event_t marker_evt;
+
+      enq_res = enqueue_marker(def_q, 2, mix_ev, &marker_evt);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+      retain_event(marker_evt);
+      release_event(marker_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(mix_ev[0], CL_COMPLETE);
+
+      release_event(mix_ev[1]);
+      release_event(marker_evt);
+      release_event(mix_ev[0]);
+    }
+)" };
+
+static const char* enqueue_block_with_mixed_events[] = { R"(
+    kernel void enqueue_block_with_mixed_events(__global int* res)
+    {
+      int enq_res;
+      size_t tid = get_global_id(0);
+      clk_event_t mix_ev[3];
+      mix_ev[0] = create_user_event();
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      res[tid] = -2;
+
+      enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1], ^{ res[tid]++; });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      enq_res = enqueue_marker(def_q, 1, &mix_ev[1], &mix_ev[2]);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+      enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, sizeof(mix_ev)/sizeof(mix_ev[0]), mix_ev, NULL, ^{ res[tid]++; });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -4; return; }
+
+      set_user_event_status(mix_ev[0], CL_COMPLETE);
+
+      release_event(mix_ev[0]);
+      release_event(mix_ev[1]);
+      release_event(mix_ev[2]);
+    }
+)" };
+// clang-format on
 
 static const kernel_src sources_enqueue_block[] =
 {

From 845ec694bbc333a563de33e5cce8e541a7b8b910 Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Thu, 29 Jun 2023 12:20:15 +0100
Subject: [PATCH 18/20] workgroups: fix -Wsign-compare warnings (#1774)

In preparation for re-enabling -Wsign-compare globally, fix some
instances of this warning.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 test_conformance/workgroups/test_wg_all.cpp           |  3 +--
 test_conformance/workgroups/test_wg_any.cpp           |  3 +--
 test_conformance/workgroups/test_wg_broadcast.cpp     | 11 ++++-------
 .../workgroups/test_wg_suggested_local_work_size.cpp  |  4 ++--
 4 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/test_conformance/workgroups/test_wg_all.cpp b/test_conformance/workgroups/test_wg_all.cpp
index 41abd1249f..f9b574e454 100644
--- a/test_conformance/workgroups/test_wg_all.cpp
+++ b/test_conformance/workgroups/test_wg_all.cpp
@@ -75,7 +75,6 @@ test_work_group_all(cl_device_id device, cl_context context, cl_command_queue qu
     size_t       wg_size[1];
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -110,7 +109,7 @@ test_work_group_all(cl_device_id device, cl_context context, cl_command_queue qu
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<(num_elements+1); i++)
+    for (size_t i = 0; i < (num_elements + 1); i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
diff --git a/test_conformance/workgroups/test_wg_any.cpp b/test_conformance/workgroups/test_wg_any.cpp
index e0242cfb44..f7ff899a33 100644
--- a/test_conformance/workgroups/test_wg_any.cpp
+++ b/test_conformance/workgroups/test_wg_any.cpp
@@ -75,7 +75,6 @@ test_work_group_any(cl_device_id device, cl_context context, cl_command_queue qu
     size_t       wg_size[1];
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -110,7 +109,7 @@ test_work_group_any(cl_device_id device, cl_context context, cl_command_queue qu
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<(num_elements+1); i++)
+    for (size_t i = 0; i < (num_elements + 1); i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
diff --git a/test_conformance/workgroups/test_wg_broadcast.cpp b/test_conformance/workgroups/test_wg_broadcast.cpp
index e24ac7b986..a4cb0c6fe2 100644
--- a/test_conformance/workgroups/test_wg_broadcast.cpp
+++ b/test_conformance/workgroups/test_wg_broadcast.cpp
@@ -70,7 +70,7 @@ verify_wg_broadcast_1D(float *inptr, float *outptr, size_t n, size_t wg_size)
 
     for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
     {
-        int local_size = (n-i) > wg_size ? wg_size : (n-i);
+        size_t local_size = (n - i) > wg_size ? wg_size : (n - i);
         float broadcast_result = inptr[i + (group_id % local_size)];
         for (j=0; j<local_size; j++)
         {
@@ -172,7 +172,6 @@ test_work_group_broadcast_1D(cl_device_id device, cl_context context, cl_command
     size_t       wg_size[1];
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -207,7 +206,7 @@ test_work_group_broadcast_1D(cl_device_id device, cl_context context, cl_command
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
@@ -278,7 +277,6 @@ test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command
     size_t       num_workgroups;
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -333,7 +331,7 @@ test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
@@ -402,7 +400,6 @@ test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command
     size_t       num_workgroups;
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -458,7 +455,7 @@ test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
diff --git a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
index 648e68ce1d..989f1dfd9f 100644
--- a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
+++ b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
@@ -219,7 +219,7 @@ int do_test(cl_device_id device, cl_context context, cl_command_queue queue,
 int do_test_work_group_suggested_local_size(
     cl_device_id device, cl_context context, cl_command_queue queue,
     bool (*skip_cond)(size_t), size_t start, size_t end, size_t incr,
-    cl_long max_local_mem_size, size_t global_work_offset[], num_dims dim)
+    cl_ulong max_local_mem_size, size_t global_work_offset[], num_dims dim)
 {
     clProgramWrapper scan_program;
     clKernelWrapper scan_kernel;
@@ -300,7 +300,7 @@ int test_work_group_suggested_local_size_1D(cl_device_id device,
                  "Skipping the test.\n");
         return TEST_SKIPPED_ITSELF;
     }
-    cl_long max_local_mem_size;
+    cl_ulong max_local_mem_size;
     cl_int err =
         clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
                         sizeof(max_local_mem_size), &max_local_mem_size, NULL);

From 9b0f78549aed3086cd564e52b2c08817c0df673d Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Fri, 30 Jun 2023 11:22:43 +0100
Subject: [PATCH 19/20] workgroups: fix program/kernel object leak (#1775)

`create_single_kernel_helper` is called in a loop, overwriting the
program and kernel objects held by the wrappers.  The wrappers are not
aware of this, as the overwriting happens through their `operator&`.

Move the wrapper objects into the loop, so that the contained objects
get released as soon as the program and kernel objects are no longer
needed.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 .../workgroups/test_wg_suggested_local_work_size.cpp          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
index 989f1dfd9f..a31fca63f8 100644
--- a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
+++ b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
@@ -221,8 +221,6 @@ int do_test_work_group_suggested_local_size(
     bool (*skip_cond)(size_t), size_t start, size_t end, size_t incr,
     cl_ulong max_local_mem_size, size_t global_work_offset[], num_dims dim)
 {
-    clProgramWrapper scan_program;
-    clKernelWrapper scan_kernel;
     int err;
     size_t test_values[] = { 1, 1, 1 };
     std::string kernel_names[6] = {
@@ -244,6 +242,8 @@ int do_test_work_group_suggested_local_size(
     for (int kernel_num = 0; kernel_num < 6; kernel_num++)
     {
         if (max_local_mem_size < local_mem_size[kernel_num]) continue;
+        clProgramWrapper scan_program;
+        clKernelWrapper scan_kernel;
         // Create the kernel
         err = create_single_kernel_helper(
             context, &scan_program, &scan_kernel, 1,

From 9e8430a6a69b4c2f2c714137a68e460ae8f14515 Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Mon, 3 Jul 2023 10:07:32 +0100
Subject: [PATCH 20/20] [NFC] clang-format basic/test_enqueue_map.cpp (#1777)

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 test_conformance/basic/test_enqueue_map.cpp | 308 +++++++++++---------
 1 file changed, 168 insertions(+), 140 deletions(-)

diff --git a/test_conformance/basic/test_enqueue_map.cpp b/test_conformance/basic/test_enqueue_map.cpp
index d28f7e41a3..6b650c0d82 100644
--- a/test_conformance/basic/test_enqueue_map.cpp
+++ b/test_conformance/basic/test_enqueue_map.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -26,6 +26,7 @@
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 
+// clang-format off
 const cl_mem_flags flag_set[] = {
   CL_MEM_ALLOC_HOST_PTR,
   CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
@@ -33,93 +34,104 @@ const cl_mem_flags flag_set[] = {
   CL_MEM_COPY_HOST_PTR,
   0
 };
-const char* flag_set_names[] = {
+
+const char *flag_set_names[] = {
   "CL_MEM_ALLOC_HOST_PTR",
   "CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR",
   "CL_MEM_USE_HOST_PTR",
   "CL_MEM_COPY_HOST_PTR",
   "0"
 };
+// clang-format on
 
-int test_enqueue_map_buffer(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_enqueue_map_buffer(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements)
 {
     int error;
-    const size_t bufferSize = 256*256;
-    MTdataHolder d{gRandomSeed};
+    const size_t bufferSize = 256 * 256;
+    MTdataHolder d{ gRandomSeed };
     BufferOwningPtr<cl_char> hostPtrData{ malloc(bufferSize) };
     BufferOwningPtr<cl_char> referenceData{ malloc(bufferSize) };
-    BufferOwningPtr<cl_char> finalData{malloc(bufferSize)};
+    BufferOwningPtr<cl_char> finalData{ malloc(bufferSize) };
 
-    for (int src_flag_id=0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++)
+    for (int src_flag_id = 0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++)
     {
         clMemWrapper memObject;
-        log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
+        log_info("Testing with cl_mem_flags src: %s\n",
+                 flag_set_names[src_flag_id]);
 
         generate_random_data(kChar, (unsigned int)bufferSize, d, hostPtrData);
         memcpy(referenceData, hostPtrData, bufferSize);
 
         void *hostPtr = nullptr;
         cl_mem_flags flags = flag_set[src_flag_id];
-        bool hasHostPtr = (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
+        bool hasHostPtr =
+            (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
         if (hasHostPtr) hostPtr = hostPtrData;
-        memObject = clCreateBuffer(context, flags,  bufferSize, hostPtr, &error);
-        test_error( error, "Unable to create testing buffer" );
+        memObject = clCreateBuffer(context, flags, bufferSize, hostPtr, &error);
+        test_error(error, "Unable to create testing buffer");
 
         if (!hasHostPtr)
         {
             error =
-            clEnqueueWriteBuffer(queue, memObject, CL_TRUE, 0, bufferSize,
-                                 hostPtrData, 0, NULL, NULL);
-            test_error( error, "clEnqueueWriteBuffer failed");
+                clEnqueueWriteBuffer(queue, memObject, CL_TRUE, 0, bufferSize,
+                                     hostPtrData, 0, NULL, NULL);
+            test_error(error, "clEnqueueWriteBuffer failed");
         }
 
-        for( int i = 0; i < 128; i++ )
+        for (int i = 0; i < 128; i++)
         {
 
-          size_t offset = (size_t)random_in_range( 0, (int)bufferSize - 1, d );
-          size_t length = (size_t)random_in_range( 1, (int)( bufferSize - offset ), d );
-
-          cl_char *mappedRegion = (cl_char *)clEnqueueMapBuffer( queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
-                                                                offset, length, 0, NULL, NULL, &error );
-          if( error != CL_SUCCESS )
-          {
-            print_error( error, "clEnqueueMapBuffer call failed" );
-            log_error( "\tOffset: %d  Length: %d\n", (int)offset, (int)length );
-            return -1;
-          }
-
-          // Write into the region
-          for( size_t j = 0; j < length; j++ )
-          {
-            cl_char spin = (cl_char)genrand_int32( d );
-
-            // Test read AND write in one swipe
-            cl_char value = mappedRegion[ j ];
-            value = spin - value;
-            mappedRegion[ j ] = value;
-
-            // Also update the initial data array
-            value = referenceData[offset + j];
-            value = spin - value;
-            referenceData[offset + j] = value;
-          }
-
-          // Unmap
-          error = clEnqueueUnmapMemObject( queue, memObject, mappedRegion, 0, NULL, NULL );
-          test_error( error, "Unable to unmap buffer" );
+            size_t offset = (size_t)random_in_range(0, (int)bufferSize - 1, d);
+            size_t length =
+                (size_t)random_in_range(1, (int)(bufferSize - offset), d);
+
+            cl_char *mappedRegion = (cl_char *)clEnqueueMapBuffer(
+                queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
+                length, 0, NULL, NULL, &error);
+            if (error != CL_SUCCESS)
+            {
+                print_error(error, "clEnqueueMapBuffer call failed");
+                log_error("\tOffset: %d  Length: %d\n", (int)offset,
+                          (int)length);
+                return -1;
+            }
+
+            // Write into the region
+            for (size_t j = 0; j < length; j++)
+            {
+                cl_char spin = (cl_char)genrand_int32(d);
+
+                // Test read AND write in one swipe
+                cl_char value = mappedRegion[j];
+                value = spin - value;
+                mappedRegion[j] = value;
+
+                // Also update the initial data array
+                value = referenceData[offset + j];
+                value = spin - value;
+                referenceData[offset + j] = value;
+            }
+
+            // Unmap
+            error = clEnqueueUnmapMemObject(queue, memObject, mappedRegion, 0,
+                                            NULL, NULL);
+            test_error(error, "Unable to unmap buffer");
         }
 
-        // Final validation: read actual values of buffer and compare against our reference
-        error = clEnqueueReadBuffer( queue, memObject, CL_TRUE, 0, bufferSize, finalData, 0, NULL, NULL );
-        test_error( error, "Unable to read results" );
+        // Final validation: read actual values of buffer and compare against
+        // our reference
+        error = clEnqueueReadBuffer(queue, memObject, CL_TRUE, 0, bufferSize,
+                                    finalData, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
 
-        for( size_t q = 0; q < bufferSize; q++ )
+        for (size_t q = 0; q < bufferSize; q++)
         {
             if (referenceData[q] != finalData[q])
             {
                 log_error(
-                "ERROR: Sample %d did not validate! Got %d, expected %d\n",
-                (int)q, (int)finalData[q], (int)referenceData[q]);
+                    "ERROR: Sample %d did not validate! Got %d, expected %d\n",
+                    (int)q, (int)finalData[q], (int)referenceData[q]);
                 return -1;
             }
         }
@@ -128,112 +140,128 @@ int test_enqueue_map_buffer(cl_device_id deviceID, cl_context context, cl_comman
     return 0;
 }
 
-int test_enqueue_map_image(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_enqueue_map_image(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements)
 {
     int error;
     cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT32 };
     const size_t imageSize = 256;
     const size_t imageDataSize = imageSize * imageSize * 4 * sizeof(cl_uint);
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
 
     BufferOwningPtr<cl_uint> hostPtrData{ malloc(imageDataSize) };
     BufferOwningPtr<cl_uint> referenceData{ malloc(imageDataSize) };
-    BufferOwningPtr<cl_uint> finalData{malloc(imageDataSize)};
-
-    MTdataHolder d{gRandomSeed};
-  for (int src_flag_id=0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++) {
-    clMemWrapper memObject;
-    log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
-
-    generate_random_data(kUInt, (unsigned int)(imageSize * imageSize * 4), d,
-                         hostPtrData);
-    memcpy(referenceData, hostPtrData, imageDataSize);
-
-    cl_mem_flags flags = flag_set[src_flag_id];
-    bool hasHostPtr = (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
-    void *hostPtr = nullptr;
-    if (hasHostPtr) hostPtr = hostPtrData;
-    memObject = create_image_2d(context, CL_MEM_READ_WRITE | flags, &format,
-                                imageSize, imageSize, 0, hostPtr, &error );
-    test_error( error, "Unable to create testing buffer" );
-
-    if (!hasHostPtr) {
-      size_t write_origin[3]={0,0,0}, write_region[3]={imageSize, imageSize, 1};
-      error =
-      clEnqueueWriteImage(queue, memObject, CL_TRUE, write_origin, write_region,
-                          0, 0, hostPtrData, 0, NULL, NULL);
-      test_error( error, "Unable to write to testing buffer" );
-    }
-
-    for( int i = 0; i < 128; i++ )
+    BufferOwningPtr<cl_uint> finalData{ malloc(imageDataSize) };
+
+    MTdataHolder d{ gRandomSeed };
+    for (int src_flag_id = 0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++)
     {
+        clMemWrapper memObject;
+        log_info("Testing with cl_mem_flags src: %s\n",
+                 flag_set_names[src_flag_id]);
+
+        generate_random_data(kUInt, (unsigned int)(imageSize * imageSize * 4),
+                             d, hostPtrData);
+        memcpy(referenceData, hostPtrData, imageDataSize);
+
+        cl_mem_flags flags = flag_set[src_flag_id];
+        bool hasHostPtr =
+            (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
+        void *hostPtr = nullptr;
+        if (hasHostPtr) hostPtr = hostPtrData;
+        memObject = create_image_2d(context, CL_MEM_READ_WRITE | flags, &format,
+                                    imageSize, imageSize, 0, hostPtr, &error);
+        test_error(error, "Unable to create testing buffer");
 
-      size_t offset[3], region[3];
-      size_t rowPitch;
-
-      offset[ 0 ] = (size_t)random_in_range( 0, (int)imageSize - 1, d );
-      region[ 0 ] = (size_t)random_in_range( 1, (int)( imageSize - offset[ 0 ] - 1), d );
-      offset[ 1 ] = (size_t)random_in_range( 0, (int)imageSize - 1, d );
-      region[ 1 ] = (size_t)random_in_range( 1, (int)( imageSize - offset[ 1 ] - 1), d );
-      offset[ 2 ] = 0;
-      region[ 2 ] = 1;
-      cl_uint *mappedRegion = (cl_uint *)clEnqueueMapImage( queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
-                                                           offset, region, &rowPitch, NULL, 0, NULL, NULL, &error );
-      if( error != CL_SUCCESS )
-      {
-        print_error( error, "clEnqueueMapImage call failed" );
-        log_error( "\tOffset: %d,%d  Region: %d,%d\n", (int)offset[0], (int)offset[1], (int)region[0], (int)region[1] );
-        return -1;
-      }
-
-      // Write into the region
-      cl_uint *mappedPtr = mappedRegion;
-      for( size_t y = 0; y < region[ 1 ]; y++ )
-      {
-        for( size_t x = 0; x < region[ 0 ] * 4; x++ )
+        if (!hasHostPtr)
         {
-          cl_int spin = (cl_int)random_in_range( 16, 1024, d );
-
-          cl_int value;
-          // Test read AND write in one swipe
-          value = mappedPtr[ ( y * rowPitch/sizeof(cl_uint) ) + x ];
-          value = spin - value;
-          mappedPtr[ ( y * rowPitch/sizeof(cl_uint) ) + x ] = value;
-
-          // Also update the initial data array
-          value =
-          referenceData[((offset[1] + y) * imageSize + offset[0]) * 4 + x];
-          value = spin - value;
-          referenceData[((offset[1] + y) * imageSize + offset[0]) * 4 + x] =
-          value;
+            size_t write_origin[3] = { 0, 0, 0 },
+                   write_region[3] = { imageSize, imageSize, 1 };
+            error = clEnqueueWriteImage(queue, memObject, CL_TRUE, write_origin,
+                                        write_region, 0, 0, hostPtrData, 0,
+                                        NULL, NULL);
+            test_error(error, "Unable to write to testing buffer");
         }
-      }
 
-      // Unmap
-      error = clEnqueueUnmapMemObject( queue, memObject, mappedRegion, 0, NULL, NULL );
-      test_error( error, "Unable to unmap buffer" );
-    }
+        for (int i = 0; i < 128; i++)
+        {
 
-    // Final validation: read actual values of buffer and compare against our reference
-    size_t finalOrigin[3] = { 0, 0, 0 }, finalRegion[3] = { imageSize, imageSize, 1 };
-    error = clEnqueueReadImage( queue, memObject, CL_TRUE, finalOrigin, finalRegion, 0, 0, finalData, 0, NULL, NULL );
-    test_error( error, "Unable to read results" );
+            size_t offset[3], region[3];
+            size_t rowPitch;
+
+            offset[0] = (size_t)random_in_range(0, (int)imageSize - 1, d);
+            region[0] =
+                (size_t)random_in_range(1, (int)(imageSize - offset[0] - 1), d);
+            offset[1] = (size_t)random_in_range(0, (int)imageSize - 1, d);
+            region[1] =
+                (size_t)random_in_range(1, (int)(imageSize - offset[1] - 1), d);
+            offset[2] = 0;
+            region[2] = 1;
+            cl_uint *mappedRegion = (cl_uint *)clEnqueueMapImage(
+                queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
+                region, &rowPitch, NULL, 0, NULL, NULL, &error);
+            if (error != CL_SUCCESS)
+            {
+                print_error(error, "clEnqueueMapImage call failed");
+                log_error("\tOffset: %d,%d  Region: %d,%d\n", (int)offset[0],
+                          (int)offset[1], (int)region[0], (int)region[1]);
+                return -1;
+            }
 
-    for( size_t q = 0; q < imageSize * imageSize * 4; q++ )
-    {
-        if (referenceData[q] != finalData[q])
+            // Write into the region
+            cl_uint *mappedPtr = mappedRegion;
+            for (size_t y = 0; y < region[1]; y++)
+            {
+                for (size_t x = 0; x < region[0] * 4; x++)
+                {
+                    cl_int spin = (cl_int)random_in_range(16, 1024, d);
+
+                    cl_int value;
+                    // Test read AND write in one swipe
+                    value = mappedPtr[(y * rowPitch / sizeof(cl_uint)) + x];
+                    value = spin - value;
+                    mappedPtr[(y * rowPitch / sizeof(cl_uint)) + x] = value;
+
+                    // Also update the initial data array
+                    value =
+                        referenceData[((offset[1] + y) * imageSize + offset[0])
+                                          * 4
+                                      + x];
+                    value = spin - value;
+                    referenceData[((offset[1] + y) * imageSize + offset[0]) * 4
+                                  + x] = value;
+                }
+            }
+
+            // Unmap
+            error = clEnqueueUnmapMemObject(queue, memObject, mappedRegion, 0,
+                                            NULL, NULL);
+            test_error(error, "Unable to unmap buffer");
+        }
+
+        // Final validation: read actual values of buffer and compare against
+        // our reference
+        size_t finalOrigin[3] = { 0, 0, 0 },
+               finalRegion[3] = { imageSize, imageSize, 1 };
+        error = clEnqueueReadImage(queue, memObject, CL_TRUE, finalOrigin,
+                                   finalRegion, 0, 0, finalData, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
+
+        for (size_t q = 0; q < imageSize * imageSize * 4; q++)
         {
-            log_error("ERROR: Sample %d (coord %d,%d) did not validate! Got "
-                      "%d, expected %d\n",
-                      (int)q, (int)((q / 4) % imageSize),
-                      (int)((q / 4) / imageSize), (int)finalData[q],
-                      (int)referenceData[q]);
-            return -1;
+            if (referenceData[q] != finalData[q])
+            {
+                log_error(
+                    "ERROR: Sample %d (coord %d,%d) did not validate! Got "
+                    "%d, expected %d\n",
+                    (int)q, (int)((q / 4) % imageSize),
+                    (int)((q / 4) / imageSize), (int)finalData[q],
+                    (int)referenceData[q]);
+                return -1;
+            }
         }
-    }
-  } // cl_mem_flags
+    } // cl_mem_flags
 
     return 0;
 }
-