diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
index 1ba63abdd..635e4a7eb 100644
--- a/.github/workflows/presubmit.yml
+++ b/.github/workflows/presubmit.yml
@@ -13,16 +13,16 @@ jobs:
       fail-fast: false
       matrix:
         mainmatrix: [true]
-        os: [ubuntu-20.04, macos-latest, windows-latest]
+        os: [ubuntu-22.04, macos-latest, windows-latest]
         include:
-          - os: ubuntu-20.04
+          - os: ubuntu-22.04
             mainmatrix: true
             gl: 1
             extra: " gl"
-          - os: ubuntu-20.04
+          - os: ubuntu-22.04
             mainmatrix: false
             arch: arm
-          - os: ubuntu-20.04
+          - os: ubuntu-22.04
             mainmatrix: false
             arch: aarch64
             debug: 1
@@ -55,10 +55,10 @@ jobs:
         run: ./presubmit.sh
   formatcheck:
     name: Check code format
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - name: Install packages
-        run: sudo apt install -y clang-format clang-format-9
+        run: sudo apt install -y clang-format clang-format-11
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
diff --git a/check-format.sh b/check-format.sh
index be8f9d785..b5dc0a72c 100755
--- a/check-format.sh
+++ b/check-format.sh
@@ -2,7 +2,7 @@
 
 # Arg used to specify non-'origin/main' comparison branch
 ORIGIN_BRANCH=${1:-"origin/main"}
-CLANG_BINARY=${2:-"`which clang-format-9`"}
+CLANG_BINARY=${2:-"`which clang-format-11`"}
 
 # Run git-clang-format to check for violations
 CLANG_FORMAT_OUTPUT=$(git-clang-format --diff $ORIGIN_BRANCH --extensions c,cpp,h,hpp --binary $CLANG_BINARY)
diff --git a/test_common/gl/helpers.cpp b/test_common/gl/helpers.cpp
index b9f95a94a..1fb85035e 100644
--- a/test_common/gl/helpers.cpp
+++ b/test_common/gl/helpers.cpp
@@ -1715,7 +1715,7 @@ void * CreateGLRenderbuffer( GLsizei width, GLsizei height,
         // Reverse and reorder to validate since in the
         // kernel the read_imagef() call always returns RGBA
         cl_uchar *p = (cl_uchar *)buffer;
-        for( size_t i = 0; i < (size_t)width * height; i++ )
+        for (GLsizei i = 0; i < width * height; i++)
         {
             cl_uchar uc0 = p[i * 4 + 0];
             cl_uchar uc1 = p[i * 4 + 1];
@@ -1733,7 +1733,7 @@ void * CreateGLRenderbuffer( GLsizei width, GLsizei height,
       // Reverse and reorder to validate since in the
       // kernel the read_imagef() call always returns RGBA
       cl_uchar *p = (cl_uchar *)buffer;
-      for( size_t i = 0; i < width * height; i++ )
+      for (GLsizei i = 0; i < width * height; i++)
       {
         cl_uchar uc0 = p[i * 4 + 0];
         cl_uchar uc1 = p[i * 4 + 1];
diff --git a/test_common/harness/mt19937.cpp b/test_common/harness/mt19937.cpp
index f5665deb2..2d503eb5f 100644
--- a/test_common/harness/mt19937.cpp
+++ b/test_common/harness/mt19937.cpp
@@ -51,6 +51,7 @@
 #include "harness/alloc.h"
 
 #ifdef __SSE2__
+#include <mutex>
 #include <emmintrin.h>
 #endif
 
@@ -107,7 +108,7 @@ cl_uint genrand_int32(MTdata d)
     /* mag01[x] = x * MATRIX_A  for x=0,1 */
     static const cl_uint mag01[2] = { 0x0UL, MATRIX_A };
 #ifdef __SSE2__
-    static volatile int init = 0;
+    static std::once_flag init_flag;
     static union {
         __m128i v;
         cl_uint s[4];
@@ -123,8 +124,7 @@ cl_uint genrand_int32(MTdata d)
         int kk;
 
 #ifdef __SSE2__
-        if (0 == init)
-        {
+        auto init_fn = []() {
             upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] =
                 upper_mask.s[3] = UPPER_MASK;
             lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] =
@@ -134,8 +134,8 @@ cl_uint genrand_int32(MTdata d)
                 MATRIX_A;
             c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint)0x9d2c5680UL;
             c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint)0xefc60000UL;
-            init = 1;
-        }
+        };
+        std::call_once(init_flag, init_fn);
 #endif
 
         kk = 0;
diff --git a/test_common/harness/stringHelpers.h b/test_common/harness/stringHelpers.h
index 3f6bf64db..e1275f103 100644
--- a/test_common/harness/stringHelpers.h
+++ b/test_common/harness/stringHelpers.h
@@ -14,10 +14,11 @@
 // limitations under the License.
 //
 
-#ifndef BASIC_UTILS_H
-#define BASIC_UTILS_H
+#ifndef STRING_HELPERS_H
+#define STRING_HELPERS_H
 
 #include <memory>
+#include <stdexcept>
 #include <string>
 
 inline std::string concat_kernel(const char *sstr[], int num)
@@ -38,4 +39,4 @@ inline std::string str_sprintf(const std::string &str, Args... args)
     return std::string(buffer.get(), buffer.get() + s - 1);
 }
 
-#endif // BASIC_UTIL_H
+#endif // STRING_HELPERS_H
diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index 95ea81631..3d743e717 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -835,9 +835,9 @@ void callTestFunctions(test_definition testList[],
         std::vector<std::thread *> threads;
         test_harness_state state = { testList, resultTestList, deviceToUse,
                                      config };
-        for (int i = 0; i < config.numWorkerThreads; i++)
+        for (unsigned i = 0; i < config.numWorkerThreads; i++)
         {
-            log_info("Spawning worker thread %i\n", i);
+            log_info("Spawning worker thread %u\n", i);
             threads.push_back(new std::thread(test_function_runner, &state));
         }
 
diff --git a/test_conformance/allocations/main.cpp b/test_conformance/allocations/main.cpp
index 43e81277e..827072fc7 100644
--- a/test_conformance/allocations/main.cpp
+++ b/test_conformance/allocations/main.cpp
@@ -326,6 +326,7 @@ int main(int argc, const char *argv[])
         else if ( strcmp( argv[i], "--help" ) == 0 || strcmp( argv[i], "-h" ) == 0 )
         {
             printUsage( argv[0] );
+            free(argList);
             return -1;
         }
 
diff --git a/test_conformance/api/test_queries.cpp b/test_conformance/api/test_queries.cpp
index fa5c227fa..92ae1d7b1 100644
--- a/test_conformance/api/test_queries.cpp
+++ b/test_conformance/api/test_queries.cpp
@@ -799,8 +799,8 @@ int test_kernel_required_group_size(cl_device_id deviceID, cl_context context, c
         test_error(error, "clFinish failed");
 
         if (max_dimensions == 2) {
-            return 0;
             free(source);
+            return 0;
         }
 
         local[1]--; local[2]++;
diff --git a/test_conformance/atomics/test_indexed_cases.cpp b/test_conformance/atomics/test_indexed_cases.cpp
index 7da2dfa79..ce0410bcf 100644
--- a/test_conformance/atomics/test_indexed_cases.cpp
+++ b/test_conformance/atomics/test_indexed_cases.cpp
@@ -13,6 +13,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
+#include <memory>
+
 #include "testBase.h"
 #include "harness/conversions.h"
 
@@ -226,13 +229,13 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
              (int)global_threads[0], (int)local_threads[0]);
 
     // Allocate our storage
-    cl_mem bin_counters =
+    clMemWrapper bin_counters =
         clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(cl_int) * number_of_bins, NULL, NULL);
-    cl_mem bins = clCreateBuffer(
+    clMemWrapper bins = clCreateBuffer(
         context, CL_MEM_READ_WRITE,
         sizeof(cl_int) * number_of_bins * max_counts_per_bin, NULL, NULL);
-    cl_mem bin_assignments =
+    clMemWrapper bin_assignments =
         clCreateBuffer(context, CL_MEM_READ_ONLY,
                        sizeof(cl_int) * number_of_items, NULL, NULL);
 
@@ -253,7 +256,7 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
     }
 
     // Initialize our storage
-    cl_int *l_bin_counts = (cl_int *)malloc(sizeof(cl_int) * number_of_bins);
+    std::unique_ptr<cl_int[]> l_bin_counts(new cl_int[number_of_bins]);
     if (!l_bin_counts)
     {
         log_error("add_index_bin_test FAILED to allocate initial values for "
@@ -263,8 +266,8 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
     int i;
     for (i = 0; i < number_of_bins; i++) l_bin_counts[i] = 0;
     err = clEnqueueWriteBuffer(queue, bin_counters, true, 0,
-                               sizeof(cl_int) * number_of_bins, l_bin_counts, 0,
-                               NULL, NULL);
+                               sizeof(cl_int) * number_of_bins,
+                               l_bin_counts.get(), 0, NULL, NULL);
     if (err)
     {
         log_error("add_index_bin_test FAILED to set initial values for "
@@ -273,8 +276,8 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
         return -1;
     }
 
-    cl_int *values =
-        (cl_int *)malloc(sizeof(cl_int) * number_of_bins * max_counts_per_bin);
+    std::unique_ptr<cl_int[]> values(
+        new cl_int[number_of_bins * max_counts_per_bin]);
     if (!values)
     {
         log_error(
@@ -285,7 +288,7 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
     err = clEnqueueWriteBuffer(queue, bins, true, 0,
                                sizeof(cl_int) * number_of_bins
                                    * max_counts_per_bin,
-                               values, 0, NULL, NULL);
+                               values.get(), 0, NULL, NULL);
     if (err)
     {
         log_error(
@@ -293,10 +296,8 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
             err);
         return -1;
     }
-    free(values);
 
-    cl_int *l_bin_assignments =
-        (cl_int *)malloc(sizeof(cl_int) * number_of_items);
+    std::unique_ptr<cl_int[]> l_bin_assignments(new cl_int[number_of_items]);
     if (!l_bin_assignments)
     {
         log_error("add_index_bin_test FAILED to allocate initial values for "
@@ -326,7 +327,7 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
     }
     err = clEnqueueWriteBuffer(queue, bin_assignments, true, 0,
                                sizeof(cl_int) * number_of_items,
-                               l_bin_assignments, 0, NULL, NULL);
+                               l_bin_assignments.get(), 0, NULL, NULL);
     if (err)
     {
         log_error("add_index_bin_test FAILED to set initial values for "
@@ -355,8 +356,8 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
         return -1;
     }
 
-    cl_int *final_bin_assignments =
-        (cl_int *)malloc(sizeof(cl_int) * number_of_bins * max_counts_per_bin);
+    std::unique_ptr<cl_int[]> final_bin_assignments(
+        new cl_int[number_of_bins * max_counts_per_bin]);
     if (!final_bin_assignments)
     {
         log_error("add_index_bin_test FAILED to allocate initial values for "
@@ -366,15 +367,14 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
     err = clEnqueueReadBuffer(queue, bins, true, 0,
                               sizeof(cl_int) * number_of_bins
                                   * max_counts_per_bin,
-                              final_bin_assignments, 0, NULL, NULL);
+                              final_bin_assignments.get(), 0, NULL, NULL);
     if (err)
     {
         log_error("add_index_bin_test FAILED to read back bins: %d\n", err);
         return -1;
     }
 
-    cl_int *final_bin_counts =
-        (cl_int *)malloc(sizeof(cl_int) * number_of_bins);
+    std::unique_ptr<cl_int[]> final_bin_counts(new cl_int[number_of_bins]);
     if (!final_bin_counts)
     {
         log_error("add_index_bin_test FAILED to allocate initial values for "
@@ -382,8 +382,8 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
         return -1;
     }
     err = clEnqueueReadBuffer(queue, bin_counters, true, 0,
-                              sizeof(cl_int) * number_of_bins, final_bin_counts,
-                              0, NULL, NULL);
+                              sizeof(cl_int) * number_of_bins,
+                              final_bin_counts.get(), 0, NULL, NULL);
     if (err)
     {
         log_error("add_index_bin_test FAILED to read back bin_counters: %d\n",
@@ -460,13 +460,7 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
             errors++;
         }
     }
-    free(l_bin_counts);
-    free(l_bin_assignments);
-    free(final_bin_assignments);
-    free(final_bin_counts);
-    clReleaseMemObject(bin_counters);
-    clReleaseMemObject(bins);
-    clReleaseMemObject(bin_assignments);
+
     if (errors == 0)
     {
         log_info("add_index_bin_test passed. Each item was put in the correct "
diff --git a/test_conformance/basic/CMakeLists.txt b/test_conformance/basic/CMakeLists.txt
index 47c1c980f..9dcf1d5a6 100644
--- a/test_conformance/basic/CMakeLists.txt
+++ b/test_conformance/basic/CMakeLists.txt
@@ -52,14 +52,12 @@ set(${MODULE_NAME}_SOURCES
     test_kernel_call_kernel_function.cpp
     test_local_kernel_scope.cpp
     test_progvar.cpp
-    test_wg_barrier.cpp
     test_global_linear_id.cpp
     test_local_linear_id.cpp
     test_enqueued_local_size.cpp
     test_simple_image_pitch.cpp
     test_get_linear_ids.cpp
     test_rw_image_access_qualifier.cpp
-    test_wg_barrier.cpp
     test_enqueued_local_size.cpp
     test_global_linear_id.cpp
     test_local_linear_id.cpp
diff --git a/test_conformance/basic/test_barrier.cpp b/test_conformance/basic/test_barrier.cpp
index d20af14a4..6352b42fa 100644
--- a/test_conformance/basic/test_barrier.cpp
+++ b/test_conformance/basic/test_barrier.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -21,143 +21,136 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+#include <numeric>
+#include <vector>
 
 #include "procs.h"
 
-const char *barrier_kernel_code =
-"__kernel void compute_sum(__global int *a, int n, __global int *tmp_sum, __global int *sum)\n"
-"{\n"
-"    int  tid = get_local_id(0);\n"
-"    int  lsize = get_local_size(0);\n"
-"    int  i;\n"
-"\n"
-"    tmp_sum[tid] = 0;\n"
-"    for (i=tid; i<n; i+=lsize)\n"
-"        tmp_sum[tid] += a[i];\n"
-"     \n"
-"     // updated to work for any workgroup size \n"
-"    for (i=hadd(lsize,1); lsize>1; i = hadd(i,1))\n"
-"    {\n"
-"        barrier(CLK_GLOBAL_MEM_FENCE);\n"
-"        if (tid + i < lsize)\n"
-"            tmp_sum[tid] += tmp_sum[tid + i];\n"
-"         lsize = i; \n"
-"    }\n"
-"\n"
-"     //no barrier is required here because last person to write to tmp_sum[0] was tid 0 \n"
-"    if (tid == 0)\n"
-"        *sum = tmp_sum[0];\n"
-"}\n";
-
-
-static int
-verify_sum(int *inptr, int *outptr, int n)
+namespace {
+const char *barrier_kernel_code = R"(
+__kernel void compute_sum(__global int *a, int n, __global int *tmp_sum,
+                          __global int *sum)
 {
-  int            r = 0;
-  int         i;
+    int tid = get_local_id(0);
+    int lsize = get_local_size(0);
+    int i;
 
-  for (i=0; i<n; i++)
-  {
-        r += inptr[i];
-  }
+    tmp_sum[tid] = 0;
+    for (i = tid; i < n; i += lsize) tmp_sum[tid] += a[i];
 
-    if (r != outptr[0])
+    // updated to work for any workgroup size
+    for (i = hadd(lsize, 1); lsize > 1; i = hadd(i, 1))
     {
-        log_error("BARRIER test failed\n");
-        return -1;
+        BARRIER(CLK_GLOBAL_MEM_FENCE);
+        if (tid + i < lsize) tmp_sum[tid] += tmp_sum[tid + i];
+        lsize = i;
     }
 
-  log_info("BARRIER test passed\n");
-  return 0;
+    // no barrier is required here because last person to write to tmp_sum[0]
+    // was tid 0
+    if (tid == 0) *sum = tmp_sum[0];
 }
+)";
 
 
-int
-test_barrier(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+void generate_random_inputs(std::vector<cl_int> &v)
 {
-    cl_mem            streams[3];
-    cl_int            *input_ptr = NULL, *output_ptr = NULL;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    global_threads[3];
-    size_t    local_threads[3];
-    int                err;
-    int                i;
-    size_t max_local_workgroup_size[3];
-    size_t max_threadgroup_size = 0;
-    MTdata d;
+    RandomSeed seed(gRandomSeed);
 
-    err = create_single_kernel_helper(context, &program, &kernel, 1, &barrier_kernel_code, "compute_sum" );
-    test_error(err, "Failed to build kernel/program.");
+    auto random_generator = [&seed]() {
+        return static_cast<cl_int>(
+            get_random_float(-0x01000000, 0x01000000, seed));
+    };
+
+    std::generate(v.begin(), v.end(), random_generator);
+}
+
+int test_barrier_common(cl_device_id device, cl_context context,
+                        cl_command_queue queue, int num_elements,
+                        const std::string &barrier_str)
+{
+    clMemWrapper streams[3];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
 
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
-                                 sizeof(max_threadgroup_size), &max_threadgroup_size, NULL);
-    test_error(err, "clGetKernelWorkgroupInfo failed.");
+    cl_int output;
+    int err;
 
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
-    test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+    size_t max_threadgroup_size = 0;
+    std::string build_options = std::string("-DBARRIER=") + barrier_str;
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &barrier_kernel_code, "compute_sum",
+                                      build_options.c_str());
+    test_error(err, "Failed to build kernel/program.");
 
-    // Pick the minimum of the device and the kernel
-    if (max_threadgroup_size > max_local_workgroup_size[0])
-        max_threadgroup_size = max_local_workgroup_size[0];
+    err = get_max_allowed_1d_work_group_size_on_device(device, kernel,
+                                                       &max_threadgroup_size);
+    test_error(err, "get_max_allowed_1d_work_group_size_on_device failed.");
 
     // work group size must divide evenly into the global size
-    while( num_elements % max_threadgroup_size )
-        max_threadgroup_size--;
+    while (num_elements % max_threadgroup_size) max_threadgroup_size--;
 
-    input_ptr = (int*)malloc(sizeof(int) * num_elements);
-    output_ptr = (int*)malloc(sizeof(int));
+    std::vector<cl_int> input(num_elements);
 
     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, &err);
+                                sizeof(cl_int) * num_elements, nullptr, &err);
     test_error(err, "clCreateBuffer failed.");
-    streams[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &err);
+    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int),
+                                nullptr, &err);
     test_error(err, "clCreateBuffer failed.");
     streams[2] =
         clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_int) * max_threadgroup_size, NULL, &err);
+                       sizeof(cl_int) * max_threadgroup_size, nullptr, &err);
     test_error(err, "clCreateBuffer failed.");
 
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        input_ptr[i] = (int)get_random_float(-0x01000000, 0x01000000, d);
-    free_mtdata(d);  d = NULL;
+    generate_random_inputs(input);
 
-    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)input_ptr, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
+                               sizeof(cl_int) * num_elements, input.data(), 0,
+                               nullptr, nullptr);
     test_error(err, "clEnqueueWriteBuffer failed.");
 
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof num_elements, &num_elements);
-    err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
-    err |= clSetKernelArg(kernel, 3, sizeof streams[1], &streams[1]);
+    err = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    err |= clSetKernelArg(kernel, 1, sizeof(num_elements), &num_elements);
+    err |= clSetKernelArg(kernel, 2, sizeof(streams[2]), &streams[2]);
+    err |= clSetKernelArg(kernel, 3, sizeof(streams[1]), &streams[1]);
     test_error(err, "clSetKernelArg failed.");
 
-    global_threads[0] = max_threadgroup_size;
-    local_threads[0] = max_threadgroup_size;
+    size_t global_threads[] = { max_threadgroup_size };
+    size_t local_threads[] = { max_threadgroup_size };
 
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, global_threads,
+                                 local_threads, 0, nullptr, nullptr);
     test_error(err, "clEnqueueNDRangeKernel failed.");
 
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int), (void *)output_ptr, 0, NULL, NULL );
+    err = clEnqueueReadBuffer(queue, streams[1], true, 0, sizeof(cl_int),
+                              &output, 0, nullptr, nullptr);
     test_error(err, "clEnqueueReadBuffer failed.");
 
-        err = verify_sum(input_ptr, output_ptr, num_elements);
-
-
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseMemObject(streams[2]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr);
-    free(output_ptr);
+    if (std::accumulate(input.begin(), input.end(), 0) != output)
+    {
+        log_error("%s test failed\n", barrier_str.c_str());
+        err = -1;
+    }
+    else
+    {
+        log_info("%s test passed\n", barrier_str.c_str());
+    }
 
     return err;
 }
+} // anonymous namespace
 
+int test_barrier(cl_device_id device, cl_context context,
+                 cl_command_queue queue, int num_elements)
+{
+    return test_barrier_common(device, context, queue, num_elements, "barrier");
+}
 
-
-
-
+int test_wg_barrier(cl_device_id device, cl_context context,
+                    cl_command_queue queue, int num_elements)
+{
+    return test_barrier_common(device, context, queue, num_elements,
+                               "work_group_barrier");
+}
diff --git a/test_conformance/basic/test_constant.cpp b/test_conformance/basic/test_constant.cpp
index ed25c6ef1..fc2667ee3 100644
--- a/test_conformance/basic/test_constant.cpp
+++ b/test_conformance/basic/test_constant.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -21,41 +21,44 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+#include <vector>
 
 #include "procs.h"
 
-const char *constant_kernel_code =
-"__kernel void constant_kernel(__global float *out, __constant float *tmpF, __constant int *tmpI)\n"
-"{\n"
-"    int  tid = get_global_id(0);\n"
-"\n"
-"    float ftmp = tmpF[tid]; \n"
-"    float Itmp = tmpI[tid]; \n"
-"    out[tid] = ftmp * Itmp; \n"
-"}\n";
-
-const char *loop_constant_kernel_code =
-"kernel void loop_constant_kernel(global float *out, constant float *i_pos, int num)\n"
-"{\n"
-"    int tid = get_global_id(0);\n"
-"    float sum = 0;\n"
-"    for (int i = 0; i < num; i++) {\n"
-"        float  pos  = i_pos[i*3];\n"
-"        sum += pos;\n"
-"    }\n"
-"    out[tid] = sum;\n"
-"}\n";
-
-
-static int
-verify(cl_float *tmpF, cl_int *tmpI, cl_float *out, int n)
+namespace {
+const char* constant_kernel_code = R"(
+__kernel void constant_kernel(__global float *out, __constant float *tmpF, __constant int *tmpI)
+{
+    int  tid = get_global_id(0);
+
+    float ftmp = tmpF[tid];
+    float Itmp = tmpI[tid];
+    out[tid] = ftmp * Itmp;
+}
+)";
+
+const char* loop_constant_kernel_code = R"(
+kernel void loop_constant_kernel(global float *out, constant float *i_pos, int num)
 {
-    int         i;
+    int tid = get_global_id(0);
+    float sum = 0;
+    for (int i = 0; i < num; i++) {
+        float  pos  = i_pos[i*3];
+        sum += pos;
+    }
+    out[tid] = sum;
+}
+)";
+
 
-    for (i=0; i < n; i++)
+int verify(std::vector<cl_float>& tmpF, std::vector<cl_int>& tmpI,
+           std::vector<cl_float>& out)
+{
+    for (size_t i = 0; i < out.size(); i++)
     {
         float f = tmpF[i] * tmpI[i];
-        if( out[i] != f )
+        if (out[i] != f)
         {
             log_error("CONSTANT test failed\n");
             return -1;
@@ -66,214 +69,172 @@ verify(cl_float *tmpF, cl_int *tmpI, cl_float *out, int n)
     return 0;
 }
 
-
-static int
-verify_loop_constant(const cl_float *tmp, cl_float *out, cl_int l, int n)
+int verify_loop_constant(const std::vector<cl_float>& tmp,
+                         std::vector<cl_float>& out, cl_int l)
 {
-    int i;
-    cl_int j;
-    for (i=0; i < n; i++)
-    {
-        float sum = 0;
-        for (j=0; j < l; ++j)
-            sum += tmp[j*3];
+    float sum = 0;
+    for (int j = 0; j < l; ++j) sum += tmp[j * 3];
 
-        if( out[i] != sum )
-        {
-            log_error("loop CONSTANT test failed\n");
-            return -1;
-        }
+    auto predicate = [&sum](cl_float elem) { return sum != elem; };
+
+    if (std::any_of(out.cbegin(), out.cend(), predicate))
+    {
+        log_error("loop CONSTANT test failed\n");
+        return -1;
     }
 
     log_info("loop CONSTANT test passed\n");
     return 0;
 }
 
-int
-test_constant(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+template <typename T> void generate_random_inputs(std::vector<T>& v)
+{
+    RandomSeed seed(gRandomSeed);
+
+    auto random_generator = [&seed]() {
+        return static_cast<T>(get_random_float(-0x02000000, 0x02000000, seed));
+    };
+
+    std::generate(v.begin(), v.end(), random_generator);
+}
+} // anonymous namespace
+
+int test_constant(cl_device_id device, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    cl_mem            streams[3];
-    cl_int            *tmpI;
-    cl_float        *tmpF, *out;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    global_threads[3];
-    int                err;
-    unsigned int                i;
+    clMemWrapper streams[3];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+
+    size_t global_threads[3];
+    int err;
     cl_ulong maxSize, maxGlobalSize, maxAllocSize;
     size_t num_floats, num_ints, constant_values;
-    MTdata          d;
-    RoundingMode     oldRoundMode;
+    RoundingMode oldRoundMode;
     int isRTZ = 0;
 
-  /* Verify our test buffer won't be bigger than allowed */
-    err = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 );
-    test_error( err, "Unable to get max constant buffer size" );
-
-  log_info("Device reports CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE %llu bytes.\n", maxSize);
-  
-  // Limit test buffer size to 1/4 of CL_DEVICE_GLOBAL_MEM_SIZE
-  err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(maxGlobalSize), &maxGlobalSize, 0);
-  test_error(err, "Unable to get CL_DEVICE_GLOBAL_MEM_SIZE");
-
-  if (maxSize > maxGlobalSize / 4)
-    maxSize = maxGlobalSize / 4;
-
-  err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(maxAllocSize), &maxAllocSize, 0);
-  test_error(err, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE ");
-
-  if (maxSize > maxAllocSize)
-    maxSize = maxAllocSize;
-  
-  maxSize/=4;
-  num_ints = (size_t)maxSize/sizeof(cl_int);
-  num_floats = (size_t)maxSize/sizeof(cl_float);
-  if (num_ints >= num_floats) {
-    constant_values = num_floats;
-  } else {
-    constant_values = num_ints;
-  }
-
-  log_info("Test will attempt to use %lu bytes with one %lu byte constant int buffer and one %lu byte constant float buffer.\n",
-           constant_values*sizeof(cl_int) + constant_values*sizeof(cl_float), constant_values*sizeof(cl_int), constant_values*sizeof(cl_float));
-
-    tmpI = (cl_int*)malloc(sizeof(cl_int) * constant_values);
-    tmpF = (cl_float*)malloc(sizeof(cl_float) * constant_values);
-    out  = (cl_float*)malloc(sizeof(cl_float) * constant_values);
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_float) * constant_values, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_float) * constant_values, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * constant_values, NULL, NULL);
-    if (!streams[2])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
+    /* Verify our test buffer won't be bigger than allowed */
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
+                          sizeof(maxSize), &maxSize, 0);
+    test_error(err, "Unable to get max constant buffer size");
+    log_info("Device reports CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE %llu bytes.\n",
+             maxSize);
 
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<constant_values; i++) {
-        tmpI[i] = (int)get_random_float(-0x02000000, 0x02000000, d);
-        tmpF[i] = get_random_float(-0x02000000, 0x02000000, d);
-    }
-    free_mtdata(d); d = NULL;
+    // Limit test buffer size to 1/4 of CL_DEVICE_GLOBAL_MEM_SIZE
+    err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE,
+                          sizeof(maxGlobalSize), &maxGlobalSize, 0);
+    test_error(err, "Unable to get CL_DEVICE_GLOBAL_MEM_SIZE");
 
-    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, sizeof(cl_float)*constant_values, (void *)tmpF, 0, NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
-  err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0, sizeof(cl_int)*constant_values, (void *)tmpI, 0, NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteArray failed\n");
-        return -1;
-    }
+    maxSize = std::min(maxSize, maxGlobalSize / 4);
+
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                          sizeof(maxAllocSize), &maxAllocSize, 0);
+    test_error(err, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE");
+
+    maxSize = std::min(maxSize, maxAllocSize);
+
+    maxSize /= 4;
+    num_ints = static_cast<size_t>(maxSize / sizeof(cl_int));
+    num_floats = static_cast<size_t>(maxSize / sizeof(cl_float));
+    constant_values = std::min(num_floats, num_ints);
+
+
+    log_info(
+        "Test will attempt to use %zu bytes with one %zu byte constant int "
+        "buffer and one %zu byte constant float buffer.\n",
+        constant_values * sizeof(cl_int) + constant_values * sizeof(cl_float),
+        constant_values * sizeof(cl_int), constant_values * sizeof(cl_float));
+
+    std::vector<cl_int> tmpI(constant_values);
+    std::vector<cl_float> tmpF(constant_values);
+    std::vector<cl_float> out(constant_values);
+
+
+    streams[0] =
+        clCreateBuffer(context, CL_MEM_READ_WRITE,
+                       sizeof(cl_float) * constant_values, nullptr, &err);
+    test_error(err, "clCreateBuffer failed");
 
-  err = create_single_kernel_helper(context, &program, &kernel, 1, &constant_kernel_code, "constant_kernel" );
-    if (err) {
-    log_error("Failed to create kernel and program: %d\n", err);
-    return -1;
-  }
+    streams[1] =
+        clCreateBuffer(context, CL_MEM_READ_WRITE,
+                       sizeof(cl_float) * constant_values, nullptr, &err);
+    test_error(err, "clCreateBuffer failed");
+
+    streams[2] =
+        clCreateBuffer(context, CL_MEM_READ_WRITE,
+                       sizeof(cl_int) * constant_values, nullptr, &err);
+    test_error(err, "clCreateBuffer failed");
+
+    generate_random_inputs(tmpI);
+    generate_random_inputs(tmpF);
+
+    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0,
+                               sizeof(cl_float) * constant_values, tmpF.data(),
+                               0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer failed");
+    err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0,
+                               sizeof(cl_int) * constant_values, tmpI.data(), 0,
+                               nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer failed");
+
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &constant_kernel_code, "constant_kernel");
+    test_error(err, "Failed to create kernel and program");
 
 
     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
     err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
-    }
+    test_error(err, "clSetKernelArgs failed");
 
     global_threads[0] = constant_values;
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, NULL, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueNDRangeKernel failed: %d\n", err);
-        return -1;
-    }
-    err = clEnqueueReadBuffer( queue, streams[0], CL_TRUE, 0, sizeof(cl_float)*constant_values, (void *)out, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, global_threads,
+                                 nullptr, 0, nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed");
+
+    err = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
+                              sizeof(cl_float) * constant_values, out.data(), 0,
+                              nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed");
 
-    //If we only support rtz mode
-    if( CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device) && gIsEmbedded)
+    // If we only support rtz mode
+    if (CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device) && gIsEmbedded)
     {
         oldRoundMode = set_round(kRoundTowardZero, kfloat);
         isRTZ = 1;
     }
 
-    err = verify(tmpF, tmpI, out, (int)constant_values);
+    err = verify(tmpF, tmpI, out);
 
-    if (isRTZ)
-        (void)set_round(oldRoundMode, kfloat);
+    if (isRTZ) (void)set_round(oldRoundMode, kfloat);
 
     // Loop constant buffer test
-    cl_program loop_program;
-    cl_kernel  loop_kernel;
+    clProgramWrapper loop_program;
+    clKernelWrapper loop_kernel;
     cl_int limit = 2;
 
-    memset(out, 0, sizeof(cl_float) * constant_values);
+    memset(out.data(), 0, sizeof(cl_float) * constant_values);
     err = create_single_kernel_helper(context, &loop_program, &loop_kernel, 1,
-                                      &loop_constant_kernel_code, "loop_constant_kernel" );
-    if (err) {
-        log_error("Failed to create loop kernel and program: %d\n", err);
-        return -1;
-    }
+                                      &loop_constant_kernel_code,
+                                      "loop_constant_kernel");
+    test_error(err, "Failed to create kernel and program");
 
     err = clSetKernelArg(loop_kernel, 0, sizeof streams[0], &streams[0]);
     err |= clSetKernelArg(loop_kernel, 1, sizeof streams[1], &streams[1]);
     err |= clSetKernelArg(loop_kernel, 2, sizeof(limit), &limit);
-    if (err != CL_SUCCESS) {
-        log_error("clSetKernelArgs for loop kernel failed\n");
-        return -1;
-    }
+    test_error(err, "clSetKernelArgs failed");
 
-    err = clEnqueueNDRangeKernel( queue, loop_kernel, 1, NULL, global_threads, NULL, 0, NULL, NULL );
-    if (err != CL_SUCCESS) {
-        log_error("clEnqueueNDRangeKernel failed: %d\n", err);
-        return -1;
-    }
-    err = clEnqueueReadBuffer( queue, streams[0], CL_TRUE, 0, sizeof(cl_float)*constant_values, (void *)out, 0, NULL, NULL );
-    if (err != CL_SUCCESS) {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
-    }
+    err = clEnqueueNDRangeKernel(queue, loop_kernel, 1, nullptr, global_threads,
+                                 nullptr, 0, nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed");
 
-    err = verify_loop_constant(tmpF, out, limit, (int)constant_values);
+    err = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
+                              sizeof(cl_float) * constant_values, out.data(), 0,
+                              nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed");
+
+    err = verify_loop_constant(tmpF, out, limit);
 
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseMemObject(streams[2]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    clReleaseKernel(loop_kernel);
-    clReleaseProgram(loop_program);
-    free(tmpI);
-    free(tmpF);
-    free(out);
 
     return err;
 }
-
-
-
-
-
diff --git a/test_conformance/basic/test_enqueue_map.cpp b/test_conformance/basic/test_enqueue_map.cpp
index d28f7e41a..6b650c0d8 100644
--- a/test_conformance/basic/test_enqueue_map.cpp
+++ b/test_conformance/basic/test_enqueue_map.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -26,6 +26,7 @@
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 
+// clang-format off
 const cl_mem_flags flag_set[] = {
   CL_MEM_ALLOC_HOST_PTR,
   CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
@@ -33,93 +34,104 @@ const cl_mem_flags flag_set[] = {
   CL_MEM_COPY_HOST_PTR,
   0
 };
-const char* flag_set_names[] = {
+
+const char *flag_set_names[] = {
   "CL_MEM_ALLOC_HOST_PTR",
   "CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR",
   "CL_MEM_USE_HOST_PTR",
   "CL_MEM_COPY_HOST_PTR",
   "0"
 };
+// clang-format on
 
-int test_enqueue_map_buffer(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_enqueue_map_buffer(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements)
 {
     int error;
-    const size_t bufferSize = 256*256;
-    MTdataHolder d{gRandomSeed};
+    const size_t bufferSize = 256 * 256;
+    MTdataHolder d{ gRandomSeed };
     BufferOwningPtr<cl_char> hostPtrData{ malloc(bufferSize) };
     BufferOwningPtr<cl_char> referenceData{ malloc(bufferSize) };
-    BufferOwningPtr<cl_char> finalData{malloc(bufferSize)};
+    BufferOwningPtr<cl_char> finalData{ malloc(bufferSize) };
 
-    for (int src_flag_id=0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++)
+    for (int src_flag_id = 0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++)
     {
         clMemWrapper memObject;
-        log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
+        log_info("Testing with cl_mem_flags src: %s\n",
+                 flag_set_names[src_flag_id]);
 
         generate_random_data(kChar, (unsigned int)bufferSize, d, hostPtrData);
         memcpy(referenceData, hostPtrData, bufferSize);
 
         void *hostPtr = nullptr;
         cl_mem_flags flags = flag_set[src_flag_id];
-        bool hasHostPtr = (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
+        bool hasHostPtr =
+            (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
         if (hasHostPtr) hostPtr = hostPtrData;
-        memObject = clCreateBuffer(context, flags,  bufferSize, hostPtr, &error);
-        test_error( error, "Unable to create testing buffer" );
+        memObject = clCreateBuffer(context, flags, bufferSize, hostPtr, &error);
+        test_error(error, "Unable to create testing buffer");
 
         if (!hasHostPtr)
         {
             error =
-            clEnqueueWriteBuffer(queue, memObject, CL_TRUE, 0, bufferSize,
-                                 hostPtrData, 0, NULL, NULL);
-            test_error( error, "clEnqueueWriteBuffer failed");
+                clEnqueueWriteBuffer(queue, memObject, CL_TRUE, 0, bufferSize,
+                                     hostPtrData, 0, NULL, NULL);
+            test_error(error, "clEnqueueWriteBuffer failed");
         }
 
-        for( int i = 0; i < 128; i++ )
+        for (int i = 0; i < 128; i++)
         {
 
-          size_t offset = (size_t)random_in_range( 0, (int)bufferSize - 1, d );
-          size_t length = (size_t)random_in_range( 1, (int)( bufferSize - offset ), d );
-
-          cl_char *mappedRegion = (cl_char *)clEnqueueMapBuffer( queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
-                                                                offset, length, 0, NULL, NULL, &error );
-          if( error != CL_SUCCESS )
-          {
-            print_error( error, "clEnqueueMapBuffer call failed" );
-            log_error( "\tOffset: %d  Length: %d\n", (int)offset, (int)length );
-            return -1;
-          }
-
-          // Write into the region
-          for( size_t j = 0; j < length; j++ )
-          {
-            cl_char spin = (cl_char)genrand_int32( d );
-
-            // Test read AND write in one swipe
-            cl_char value = mappedRegion[ j ];
-            value = spin - value;
-            mappedRegion[ j ] = value;
-
-            // Also update the initial data array
-            value = referenceData[offset + j];
-            value = spin - value;
-            referenceData[offset + j] = value;
-          }
-
-          // Unmap
-          error = clEnqueueUnmapMemObject( queue, memObject, mappedRegion, 0, NULL, NULL );
-          test_error( error, "Unable to unmap buffer" );
+            size_t offset = (size_t)random_in_range(0, (int)bufferSize - 1, d);
+            size_t length =
+                (size_t)random_in_range(1, (int)(bufferSize - offset), d);
+
+            cl_char *mappedRegion = (cl_char *)clEnqueueMapBuffer(
+                queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
+                length, 0, NULL, NULL, &error);
+            if (error != CL_SUCCESS)
+            {
+                print_error(error, "clEnqueueMapBuffer call failed");
+                log_error("\tOffset: %d  Length: %d\n", (int)offset,
+                          (int)length);
+                return -1;
+            }
+
+            // Write into the region
+            for (size_t j = 0; j < length; j++)
+            {
+                cl_char spin = (cl_char)genrand_int32(d);
+
+                // Test read AND write in one swipe
+                cl_char value = mappedRegion[j];
+                value = spin - value;
+                mappedRegion[j] = value;
+
+                // Also update the initial data array
+                value = referenceData[offset + j];
+                value = spin - value;
+                referenceData[offset + j] = value;
+            }
+
+            // Unmap
+            error = clEnqueueUnmapMemObject(queue, memObject, mappedRegion, 0,
+                                            NULL, NULL);
+            test_error(error, "Unable to unmap buffer");
         }
 
-        // Final validation: read actual values of buffer and compare against our reference
-        error = clEnqueueReadBuffer( queue, memObject, CL_TRUE, 0, bufferSize, finalData, 0, NULL, NULL );
-        test_error( error, "Unable to read results" );
+        // Final validation: read actual values of buffer and compare against
+        // our reference
+        error = clEnqueueReadBuffer(queue, memObject, CL_TRUE, 0, bufferSize,
+                                    finalData, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
 
-        for( size_t q = 0; q < bufferSize; q++ )
+        for (size_t q = 0; q < bufferSize; q++)
         {
             if (referenceData[q] != finalData[q])
             {
                 log_error(
-                "ERROR: Sample %d did not validate! Got %d, expected %d\n",
-                (int)q, (int)finalData[q], (int)referenceData[q]);
+                    "ERROR: Sample %d did not validate! Got %d, expected %d\n",
+                    (int)q, (int)finalData[q], (int)referenceData[q]);
                 return -1;
             }
         }
@@ -128,112 +140,128 @@ int test_enqueue_map_buffer(cl_device_id deviceID, cl_context context, cl_comman
     return 0;
 }
 
-int test_enqueue_map_image(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_enqueue_map_image(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements)
 {
     int error;
     cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT32 };
     const size_t imageSize = 256;
     const size_t imageDataSize = imageSize * imageSize * 4 * sizeof(cl_uint);
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
 
     BufferOwningPtr<cl_uint> hostPtrData{ malloc(imageDataSize) };
     BufferOwningPtr<cl_uint> referenceData{ malloc(imageDataSize) };
-    BufferOwningPtr<cl_uint> finalData{malloc(imageDataSize)};
-
-    MTdataHolder d{gRandomSeed};
-  for (int src_flag_id=0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++) {
-    clMemWrapper memObject;
-    log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
-
-    generate_random_data(kUInt, (unsigned int)(imageSize * imageSize * 4), d,
-                         hostPtrData);
-    memcpy(referenceData, hostPtrData, imageDataSize);
-
-    cl_mem_flags flags = flag_set[src_flag_id];
-    bool hasHostPtr = (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
-    void *hostPtr = nullptr;
-    if (hasHostPtr) hostPtr = hostPtrData;
-    memObject = create_image_2d(context, CL_MEM_READ_WRITE | flags, &format,
-                                imageSize, imageSize, 0, hostPtr, &error );
-    test_error( error, "Unable to create testing buffer" );
-
-    if (!hasHostPtr) {
-      size_t write_origin[3]={0,0,0}, write_region[3]={imageSize, imageSize, 1};
-      error =
-      clEnqueueWriteImage(queue, memObject, CL_TRUE, write_origin, write_region,
-                          0, 0, hostPtrData, 0, NULL, NULL);
-      test_error( error, "Unable to write to testing buffer" );
-    }
-
-    for( int i = 0; i < 128; i++ )
+    BufferOwningPtr<cl_uint> finalData{ malloc(imageDataSize) };
+
+    MTdataHolder d{ gRandomSeed };
+    for (int src_flag_id = 0; src_flag_id < ARRAY_SIZE(flag_set); src_flag_id++)
     {
+        clMemWrapper memObject;
+        log_info("Testing with cl_mem_flags src: %s\n",
+                 flag_set_names[src_flag_id]);
+
+        generate_random_data(kUInt, (unsigned int)(imageSize * imageSize * 4),
+                             d, hostPtrData);
+        memcpy(referenceData, hostPtrData, imageDataSize);
+
+        cl_mem_flags flags = flag_set[src_flag_id];
+        bool hasHostPtr =
+            (flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR);
+        void *hostPtr = nullptr;
+        if (hasHostPtr) hostPtr = hostPtrData;
+        memObject = create_image_2d(context, CL_MEM_READ_WRITE | flags, &format,
+                                    imageSize, imageSize, 0, hostPtr, &error);
+        test_error(error, "Unable to create testing buffer");
 
-      size_t offset[3], region[3];
-      size_t rowPitch;
-
-      offset[ 0 ] = (size_t)random_in_range( 0, (int)imageSize - 1, d );
-      region[ 0 ] = (size_t)random_in_range( 1, (int)( imageSize - offset[ 0 ] - 1), d );
-      offset[ 1 ] = (size_t)random_in_range( 0, (int)imageSize - 1, d );
-      region[ 1 ] = (size_t)random_in_range( 1, (int)( imageSize - offset[ 1 ] - 1), d );
-      offset[ 2 ] = 0;
-      region[ 2 ] = 1;
-      cl_uint *mappedRegion = (cl_uint *)clEnqueueMapImage( queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
-                                                           offset, region, &rowPitch, NULL, 0, NULL, NULL, &error );
-      if( error != CL_SUCCESS )
-      {
-        print_error( error, "clEnqueueMapImage call failed" );
-        log_error( "\tOffset: %d,%d  Region: %d,%d\n", (int)offset[0], (int)offset[1], (int)region[0], (int)region[1] );
-        return -1;
-      }
-
-      // Write into the region
-      cl_uint *mappedPtr = mappedRegion;
-      for( size_t y = 0; y < region[ 1 ]; y++ )
-      {
-        for( size_t x = 0; x < region[ 0 ] * 4; x++ )
+        if (!hasHostPtr)
         {
-          cl_int spin = (cl_int)random_in_range( 16, 1024, d );
-
-          cl_int value;
-          // Test read AND write in one swipe
-          value = mappedPtr[ ( y * rowPitch/sizeof(cl_uint) ) + x ];
-          value = spin - value;
-          mappedPtr[ ( y * rowPitch/sizeof(cl_uint) ) + x ] = value;
-
-          // Also update the initial data array
-          value =
-          referenceData[((offset[1] + y) * imageSize + offset[0]) * 4 + x];
-          value = spin - value;
-          referenceData[((offset[1] + y) * imageSize + offset[0]) * 4 + x] =
-          value;
+            size_t write_origin[3] = { 0, 0, 0 },
+                   write_region[3] = { imageSize, imageSize, 1 };
+            error = clEnqueueWriteImage(queue, memObject, CL_TRUE, write_origin,
+                                        write_region, 0, 0, hostPtrData, 0,
+                                        NULL, NULL);
+            test_error(error, "Unable to write to testing buffer");
         }
-      }
 
-      // Unmap
-      error = clEnqueueUnmapMemObject( queue, memObject, mappedRegion, 0, NULL, NULL );
-      test_error( error, "Unable to unmap buffer" );
-    }
+        for (int i = 0; i < 128; i++)
+        {
 
-    // Final validation: read actual values of buffer and compare against our reference
-    size_t finalOrigin[3] = { 0, 0, 0 }, finalRegion[3] = { imageSize, imageSize, 1 };
-    error = clEnqueueReadImage( queue, memObject, CL_TRUE, finalOrigin, finalRegion, 0, 0, finalData, 0, NULL, NULL );
-    test_error( error, "Unable to read results" );
+            size_t offset[3], region[3];
+            size_t rowPitch;
+
+            offset[0] = (size_t)random_in_range(0, (int)imageSize - 1, d);
+            region[0] =
+                (size_t)random_in_range(1, (int)(imageSize - offset[0] - 1), d);
+            offset[1] = (size_t)random_in_range(0, (int)imageSize - 1, d);
+            region[1] =
+                (size_t)random_in_range(1, (int)(imageSize - offset[1] - 1), d);
+            offset[2] = 0;
+            region[2] = 1;
+            cl_uint *mappedRegion = (cl_uint *)clEnqueueMapImage(
+                queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
+                region, &rowPitch, NULL, 0, NULL, NULL, &error);
+            if (error != CL_SUCCESS)
+            {
+                print_error(error, "clEnqueueMapImage call failed");
+                log_error("\tOffset: %d,%d  Region: %d,%d\n", (int)offset[0],
+                          (int)offset[1], (int)region[0], (int)region[1]);
+                return -1;
+            }
 
-    for( size_t q = 0; q < imageSize * imageSize * 4; q++ )
-    {
-        if (referenceData[q] != finalData[q])
+            // Write into the region
+            cl_uint *mappedPtr = mappedRegion;
+            for (size_t y = 0; y < region[1]; y++)
+            {
+                for (size_t x = 0; x < region[0] * 4; x++)
+                {
+                    cl_int spin = (cl_int)random_in_range(16, 1024, d);
+
+                    cl_int value;
+                    // Test read AND write in one swipe
+                    value = mappedPtr[(y * rowPitch / sizeof(cl_uint)) + x];
+                    value = spin - value;
+                    mappedPtr[(y * rowPitch / sizeof(cl_uint)) + x] = value;
+
+                    // Also update the initial data array
+                    value =
+                        referenceData[((offset[1] + y) * imageSize + offset[0])
+                                          * 4
+                                      + x];
+                    value = spin - value;
+                    referenceData[((offset[1] + y) * imageSize + offset[0]) * 4
+                                  + x] = value;
+                }
+            }
+
+            // Unmap
+            error = clEnqueueUnmapMemObject(queue, memObject, mappedRegion, 0,
+                                            NULL, NULL);
+            test_error(error, "Unable to unmap buffer");
+        }
+
+        // Final validation: read actual values of buffer and compare against
+        // our reference
+        size_t finalOrigin[3] = { 0, 0, 0 },
+               finalRegion[3] = { imageSize, imageSize, 1 };
+        error = clEnqueueReadImage(queue, memObject, CL_TRUE, finalOrigin,
+                                   finalRegion, 0, 0, finalData, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
+
+        for (size_t q = 0; q < imageSize * imageSize * 4; q++)
         {
-            log_error("ERROR: Sample %d (coord %d,%d) did not validate! Got "
-                      "%d, expected %d\n",
-                      (int)q, (int)((q / 4) % imageSize),
-                      (int)((q / 4) / imageSize), (int)finalData[q],
-                      (int)referenceData[q]);
-            return -1;
+            if (referenceData[q] != finalData[q])
+            {
+                log_error(
+                    "ERROR: Sample %d (coord %d,%d) did not validate! Got "
+                    "%d, expected %d\n",
+                    (int)q, (int)((q / 4) % imageSize),
+                    (int)((q / 4) / imageSize), (int)finalData[q],
+                    (int)referenceData[q]);
+                return -1;
+            }
         }
-    }
-  } // cl_mem_flags
+    } // cl_mem_flags
 
     return 0;
 }
-
diff --git a/test_conformance/basic/test_image_r8.cpp b/test_conformance/basic/test_image_r8.cpp
index b633d6abb..2dca1611e 100644
--- a/test_conformance/basic/test_image_r8.cpp
+++ b/test_conformance/basic/test_image_r8.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -21,163 +21,111 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+#include <vector>
 
 #include "procs.h"
 
-static const char *r_uint8_kernel_code =
-"__kernel void test_r_uint8(read_only image2d_t srcimg, __global unsigned char *dst, sampler_t sampler)\n"
-"{\n"
-"    int    tid_x = get_global_id(0);\n"
-"    int    tid_y = get_global_id(1);\n"
-"    int    indx = tid_y * get_image_width(srcimg) + tid_x;\n"
-"    uint4    color;\n"
-"\n"
-"    color = read_imageui(srcimg, sampler, (int2)(tid_x, tid_y));\n"
-"    dst[indx] = (unsigned char)(color.x);\n"
-"\n"
-"}\n";
-
-
-static unsigned char *
-generate_8bit_image(int w, int h, MTdata d)
+namespace {
+const char *r_uint8_kernel_code = R"(
+__kernel void test_r_uint8(read_only image2d_t srcimg, __global unsigned char *dst, sampler_t sampler)
 {
-    unsigned char    *ptr = (unsigned char*)malloc(w * h * sizeof(unsigned char));
-    int             i;
+    int    tid_x = get_global_id(0);
+    int    tid_y = get_global_id(1);
+    int    indx = tid_y * get_image_width(srcimg) + tid_x;
+    uint4  color;
 
-    for (i=0; i<w*h; i++)
-      ptr[i] = (unsigned char)genrand_int32(d);
+    color = read_imageui(srcimg, sampler, (int2)(tid_x, tid_y));
+    dst[indx] = (unsigned char)(color.x);
+})";
 
-    return ptr;
-}
 
-static int
-verify_8bit_image(unsigned char *image, unsigned char *outptr, int w, int h)
+void generate_random_inputs(std::vector<cl_uchar> &v)
 {
-    int     i;
+    RandomSeed seed(gRandomSeed);
 
-    for (i=0; i<w*h; i++)
-    {
-        if (outptr[i] != image[i])
-        {
-            log_error("READ_IMAGE_R_UNSIGNED_INT8 test failed\n");
-            return -1;
-        }
-    }
+    auto random_generator = [&seed]() {
+        return static_cast<cl_uchar>(genrand_int32(seed));
+    };
 
-    log_info("READ_IMAGE_R_UNSIGNED_INT8 test passed\n");
-    return 0;
+    std::generate(v.begin(), v.end(), random_generator);
 }
 
-int
-test_image_r8(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+}
+int test_image_r8(cl_device_id device, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    cl_mem            streams[2];
-    cl_image_format    img_format;
-    cl_uchar    *input_ptr, *output_ptr;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    threads[3];
-    int                img_width = 512;
-    int                img_height = 512;
-    int                err;
-    MTdata          d;
-
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( device )
-
-    img_format.image_channel_order = CL_R;
-    img_format.image_channel_data_type = CL_UNSIGNED_INT8;
+    clMemWrapper streams[2];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    const size_t img_width = 512;
+    const size_t img_height = 512;
+    const size_t length = img_width * img_height;
+    int err;
+
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(device)
+
+    const cl_image_format img_format = { CL_R, CL_UNSIGNED_INT8 };
 
     // early out if this image type is not supported
     if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
                                    CL_MEM_OBJECT_IMAGE2D, &img_format))
     {
         log_info("WARNING: Image type not supported; skipping test.\n");
-        return 0;
+        return TEST_SKIPPED_ITSELF;
     }
 
-    d = init_genrand( gRandomSeed );
-    input_ptr = generate_8bit_image(img_width, img_height, d);
-    free_mtdata(d); d = NULL;
+    std::vector<cl_uchar> input(length);
+    std::vector<cl_uchar> output(length);
+
+    generate_random_inputs(input);
 
-    output_ptr = (cl_uchar*)malloc(sizeof(cl_uchar) * img_width * img_height);
     streams[0] = create_image_2d(context, CL_MEM_READ_ONLY, &img_format,
-                                 img_width, img_height, 0, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("create_image_2d failed\n");
-        return -1;
-    }
+                                 img_width, img_height, 0, nullptr, &err);
+    test_error(err, "create_image_2d failed.");
 
     streams[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_uchar) * img_width * img_height, NULL, NULL);
-    if (!streams[1])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
+        clCreateBuffer(context, CL_MEM_READ_WRITE, length, nullptr, &err);
+    test_error(err, "clCreateBuffer failed.");
 
-    size_t origin[3] = {0,0,0}, region[3]={img_width, img_height, 1};
-    err = clEnqueueWriteImage(queue, streams[0], CL_TRUE,
-                            origin, region, 0, 0,
-                            input_ptr,
-                            0, NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clWriteImage failed: %d\n", err);
-        return -1;
-    }
+    const size_t origin[3] = { 0, 0, 0 },
+                 region[3] = { img_width, img_height, 1 };
+    err = clEnqueueWriteImage(queue, streams[0], CL_TRUE, origin, region, 0, 0,
+                              input.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteImage failed.");
 
-  err = create_single_kernel_helper(context, &program, &kernel, 1, &r_uint8_kernel_code, "test_r_uint8" );
-    if (err) {
-    log_error("Failed to create kernel and program: %d\n", err);
-    return -1;
-  }
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &r_uint8_kernel_code, "test_r_uint8");
+    test_error(err, "create_single_kernel_helper failed.");
 
-  cl_sampler sampler = clCreateSampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err);
-  test_error(err, "clCreateSampler failed");
+    clSamplerWrapper sampler = clCreateSampler(
+        context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err);
+    test_error(err, "clCreateSampler failed");
+
+    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
+    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
+    err |= clSetKernelArg(kernel, 2, sizeof sampler, &sampler);
+    test_error(err, "clSetKernelArgs failed");
+
+    size_t threads[] = { img_width, img_height };
+    err = clEnqueueNDRangeKernel(queue, kernel, 2, nullptr, threads, nullptr, 0,
+                                 nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed");
 
-  err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-  err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
-  err |= clSetKernelArg(kernel, 2, sizeof sampler, &sampler);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clSetKernelArgs failed: %d\n", err);
-        return -1;
-    }
 
-    threads[0] = (size_t)img_width;
-    threads[1] = (size_t)img_height;
-    err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, threads, NULL, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
+    err = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, length,
+                              output.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed");
+
+    if (0 != memcmp(input.data(), output.data(), length))
     {
-        log_error("clEnqueueNDRangeKernel failed\n");
-        return -1;
+        log_error("READ_IMAGE_R_UNSIGNED_INT8 test failed\n");
+        err = -1;
     }
-
-    err = clEnqueueReadBuffer( queue, streams[1], CL_TRUE, 0, sizeof(cl_uchar)*img_width*img_height, (void *)output_ptr, 0, NULL, NULL );
-    if (err != CL_SUCCESS)
+    else
     {
-        log_error("clEnqueueReadBuffer failed\n");
-        return -1;
+        log_info("READ_IMAGE_R_UNSIGNED_INT8 test passed\n");
     }
 
-    err = verify_8bit_image(input_ptr, output_ptr, img_width, img_height);
-
-
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    clReleaseSampler(sampler);
-    free(input_ptr);
-    free(output_ptr);
-
     return err;
 }
-
-
-
-
-
diff --git a/test_conformance/basic/test_int2fp.cpp b/test_conformance/basic/test_int2fp.cpp
index 8b1203a71..dd5cc9a18 100644
--- a/test_conformance/basic/test_int2fp.cpp
+++ b/test_conformance/basic/test_int2fp.cpp
@@ -25,6 +25,7 @@
 #include <sys/stat.h>
 
 #include <algorithm>
+#include <cstdint>
 #include <map>
 #include <vector>
 
diff --git a/test_conformance/basic/test_loop.cpp b/test_conformance/basic/test_loop.cpp
index 1a91d9e4d..1c9acd1ad 100644
--- a/test_conformance/basic/test_loop.cpp
+++ b/test_conformance/basic/test_loop.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -21,45 +21,50 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <vector>
 
 #include "procs.h"
 
-const char *loop_kernel_code =
-"__kernel void test_loop(__global int *src, __global int *loopindx, __global int *loopcnt, __global int *dst)\n"
-"{\n"
-"    int  tid = get_global_id(0);\n"
-"    int  n = get_global_size(0);\n"
-"    int  i, j;\n"
-"\n"
-"    dst[tid] = 0;\n"
-"    for (i=0,j=loopindx[tid]; i<loopcnt[tid]; i++,j++)\n"
-"    {\n"
-"        if (j >= n)\n"
-"            j = 0;\n"
-"        dst[tid] += src[j];\n"
-"    }\n"
-"\n"
-"}\n";
-
-
-int
-verify_loop(int *inptr, int *loopindx, int *loopcnt, int *outptr, int n)
+namespace {
+const char *loop_kernel_code = R"(
+__kernel void test_loop(__global int *src, __global int *loopindx, __global int *loopcnt, __global int *dst)
 {
-    int     r, i, j, k;
+    int  tid = get_global_id(0);
+    int  n = get_global_size(0);
+    int  i, j;
 
-    for (i=0; i<n; i++)
+    dst[tid] = 0;
+    for (i=0, j=loopindx[tid]; i<loopcnt[tid]; i++, j++)
     {
-        r = 0;
-        for (j=0,k=loopindx[i]; j<loopcnt[i]; j++,k++)
+        if (j >= n)
+            j = 0;
+        dst[tid] += src[j];
+    }
+}
+)";
+
+
+// Recomputes each element's wrapped sum on the host and compares it with the
+// device output. The vectors are taken by const reference so the check does
+// not copy them. Returns 0 on success, -1 on the first mismatch.
+int verify_loop(const std::vector<cl_int> &inptr,
+                const std::vector<cl_int> &loopindx,
+                const std::vector<cl_int> &loopcnt,
+                const std::vector<cl_int> &outptr, int n)
+{
+    for (int i = 0; i < n; i++)
+    {
+        int r = 0;
+        for (int j = 0, k = loopindx[i]; j < loopcnt[i]; j++, k++)
         {
-            if (k >= n)
-                k = 0;
+            if (k >= n) k = 0;
             r += inptr[k];
         }
 
         if (r != outptr[i])
         {
-            log_error("LOOP test failed: %d found, expected %d\n", outptr[i], r);
+            log_error("LOOP test failed: %d found, expected %d\n", outptr[i],
+                      r);
             return -1;
         }
     }
@@ -67,119 +72,69 @@ verify_loop(int *inptr, int *loopindx, int *loopcnt, int *outptr, int n)
     log_info("LOOP test passed\n");
     return 0;
 }
-
-int test_loop(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+}
+int test_loop(cl_device_id device, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    cl_mem streams[4];
-    cl_int *input_ptr, *loop_indx, *loop_cnt, *output_ptr;
-    cl_program program;
-    cl_kernel kernel;
-    size_t threads[1];
-    int err, i;
+    clMemWrapper streams[4];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    int err;
 
     size_t length = sizeof(cl_int) * num_elements;
-    input_ptr  = (cl_int*)malloc(length);
-    loop_indx  = (cl_int*)malloc(length);
-    loop_cnt   = (cl_int*)malloc(length);
-    output_ptr = (cl_int*)malloc(length);
+    std::vector<cl_int> input(num_elements); // sized in elements, not bytes
+    std::vector<cl_int> loop_indx(num_elements);
+    std::vector<cl_int> loop_cnt(num_elements);
+    std::vector<cl_int> output(num_elements);
 
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, NULL);
-    if (!streams[0])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, NULL);
-    if (!streams[1])
+    for (auto &stream : streams)
     {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, NULL);
-    if (!streams[2])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
-    }
-    streams[3] = clCreateBuffer(context, CL_MEM_READ_WRITE, length, NULL, NULL);
-    if (!streams[3])
-    {
-        log_error("clCreateBuffer failed\n");
-        return -1;
+        stream =
+            clCreateBuffer(context, CL_MEM_READ_WRITE, length, nullptr, &err);
+        test_error(err, "clCreateBuffer failed.");
     }
 
-    MTdata d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    RandomSeed seed(gRandomSeed);
+    for (int i = 0; i < num_elements; i++)
     {
-        input_ptr[i] = (int)genrand_int32(d);
-        loop_indx[i] = (int)get_random_float(0, num_elements-1, d);
-        loop_cnt[i] = (int)get_random_float(0, num_elements/32, d);
-    }
-    free_mtdata(d); d = NULL;
-
-  err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, length, input_ptr, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clEnqueueWriteBuffer failed\n");
-    return -1;
-  }
-  err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, length, loop_indx, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clEnqueueWriteBuffer failed\n");
-    return -1;
-  }
-  err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0, length, loop_cnt, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clEnqueueWriteBuffer failed\n");
-    return -1;
-  }
-
-  err = create_single_kernel_helper(context, &program, &kernel, 1, &loop_kernel_code, "test_loop" );
-  if (err)
-    return -1;
-
-  err  = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-  err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
-  err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
-  err |= clSetKernelArg(kernel, 3, sizeof streams[3], &streams[3]);
-    if (err != CL_SUCCESS)
+        input[i] = static_cast<int>(genrand_int32(seed));
+        loop_indx[i] =
+            static_cast<int>(get_random_float(0, num_elements - 1, seed));
+        loop_cnt[i] =
+            static_cast<int>(get_random_float(0, num_elements / 32, seed));
+    }
+
+    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, length,
+                               input.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer failed.");
+    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, length,
+                               loop_indx.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer failed.");
+    err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0, length,
+                               loop_cnt.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueWriteBuffer failed.");
+
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &loop_kernel_code, "test_loop");
+    test_error(err, "create_single_kernel_helper failed.");
+
+    for (cl_uint i = 0; i < ARRAY_SIZE(streams); i++)
     {
-        log_error("clSetKernelArgs failed\n");
-        return -1;
+        err = clSetKernelArg(kernel, i, sizeof streams[i], &streams[i]);
+        test_error(err, "clSetKernelArgs failed\n");
     }
 
-    threads[0] = (unsigned int)num_elements;
-  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, NULL, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clEnqueueNDRangeKernel failed\n");
-    return -1;
-  }
-
-  err = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0, length, output_ptr, 0, NULL, NULL);
-  if (err != CL_SUCCESS)
-  {
-    log_error("clReadArray failed\n");
-    return -1;
-  }
-
-  err = verify_loop(input_ptr, loop_indx, loop_cnt, output_ptr, num_elements);
-
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseMemObject(streams[2]);
-    clReleaseMemObject(streams[3]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr);
-    free(loop_indx);
-    free(loop_cnt);
-    free(output_ptr);
+    size_t threads[] = { (size_t)num_elements };
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, threads, nullptr, 0,
+                                 nullptr, nullptr);
+    test_error(err, "clEnqueueNDRangeKernel failed\n");
 
-    return err;
-}
+    err = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0, length,
+                              output.data(), 0, nullptr, nullptr);
+    test_error(err, "clEnqueueReadBuffer failed\n");
+
+    err = verify_loop(input, loop_indx, loop_cnt, output, num_elements);
 
 
+    return err;
+}
diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp
index e137f9e73..d34ecbf90 100644
--- a/test_conformance/basic/test_vloadstore.cpp
+++ b/test_conformance/basic/test_vloadstore.cpp
@@ -13,52 +13,129 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
-
+#include <algorithm>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <vector>
 
+#include <CL/cl_half.h>
 
 #include "procs.h"
 #include "harness/conversions.h"
-#include "harness/typeWrappers.h"
 #include "harness/errorHelpers.h"
+#include "harness/stringHelpers.h"
+#include "harness/typeWrappers.h"
 
 // Outputs debug information for stores
 #define DEBUG 0
 // Forces stores/loads to be done with offsets = tid
 #define LINEAR_OFFSETS 0
 #define NUM_LOADS    512
-
-static const char *doubleExtensionPragma = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+#define HFF(num) cl_half_from_float(num, halfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
+
+char pragma_str[128] = { 0 };
+char mem_type[64] = { 0 };
+char store_str[128] = { 0 };
+char load_str[128] = { 0 };
+
+extern cl_half_rounding_mode halfRoundingMode;
+
+// clang-format off
+static const char *store_pattern= "results[ tid ] = tmp;\n";
+static const char *store_patternV3 = "results[3*tid] = tmp.s0; results[3*tid+1] = tmp.s1; results[3*tid+2] = tmp.s2;\n";
+static const char *load_pattern = "sSharedStorage[ i ] = src[ i ];\n";
+static const char *load_patternV3 = "sSharedStorage[3*i] = src[ 3*i]; sSharedStorage[3*i+1] = src[3*i+1]; sSharedStorage[3*i+2] = src[3*i+2];\n";
+static const char *kernel_pattern[] = {
+pragma_str,
+"#define STYPE %s\n"
+"__kernel void test_fn( ", mem_type, " STYPE *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"    %s%d tmp = vload%d( offsets[ tid ], ( (", mem_type, " STYPE *) src ) + alignmentOffsets[ tid ] );\n"
+"    ", store_str,
+"}\n"
+};
+
+const char *pattern_local [] = {
+pragma_str,
+"__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"   int lid = get_local_id( 0 );\n"
+"\n"
+"    if( lid == 0 )\n"
+"    {\n"
+"        for( int i = 0; i < %d; i++ ) {\n"
+"           ", load_str,
+"        }\n"
+"    }\n"
+//  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
+//  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
+//  updated on all threads at that point
+"   barrier( CLK_LOCAL_MEM_FENCE );\n"
+"\n"
+"    %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
+"    ", store_str,
+"}\n" };
+
+const char *pattern_priv [] = {
+pragma_str,
+// Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
+// for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
+"#define PRIV_TYPE %s\n"
+"#define PRIV_SIZE %d\n"
+"__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+"    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
+"    int tid = get_global_id( 0 );\n"
+"\n"
+"    for( int i = 0; i < PRIV_SIZE; i++ )\n"
+"      sPrivateStorage[ i ] = src[ i ];\n"
+//    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
+//  anybody else to sync up
+"\n"
+"    %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
+"    ", store_str,
+"}\n"};
+// clang-format on
 
 #pragma mark -------------------- vload harness --------------------------
 
-typedef void (*create_vload_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize );
+typedef void (*create_program_fn)(std::string &, size_t, ExplicitType, size_t,
+                                  size_t);
+typedef int (*test_fn)(cl_device_id, cl_context, cl_command_queue, ExplicitType,
+                       unsigned int, create_program_fn, size_t);
 
-int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
-               create_vload_program_fn createFn, size_t bufferSize, MTdata d )
+int test_vload(cl_device_id device, cl_context context, cl_command_queue queue,
+               ExplicitType type, unsigned int vecSize,
+               create_program_fn createFn, size_t bufferSize)
 {
-    int error;
-
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper streams[ 4 ];
+    MTdataHolder d(gRandomSeed);
     const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS;
 
     if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128;
 
     size_t threads[ 1 ], localThreads[ 1 ];
     clProtectedArray inBuffer( bufferSize );
-    char programSrc[ 10240 ];
     cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ];
     size_t numElements, typeSize, i;
     unsigned int outVectorSize;
 
+    pragma_str[0] = '\0';
+    if (type == kDouble)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
+    else if (type == kHalf)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
 
     typeSize = get_explicit_type_size( type );
     numElements = bufferSize / ( typeSize * vecSize );
@@ -83,25 +160,19 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
     outVectorSize = vecSize;
 
     // Declare output buffers now
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char outBuffer[ numLoads * typeSize * outVectorSize ];
-    char referenceBuffer[ numLoads * typeSize * vecSize ];
-#else
-    char* outBuffer = (char*)_malloca(numLoads * typeSize * outVectorSize * sizeof(cl_char));
-    char* referenceBuffer = (char*)_malloca(numLoads * typeSize * vecSize * sizeof(cl_char));
-#endif
+    std::vector<char> outBuffer(numLoads * typeSize * outVectorSize);
+    std::vector<char> referenceBuffer(numLoads * typeSize * vecSize);
 
     // Create the program
-
-
+    std::string programSrc;
     createFn( programSrc, numElements, type, vecSize, outVectorSize);
 
     // Create our kernel
-    const char *ptr = programSrc;
-
-    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
+    const char *ptr = programSrc.c_str();
+    cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                               &ptr, "test_fn");
     test_error( error, "Unable to create testing kernel" );
-    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
+    if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
 
     // Get the number of args to differentiate the kernels with local storage. (They have 5)
     cl_uint numArgs;
@@ -115,7 +186,9 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
     test_error( error, "Unable to create kernel stream" );
     streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error );
     test_error( error, "Unable to create kernel stream" );
-    streams[ 3 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*typeSize*outVectorSize, (void *)outBuffer, &error );
+    streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                numLoads * typeSize * outVectorSize,
+                                (void *)outBuffer.data(), &error);
     test_error( error, "Unable to create kernel stream" );
 
     // Set parameters and run
@@ -145,28 +218,32 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
     test_error( error, "Unable to exec kernel" );
 
     // Get the results
-    error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, numLoads * typeSize * outVectorSize * sizeof(cl_char), (void *)outBuffer, 0, NULL, NULL );
+    error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0,
+                                numLoads * typeSize * outVectorSize
+                                    * sizeof(cl_char),
+                                (void *)outBuffer.data(), 0, NULL, NULL);
     test_error( error, "Unable to read results" );
 
-
     // Create the reference results
-    memset( referenceBuffer, 0, numLoads * typeSize * vecSize * sizeof(cl_char));
+    referenceBuffer.assign(numLoads * typeSize * vecSize, 0);
     for( i = 0; i < numLoads; i++ )
     {
-        memcpy( referenceBuffer + i * typeSize * vecSize, ( (char *)(void *)inBuffer ) + ( ( offsets[ i ] * vecSize ) + alignmentOffsets[ i ] ) * typeSize,
-               typeSize * vecSize );
+        memcpy(&referenceBuffer[i * typeSize * vecSize],
+               ((char *)(void *)inBuffer)
+                   + ((offsets[i] * vecSize) + alignmentOffsets[i]) * typeSize,
+               typeSize * vecSize);
     }
 
     // Validate the results now
-    char *expected = referenceBuffer;
-    char *actual = outBuffer;
+    char *expected = referenceBuffer.data();
+    char *actual = outBuffer.data();
     char *in = (char *)(void *)inBuffer;
 
     if (DEBUG) {
         log_info("Memory contents:\n");
+        char inString[1024];
+        char expectedString[1024], actualString[1024];
         for (i=0; i<numElements; i++) {
-            char  inString[1024];
-            char expectedString[ 1024 ], actualString[ 1024 ];
             if (i < numLoads) {
                 log_info("buffer %3d: input: %s expected: %s got: %s (load offset %3d, alignment offset %3d)", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                          GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
@@ -197,35 +274,42 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
         expected += typeSize * vecSize;
         actual += typeSize * outVectorSize;
     }
-
     return 0;
 }
 
-int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queue, create_vload_program_fn createFn, size_t bufferSize )
+template <test_fn test_func_ptr>
+int test_vset(cl_device_id device, cl_context context, cl_command_queue queue,
+              create_program_fn createFn, size_t bufferSize)
 {
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
+    std::vector<ExplicitType> vecType = { kChar,  kUChar, kShort, kUShort,
+                                          kInt,   kUInt,  kLong,  kULong,
+                                          kFloat, kHalf,  kDouble };
     unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
     const char *size_names[] = { "2", "3", "4", "8", "16"};
-    unsigned int typeIdx, sizeIdx;
     int error = 0;
-    MTdata mtData = init_genrand( gRandomSeed );
 
     log_info("Testing with buffer size of %d.\n", (int)bufferSize);
 
-    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
-    {
+    bool hasDouble = is_extension_available(device, "cl_khr_fp64");
+    bool hasHalf = is_extension_available(device, "cl_khr_fp16");
 
-        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
+    for (unsigned typeIdx = 0; typeIdx < vecType.size(); typeIdx++)
+    {
+        if (vecType[typeIdx] == kDouble && !hasDouble)
             continue;
-
-        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
+        else if (vecType[typeIdx] == kHalf && !hasHalf)
+            continue;
+        else if ((vecType[typeIdx] == kLong || vecType[typeIdx] == kULong)
+                 && !gHasLong)
             continue;
 
-        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
+        for (unsigned sizeIdx = 0; vecSizes[sizeIdx] != 0; sizeIdx++)
         {
             log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
 
-            int error_this_type = test_vload( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, mtData );
+            int error_this_type =
+                test_func_ptr(device, context, queue, vecType[typeIdx],
+                              vecSizes[sizeIdx], createFn, bufferSize);
             if (error_this_type) {
                 error += error_this_type;
                 log_error("Failure; skipping further sizes for this type.");
@@ -233,125 +317,59 @@ int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queu
             }
         }
     }
-
-    free_mtdata(mtData);
-
     return error;
 }
 
 #pragma mark -------------------- vload test cases --------------------------
 
-void create_global_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_global_load_code(std::string &destBuffer, size_t inBufferSize,
+                             ExplicitType type, size_t inVectorSize,
+                             size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid ] = tmp.s0;\n"
-    "   results[ 3*tid+1 ] = tmp.s1;\n"
-    "   results[ 3*tid+2 ] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(mem_type, sizeof(mem_type), "__global");
+    std::snprintf(store_str, sizeof(store_str), "%s", store_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName, typeName, typeName );
-    } else {
-        sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
-                (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        std::snprintf(store_str, sizeof(store_str), "%s", store_pattern);
     }
+
+    std::string kernel_src = concat_kernel(
+        kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
+    destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
+                             typeName, (int)inVectorSize, (int)inVectorSize);
 }
 
 int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
 {
-    return test_vloadset( device, context, queue, create_global_load_code, 10240 );
+    return test_vset<test_vload>(device, context, queue,
+                                 create_global_load_code, 10240);
 }
 
-
-void create_local_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_local_load_code(std::string &destBuffer, size_t inBufferSize,
+                            ExplicitType type, size_t inVectorSize,
+                            size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    //"   __local %s%d sSharedStorage[ %d ];\n"
-    "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "   int lid = get_local_id( 0 );\n"
-    "\n"
-    "    if( lid == 0 )\n"
-    "    {\n"
-    "        for( int i = 0; i < %d; i++ )\n"
-    "           sSharedStorage[ i ] = src[ i ];\n"
-    "    }\n"
-    //  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
-    //  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
-    //  updated on all threads at that point
-    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    //"   __local %s%d sSharedStorage[ %d ];\n"
-    "__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "   int lid = get_local_id( 0 );\n"
-    "\n"
-    "    if( lid == 0 )\n"
-    "    {\n"
-    "        for( int i = 0; i < %d; i++ ) {\n"
-    "           sSharedStorage[ 3*i   ] = src[ 3*i   ];\n"
-    "           sSharedStorage[ 3*i +1] = src[ 3*i +1];\n"
-    "           sSharedStorage[ 3*i +2] = src[ 3*i +2];\n"
-    "        }\n"
-    "    }\n"
-    //  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
-    //  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
-    //  updated on all threads at that point
-    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid   ] = tmp.s0;\n"
-    "   results[ 3*tid +1] = tmp.s1;\n"
-    "   results[ 3*tid +2] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(store_str, sizeof(store_str), "%s", store_patternV3);
+    std::snprintf(load_str, sizeof(load_str), "%s", load_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble  ? doubleExtensionPragma : "",
-                "",
-                typeName, /*(int)inBufferSize,*/
-                typeName, typeName,
-                (int)inBufferSize,
-                typeName, typeName );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble  ? doubleExtensionPragma : "",
-                "",
-                typeName, (int)inVectorSize, /*(int)inBufferSize,*/
-                typeName, (int)inVectorSize, typeName, (int)outVectorSize,
-                (int)inBufferSize,
-                typeName, (int)inVectorSize, (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    std::string inTypeName = typeName;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
+        std::snprintf(store_str, sizeof(store_str), "%s", store_pattern);
+        std::snprintf(load_str, sizeof(load_str), "%s", load_pattern);
     }
+
+    std::string kernel_src = concat_kernel(
+        pattern_local, sizeof(pattern_local) / sizeof(pattern_local[0]));
+    destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), inTypeName.c_str(),
+                             outTypeName.c_str(), (int)inBufferSize, typeName,
+                             (int)inVectorSize, (int)inVectorSize, typeName);
 }
 
 int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -360,53 +378,34 @@ int test_vload_local(cl_device_id device, cl_context context, cl_command_queue q
     cl_ulong localSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
     test_error( error, "Unable to get max size of local memory buffer" );
-    if( localSize > 10240 )
-        localSize = 10240;
+    if (localSize > 10240) localSize = 10240;
     if (localSize > 4096)
         localSize -= 2048;
     else
         localSize /= 2;
 
-    return test_vloadset( device, context, queue, create_local_load_code, (size_t)localSize );
+    return test_vset<test_vload>(device, context, queue, create_local_load_code,
+                                 (size_t)localSize);
 }
 
-
-void create_constant_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_constant_load_code(std::string &destBuffer, size_t inBufferSize,
+                               ExplicitType type, size_t inVectorSize,
+                               size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid   ] = tmp.s0;\n"
-    "   results[ 3*tid+1 ] = tmp.s1;\n"
-    "   results[ 3*tid+2 ] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(mem_type, sizeof(mem_type), "__constant");
+    std::snprintf(store_str, sizeof(store_str), "%s", store_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName,  typeName,
-                typeName );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
-                (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        std::snprintf(store_str, sizeof(store_str), "%s", store_pattern);
     }
+
+    std::string kernel_src = concat_kernel(
+        kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
+    destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
+                             typeName, (int)inVectorSize, (int)inVectorSize);
 }
 
 int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -415,109 +414,71 @@ int test_vload_constant(cl_device_id device, cl_context context, cl_command_queu
     cl_ulong maxSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
     test_error( error, "Unable to get max size of constant memory buffer" );
-    if( maxSize > 10240 )
-        maxSize = 10240;
+    if (maxSize > 10240) maxSize = 10240;
     if (maxSize > 4096)
         maxSize -= 2048;
     else
         maxSize /= 2;
 
-    return test_vloadset( device, context, queue, create_constant_load_code, (size_t)maxSize );
+    return test_vset<test_vload>(device, context, queue,
+                                 create_constant_load_code, (size_t)maxSize);
 }
 
-
-void create_private_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_private_load_code(std::string &destBuffer, size_t inBufferSize,
+                              ExplicitType type, size_t inVectorSize,
+                              size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
-    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
-    "#define PRIV_TYPE %s%d\n"
-    "#define PRIV_SIZE %d\n"
-    "__kernel void test_fn( __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
-    "    int tid = get_global_id( 0 );\n"
-    "\n"
-    "    for( int i = 0; i < %d; i++ )\n"
-    "      sPrivateStorage[ i ] = src[ i ];\n"
-    //    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
-    //  anybody else to sync up
-    "\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
-    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
-    "#define PRIV_TYPE %s\n"
-    "#define PRIV_SIZE %d\n"
-    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
-    "    int tid = get_global_id( 0 );\n"
-    "\n"
-    "    for( int i = 0; i < PRIV_SIZE; i++ )\n"
-    "    {\n"
-    "        sPrivateStorage[ i ] = src[ i ];\n"
-    "    }\n"
-    //    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
-    //  anybody else to sync up
-    "\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid   ] = tmp.s0;\n"
-    "   results[ 3*tid+1 ] = tmp.s1;\n"
-    "   results[ 3*tid+2 ] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(store_str, sizeof(store_str), store_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize ==3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, 3*((int)inBufferSize),
-                typeName, typeName,
-                typeName );
-        // log_info("Src is \"\n%s\n\"\n", destBuffer);
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, (int)inVectorSize, (int)inBufferSize,
-                typeName, (int)inVectorSize, typeName, (int)outVectorSize,
-                (int)inBufferSize,
-                typeName, (int)inVectorSize, (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    std::string inTypeName = typeName;
+    int bufSize = (int)inBufferSize * 3;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
+        bufSize = (int)inBufferSize;
+        std::snprintf(store_str, sizeof(store_str), store_pattern);
     }
+
+    std::string kernel_src = concat_kernel(
+        pattern_priv, sizeof(pattern_priv) / sizeof(pattern_priv[0]));
+    destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), bufSize,
+                             inTypeName.c_str(), outTypeName.c_str(), typeName,
+                             (int)inVectorSize, (int)inVectorSize, typeName);
 }
 
 int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
 {
     // We have no idea how much actual private storage is available, so just pick a reasonable value,
     // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
-    return test_vloadset( device, context, queue, create_private_load_code, 256 );
+    return test_vset<test_vload>(device, context, queue,
+                                 create_private_load_code, 256);
 }
 
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 #pragma mark -------------------- vstore harness --------------------------
 
-typedef void (*create_vstore_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize );
-
-int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
-                create_vstore_program_fn createFn, size_t bufferSize, MTdata d )
+int test_vstore(cl_device_id device, cl_context context, cl_command_queue queue,
+                ExplicitType type, unsigned int vecSize,
+                create_program_fn createFn, size_t bufferSize)
 {
-    int error;
-
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper streams[ 3 ];
+    MTdataHolder d(gRandomSeed);
 
     size_t threads[ 1 ], localThreads[ 1 ];
-
     size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS;
 
+    pragma_str[0] = '\0';
+    if (type == kDouble)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
+    else if (type == kHalf)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
+
     if (DEBUG)
         bufferSize = (bufferSize < 128) ? bufferSize : 128;
 
@@ -534,39 +495,22 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
     }
     if (DEBUG)
         log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    cl_uint offsets[ numStores ];
-#else
-    cl_uint* offsets = (cl_uint*)_malloca(numStores * sizeof(cl_uint));
-#endif
-    char programSrc[ 10240 ];
-    size_t i;
-
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char inBuffer[ numStores * typeSize * vecSize ];
-#else
-    char* inBuffer = (char*)_malloca( numStores * typeSize * vecSize * sizeof(cl_char));
-#endif
+
+    std::vector<cl_uint> offsets(numStores);
+    std::vector<char> inBuffer(numStores * typeSize * vecSize);
+
     clProtectedArray outBuffer( numElements * typeSize * vecSize );
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char referenceBuffer[ numElements * typeSize * vecSize ];
-#else
-    char* referenceBuffer = (char*)_malloca(numElements * typeSize * vecSize * sizeof(cl_char));
-#endif
+    std::vector<char> referenceBuffer(numElements * typeSize * vecSize);
 
     // Create some random input data and random offsets to load from
-    generate_random_data( type, numStores * vecSize, d, (void *)inBuffer );
+    generate_random_data(type, numStores * vecSize, d, (void *)inBuffer.data());
 
     // Note: make sure no two offsets are the same, otherwise the output would depend on
     // the order that threads ran in, and that would be next to impossible to verify
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char flags[ numElements ];
-#else
-    char* flags = (char*)_malloca( numElements * sizeof(char));
-#endif
-
-    memset( flags, 0, numElements * sizeof(char) );
-    for( i = 0; i < numStores; i++ )
+    std::vector<char> flags(numElements);
+    flags.assign(flags.size(), 0);
+
+    for (size_t i = 0; i < numStores; i++)
     {
         do
         {
@@ -579,13 +523,15 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
     if (LINEAR_OFFSETS)
         log_info("Offsets set to thread IDs to simplify output.\n");
 
-    createFn( programSrc, numElements, type, vecSize );
+    std::string programSrc;
+    createFn(programSrc, numElements, type, vecSize, vecSize);
 
     // Create our kernel
-    const char *ptr = programSrc;
-    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
+    const char *ptr = programSrc.c_str();
+    cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                               &ptr, "test_fn");
     test_error( error, "Unable to create testing kernel" );
-    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
+    if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
 
     // Get the number of args to differentiate the kernels with local storage. (They have 5)
     cl_uint numArgs;
@@ -593,9 +539,14 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
     test_error( error, "clGetKernelInfo failed");
 
     // Set up parameters
-    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * typeSize * vecSize * sizeof(cl_char), (void *)inBuffer, &error );
+    streams[0] =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       numStores * typeSize * vecSize * sizeof(cl_char),
+                       (void *)inBuffer.data(), &error);
     test_error( error, "Unable to create kernel stream" );
-    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * sizeof(cl_uint), offsets, &error );
+    streams[1] =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       numStores * sizeof(cl_uint), offsets.data(), &error);
     test_error( error, "Unable to create kernel stream" );
     streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error );
     test_error( error, "Unable to create kernel stream" );
@@ -606,7 +557,7 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
         // We need to set the size of the local storage
         error = clSetKernelArg(kernel, 0, bufferSize, NULL);
         test_error( error, "clSetKernelArg for buffer failed");
-        for( i = 0; i < 3; i++ )
+        for (size_t i = 0; i < 3; i++)
         {
             error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
             test_error( error, "Unable to set kernel argument" );
@@ -615,11 +566,10 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
     else
     {
         // No local storage
-        for( i = 0; i < 3; i++ )
+        for (size_t i = 0; i < 3; i++)
         {
             error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
-            if (error)
-                log_info("%s\n", programSrc);
+            if (error) log_info("%s\n", programSrc.c_str());
             test_error( error, "Unable to set kernel argument" );
         }
     }
@@ -654,25 +604,26 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
         error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
         test_error( error, "Unable to read results" );
 
-
         // Create the reference results
-        memset( referenceBuffer, 0, numElements * typeSize * vecSize * sizeof(cl_char) );
-        for( i = 0; i < numStores; i++ )
+        referenceBuffer.assign(referenceBuffer.size(), 0);
+        for (size_t i = 0; i < numStores; i++)
         {
-            memcpy( referenceBuffer + ( ( offsets[ i ] * vecSize ) + addressOffset ) * typeSize, inBuffer + i * typeSize * vecSize, typeSize * vecSize );
+            memcpy(&referenceBuffer[((offsets[i] * vecSize) + addressOffset)
+                                    * typeSize],
+                   &inBuffer[i * typeSize * vecSize], typeSize * vecSize);
         }
 
         // Validate the results now
-        char *expected = referenceBuffer;
+        char *expected = referenceBuffer.data();
         char *actual = (char *)(void *)outBuffer;
 
         if (DEBUG)
         {
             log_info("Memory contents:\n");
-            for (i=0; i<numElements; i++)
+            char inString[1024];
+            char expectedString[1024], actualString[1024];
+            for (size_t i = 0; i < numElements; i++)
             {
-                char  inString[1024];
-                char expectedString[ 1024 ], actualString[ 1024 ];
                 if (i < numStores)
                 {
                     log_info("buffer %3d: input: %s expected: %s got: %s (store offset %3d)", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
@@ -693,7 +644,7 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
             }
         }
 
-        for( i = 0; i < numElements; i++ )
+        for (size_t i = 0; i < numElements; i++)
         {
             if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
             {
@@ -719,62 +670,26 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
             actual += typeSize * vecSize;
         }
     }
-
     return 0;
 }
 
-int test_vstoreset(cl_device_id device, cl_context context, cl_command_queue queue, create_vstore_program_fn createFn, size_t bufferSize )
-{
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
-    unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
-    const char *size_names[] = { "2", "3", "4", "8", "16"};
-    unsigned int typeIdx, sizeIdx;
-    int error = 0;
-    MTdata d = init_genrand( gRandomSeed );
-
-    log_info("Testing with buffer size of %d.\n", (int)bufferSize);
-
-    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
-    {
-        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
-            continue;
-
-        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
-            continue;
-
-        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
-        {
-            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
-
-            int error_this_type = test_vstore( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, d );
-            if (error_this_type)
-            {
-                log_error("Failure; skipping further sizes for this type.\n");
-                error += error_this_type;
-                break;
-            }
-        }
-    }
-
-    free_mtdata(d);
-    return error;
-}
-
-
 #pragma mark -------------------- vstore test cases --------------------------
 
-void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_global_store_code(std::string &destBuffer, size_t inBufferSize,
+                              ExplicitType type, size_t inVectorSize,
+                              size_t /*unused*/)
 {
-    const char *pattern =
-    "%s"
+    // clang-format off
+    const char *pattern [] = {
+    pragma_str,
     "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
     "    vstore%d( srcValues[ tid ], offsets[ tid ], destBuffer + alignmentOffset );\n"
-    "}\n";
+    "}\n" };
 
-    const char *patternV3 =
-    "%s"
+    const char *patternV3 [] = {
+    pragma_str,
     "__kernel void test_fn( __global %s3 *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
@@ -783,45 +698,48 @@ void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitTy
     "    } else {\n"
     "      vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n"
     "    }\n"
-    "}\n";
+    "}\n" };
+    // clang-format on
 
     const char *typeName = get_explicit_type_name(type);
-
     if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, typeName, typeName);
-
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, (int)inVectorSize, typeName, (int)inVectorSize );
+        std::string kernel_src =
+            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+        destBuffer = str_sprintf(kernel_src, typeName, typeName, typeName);
+    }
+    else
+    {
+        std::string kernel_src =
+            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+        destBuffer = str_sprintf(kernel_src, typeName, (int)inVectorSize,
+                                 typeName, (int)inVectorSize);
     }
-    // if(inVectorSize == 3 || inVectorSize == 4) {
-    //     log_info("\n----\n%s\n----\n", destBuffer);
-    // }
 }
 
 int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
 {
-    return test_vstoreset( device, context, queue, create_global_store_code, 10240 );
+    return test_vset<test_vstore>(device, context, queue,
+                                  create_global_store_code, 10240);
 }
 
-
-void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_local_store_code(std::string &destBuffer, size_t inBufferSize,
+                             ExplicitType type, size_t inVectorSize,
+                             size_t /*unused*/)
 {
-    const char *pattern =
-    "%s"
-    "\n"
-    "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
+    // clang-format off
+    const char *pattern[] = {
+    pragma_str,
+    "#define LOC_TYPE %s\n"
+    "#define LOC_VTYPE %s%d\n"
+    "__kernel void test_fn(__local LOC_VTYPE *sSharedStorage, __global LOC_VTYPE *srcValues, __global uint *offsets, __global LOC_VTYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    " sSharedStorage[ offsets[tid] ] = (%s%d)(%s)0;\n"
+    " sSharedStorage[ offsets[tid] ] = (LOC_VTYPE)(LOC_TYPE)0;\n"
     " sSharedStorage[ offsets[tid] +1 ] =  sSharedStorage[ offsets[tid] ];\n"
     "   barrier( CLK_LOCAL_MEM_FENCE );\n"
     "\n"
-    "    vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local %s *)sSharedStorage ) + alignmentOffset );\n"
+    "    vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local LOC_TYPE *)sSharedStorage ) + alignmentOffset );\n"
     "\n"
     // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
     // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
@@ -830,20 +748,20 @@ void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitTyp
     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
     // otherwise, local threads would be overwriting results from other local threads
     "  int i;\n"
-    "  __local %s *sp = (__local %s*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
-    "  __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
+    "  __local LOC_TYPE *sp = (__local LOC_TYPE*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
+    "  __global LOC_TYPE *dp = (__global LOC_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
     "  for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n" };
 
-    const char *patternV3 =
-    "%s"
-    "\n"
-    "__kernel void test_fn(__local %s *sSharedStorage, __global %s *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
+    const char *patternV3 [] = {
+    pragma_str,
+    "#define LOC_TYPE %s\n"
+    "__kernel void test_fn(__local LOC_TYPE *sSharedStorage, __global LOC_TYPE *srcValues, __global uint *offsets, __global LOC_TYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    "    sSharedStorage[ 3*offsets[tid]   ] = (%s)0;\n"
+    "    sSharedStorage[ 3*offsets[tid]   ] = (LOC_TYPE)0;\n"
     "    sSharedStorage[ 3*offsets[tid] +1 ] =  \n"
     "        sSharedStorage[ 3*offsets[tid] ];\n"
     "    sSharedStorage[ 3*offsets[tid] +2 ] =  \n"
@@ -865,30 +783,26 @@ void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitTyp
     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
     // otherwise, local threads would be overwriting results from other local threads
     "  int i;\n"
-    "  __local %s *sp =  (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
-    "  __global %s *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
+    "  __local LOC_TYPE *sp =  (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
+    "  __global LOC_TYPE *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
     "  for( i = 0; i < 3; i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n" };
+    // clang-format on
 
     const char *typeName = get_explicit_type_name(type);
     if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName,
-                typeName,
-                typeName,  typeName,
-                typeName, typeName, typeName  );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, (int)inVectorSize,
-                typeName, (int)inVectorSize, typeName, (int)inVectorSize,
-                typeName, (int)inVectorSize, typeName,
-                (int)inVectorSize, typeName, typeName,
-                typeName, typeName, typeName  );
+        std::string kernel_src =
+            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+        destBuffer = str_sprintf(kernel_src, typeName);
+    }
+    else
+    {
+        std::string kernel_src =
+            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+        destBuffer = str_sprintf(kernel_src, typeName, typeName,
+                                 (int)inVectorSize, (int)inVectorSize);
     }
-    // log_info(destBuffer);
 }
 
 int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -897,81 +811,82 @@ int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue
     cl_ulong localSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
     test_error( error, "Unable to get max size of local memory buffer" );
-    if( localSize > 10240 )
-        localSize = 10240;
+    if (localSize > 10240) localSize = 10240;
     if (localSize > 4096)
         localSize -= 2048;
     else
         localSize /= 2;
-    return test_vstoreset( device, context, queue, create_local_store_code, (size_t)localSize );
+    return test_vset<test_vstore>(device, context, queue,
+                                  create_local_store_code, (size_t)localSize);
 }
 
-
-void create_private_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_private_store_code(std::string &destBuffer, size_t inBufferSize,
+                               ExplicitType type, size_t inVectorSize,
+                               size_t /*unused*/)
 {
-    const char *pattern =
-    "%s"
+    // clang-format off
+    const char *pattern [] = {
+    pragma_str,
+    "#define PRIV_TYPE %s\n"
+    "#define PRIV_VTYPE %s%d\n"
     // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
     // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
     "\n"
-    "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
+    "__kernel void test_fn( __global PRIV_VTYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
-    "    __private %s%d sPrivateStorage[ %d ];\n"
-    "    int tid = get_global_id( 0 );\n"
+    "  __private PRIV_VTYPE sPrivateStorage[ %d ];\n"
+    "  int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    " sPrivateStorage[tid] = (%s%d)(%s)0;\n"
+    " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
     "\n"
-    "   vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
+    "  vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
     "\n"
     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
     // otherwise, local threads would be overwriting results from other local threads
     "  uint i;\n"
-    "  __private %s *sp = (__private %s*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
-    "  __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
+    "  __private PRIV_TYPE *sp = (__private PRIV_TYPE*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
+    "  __global PRIV_TYPE *dp = (__global PRIV_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
     "  for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
-
+    "}\n"};
 
-    const char *patternV3 =
-    "%s"
+    const char *patternV3  [] = {
+    pragma_str,
+    "#define PRIV_TYPE %s\n"
+    "#define PRIV_VTYPE %s3\n"
     // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
     // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
     "\n"
-    "__kernel void test_fn( __global %s *srcValues, __global uint *offsets, __global %s3 *destBuffer, uint alignmentOffset )\n"
+    "__kernel void test_fn( __global PRIV_TYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
-    "    __private %s3 sPrivateStorage[ %d ];\n" // keep this %d
-    "    int tid = get_global_id( 0 );\n"
+    "  __private PRIV_VTYPE sPrivateStorage[ %d ];\n" // keep this %d
+    "  int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    " sPrivateStorage[tid] = (%s3)(%s)0;\n"
+    " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
     "\n"
-
-    "   vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
-    "\n"
-    // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
-    // otherwise, local threads would be overwriting results from other local threads
+    "  vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
     "  uint i;\n"
-    "  __private %s *sp = ((__private %s*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
-    "  __global %s *dp = ((__global %s*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
+    "  __private PRIV_TYPE *sp = ((__private PRIV_TYPE*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
+    "  __global PRIV_TYPE *dp = ((__global PRIV_TYPE*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
     "  for( i = 0; i < 3; i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n"};
+    // clang-format on
 
     const char *typeName = get_explicit_type_name(type);
     if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName,  typeName,
-                typeName, (int)inBufferSize,
-                typeName, typeName,
-                typeName, typeName, typeName, typeName, typeName );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, (int)inVectorSize, typeName, (int)inVectorSize,
-                typeName, (int)inVectorSize, (int)inBufferSize,
-                typeName, (int)inVectorSize, typeName,
-                (int)inVectorSize, typeName, typeName, typeName, typeName, typeName );
+        std::string kernel_src =
+            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+        destBuffer =
+            str_sprintf(kernel_src, typeName, typeName, (int)inBufferSize);
+    }
+    else
+    {
+        std::string kernel_src =
+            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+        destBuffer =
+            str_sprintf(kernel_src, typeName, typeName, (int)inVectorSize,
+                        (int)inBufferSize, (int)inVectorSize);
     }
 }
 
@@ -979,7 +894,8 @@ int test_vstore_private(cl_device_id device, cl_context context, cl_command_queu
 {
     // We have no idea how much actual private storage is available, so just pick a reasonable value,
     // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
-    return test_vstoreset( device, context, queue, create_private_store_code, 256 );
+    return test_vset<test_vstore>(device, context, queue,
+                                  create_private_store_code, 256);
 }
 
 
diff --git a/test_conformance/basic/test_wg_barrier.cpp b/test_conformance/basic/test_wg_barrier.cpp
deleted file mode 100644
index a237d80b9..000000000
--- a/test_conformance/basic/test_wg_barrier.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "harness/compat.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-
-#include "procs.h"
-
-const char *wg_barrier_kernel_code =
-"__kernel void compute_sum(__global int *a, int n, __global int *tmp_sum, __global int *sum)\n"
-"{\n"
-"    int  tid = get_local_id(0);\n"
-"    int  lsize = get_local_size(0);\n"
-"    int  i;\n"
-"\n"
-"    tmp_sum[tid] = 0;\n"
-"    for (i=tid; i<n; i+=lsize)\n"
-"        tmp_sum[tid] += a[i];\n"
-"     \n"
-"     // updated to work for any workgroup size \n"
-"    for (i=hadd(lsize,1); lsize>1; i = hadd(i,1))\n"
-"    {\n"
-"        work_group_barrier(CLK_GLOBAL_MEM_FENCE);\n"
-"        if (tid + i < lsize)\n"
-"            tmp_sum[tid] += tmp_sum[tid + i];\n"
-"         lsize = i; \n"
-"    }\n"
-"\n"
-"     //no barrier is required here because last person to write to tmp_sum[0] was tid 0 \n"
-"    if (tid == 0)\n"
-"        *sum = tmp_sum[0];\n"
-"}\n";
-
-
-static int
-verify_sum(int *inptr, int *tmpptr, int *outptr, int n)
-{
-    int i;
-    int reference = 0;
-
-    for (i=0; i<n; i++)
-    {
-        reference += inptr[i];
-    }
-
-    if (reference != outptr[0])
-    {
-        log_error("work_group_barrier test failed\n");
-        return -1;
-    }
-
-    log_info("work_group_barrier test passed\n");
-    return 0;
-}
-
-
-int
-test_wg_barrier(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    cl_mem            streams[3];
-    cl_int            *input_ptr = NULL, *output_ptr = NULL, *tmp_ptr =NULL;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    global_threads[3];
-    size_t    local_threads[3];
-    int                err;
-    int                i;
-    size_t max_local_workgroup_size[3];
-    size_t max_threadgroup_size = 0;
-    MTdata d;
-
-    err = create_single_kernel_helper_with_build_options(
-        context, &program, &kernel, 1, &wg_barrier_kernel_code, "compute_sum",
-        nullptr);
-    test_error(err, "Failed to build kernel/program.");
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
-                                 sizeof(max_threadgroup_size), &max_threadgroup_size, NULL);
-    test_error(err, "clGetKernelWorkgroupInfo failed.");
-
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
-    test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
-
-    // Pick the minimum of the device and the kernel
-    if (max_threadgroup_size > max_local_workgroup_size[0])
-        max_threadgroup_size = max_local_workgroup_size[0];
-
-    // work group size must divide evenly into the global size
-    while( num_elements % max_threadgroup_size )
-        max_threadgroup_size--;
-
-    input_ptr = (int*)malloc(sizeof(int) * num_elements);
-    output_ptr = (int*)malloc(sizeof(int));
-
-    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * num_elements, NULL, &err);
-    test_error(err, "clCreateBuffer failed.");
-    streams[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &err);
-    test_error(err, "clCreateBuffer failed.");
-    streams[2] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_int) * max_threadgroup_size, NULL, &err);
-    test_error(err, "clCreateBuffer failed.");
-
-    d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
-        input_ptr[i] = (int)get_random_float(-0x01000000, 0x01000000, d);
-    free_mtdata(d);  d = NULL;
-
-    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)input_ptr, 0, NULL, NULL);
-    test_error(err, "clEnqueueWriteBuffer failed.");
-
-    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof num_elements, &num_elements);
-    err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
-    err |= clSetKernelArg(kernel, 3, sizeof streams[1], &streams[1]);
-    test_error(err, "clSetKernelArg failed.");
-
-    global_threads[0] = max_threadgroup_size;
-    local_threads[0] = max_threadgroup_size;
-
-    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
-    test_error(err, "clEnqueueNDRangeKernel failed.");
-
-    err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int), (void *)output_ptr, 0, NULL, NULL );
-    test_error(err, "clEnqueueReadBuffer failed.");
-
-    err = verify_sum(input_ptr, tmp_ptr, output_ptr, num_elements);
-
-    // cleanup
-    clReleaseMemObject(streams[0]);
-    clReleaseMemObject(streams[1]);
-    clReleaseMemObject(streams[2]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    free(input_ptr);
-    free(output_ptr);
-
-    return err;
-}
diff --git a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp
index 0a459e973..4e92e709c 100644
--- a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp
+++ b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp
@@ -740,21 +740,42 @@ clExternalSemaphore::clExternalSemaphore(
     cl_int err = 0;
     cl_device_id devList[] = { deviceId, NULL };
 
-#ifdef _WIN32
-    if (!is_extension_available(devList[0], "cl_khr_external_semaphore_win32"))
-    {
-        throw std::runtime_error("Device does not support "
-                                 "cl_khr_external_semaphore_win32 extension\n");
-    }
-#elif !defined(__APPLE__)
-    if (!is_extension_available(devList[0],
-                                "cl_khr_external_semaphore_opaque_fd"))
+    switch (externalSemaphoreHandleType)
     {
-        throw std::runtime_error(
-            "Device does not support cl_khr_external_semaphore_opaque_fd "
-            "extension \n");
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD:
+            if (!is_extension_available(devList[0],
+                                        "cl_khr_external_semaphore_opaque_fd"))
+            {
+                throw std::runtime_error("Device does not support "
+                                         "cl_khr_external_semaphore_opaque_fd "
+                                         "extension \n");
+            }
+            break;
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT:
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT:
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT:
+            if (!is_extension_available(devList[0],
+                                        "cl_khr_external_semaphore_win32"))
+            {
+                throw std::runtime_error(
+                    "Device does not support "
+                    "cl_khr_external_semaphore_win32 extension\n");
+            }
+            break;
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD:
+            if (!is_extension_available(devList[0],
+                                        "cl_khr_external_semaphore_sync_fd"))
+            {
+                throw std::runtime_error(
+                    "Device does not support cl_khr_external_semaphore_sync_fd "
+                    "extension \n");
+            }
+            break;
+        default:
+            throw std::runtime_error(
+                "Unsupported external semaphore handle type\n");
+            break;
     }
-#endif
 
     std::vector<cl_semaphore_properties_khr> sema_props{
         (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR,
@@ -803,6 +824,16 @@ clExternalSemaphore::clExternalSemaphore(
             sema_props.push_back((cl_semaphore_properties_khr)handle);
 #endif
             break;
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD:
+            err = check_external_semaphore_handle_type(
+                devList[0], CL_SEMAPHORE_HANDLE_SYNC_FD_KHR);
+            sema_props.push_back(static_cast<cl_semaphore_properties_khr>(
+                CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR));
+            sema_props.push_back(static_cast<cl_semaphore_properties_khr>(
+                CL_SEMAPHORE_HANDLE_SYNC_FD_KHR));
+            sema_props.push_back(static_cast<cl_semaphore_properties_khr>(
+                CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR));
+            break;
         default:
             ASSERT(0);
             log_error("Unsupported external memory handle type\n");
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_utility.cpp b/test_conformance/common/vulkan_wrapper/vulkan_utility.cpp
index 1a313cce4..96c5adbc7 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_utility.cpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_utility.cpp
@@ -248,6 +248,9 @@ getSupportedVulkanExternalSemaphoreHandleTypeList()
     }
     externalSemaphoreHandleTypeList.push_back(
         VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT);
+#elif defined(__ANDROID__)
+    externalSemaphoreHandleTypeList.push_back(
+        VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD);
 #else
     externalSemaphoreHandleTypeList.push_back(
         VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD);
@@ -480,6 +483,33 @@ const std::vector<VulkanFormat> getSupportedVulkanFormatList()
     return formatList;
 }
 
+cl_external_semaphore_handle_type_khr getCLSemaphoreTypeFromVulkanType(
+    VulkanExternalSemaphoreHandleType vulkanExternalSemaphoreHandleType)
+{ // Translates a Vulkan semaphore handle type to its OpenCL equivalent.
+    cl_external_semaphore_handle_type_khr clExternalSemaphoreHandleTypeKhr = 0;
+    switch (vulkanExternalSemaphoreHandleType)
+    {
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD:
+            clExternalSemaphoreHandleTypeKhr =
+                CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR;
+            break;
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT:
+            clExternalSemaphoreHandleTypeKhr =
+                CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR;
+            break;
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT:
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT:
+            clExternalSemaphoreHandleTypeKhr =
+                CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR; // both KMT cases
+            break;
+        case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD:
+            clExternalSemaphoreHandleTypeKhr = CL_SEMAPHORE_HANDLE_SYNC_FD_KHR;
+            break;
+        default: break; // unmapped handle types leave the result at 0
+    }
+    return clExternalSemaphoreHandleTypeKhr;
+}
+
 uint32_t getVulkanFormatElementSize(VulkanFormat format)
 {
     switch (format)
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_utility.hpp b/test_conformance/common/vulkan_wrapper/vulkan_utility.hpp
index 04f5a5940..989132570 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_utility.hpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_utility.hpp
@@ -51,6 +51,8 @@ const std::vector<VulkanFormat> getSupportedVulkanFormatList();
 uint32_t getVulkanFormatElementSize(VulkanFormat format);
 const char* getVulkanFormatGLSLFormat(VulkanFormat format);
 const char* getVulkanFormatGLSLTypePrefix(VulkanFormat format);
+cl_external_semaphore_handle_type_khr getCLSemaphoreTypeFromVulkanType(
+    VulkanExternalSemaphoreHandleType vulkanExternalSemaphoreHandleType);
 
 std::string prepareVulkanShader(
     std::string shaderCode,
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_wrapper.cpp b/test_conformance/common/vulkan_wrapper/vulkan_wrapper.cpp
index 3ce4af6b0..4d803be48 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_wrapper.cpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_wrapper.cpp
@@ -72,6 +72,8 @@ VulkanInstance::VulkanInstance(): m_vkInstance(VK_NULL_HANDLE)
 
 #if defined(_WIN32) || defined(_WIN64)
     const char *vulkanLoaderLibraryName = "vulkan-1.dll";
+#elif defined(__ANDROID__)
+    const char *vulkanLoaderLibraryName = "libvulkan.so";
 #elif defined(__linux__)
     const char *vulkanLoaderLibraryName = "libvulkan.so.1";
 #endif
@@ -604,6 +606,37 @@ VulkanQueue &VulkanDevice::getQueue(const VulkanQueueFamily &queueFamily,
 
 VulkanDevice::operator VkDevice() const { return m_vkDevice; }
 
+////////////////////////////////
+// VulkanFence implementation //
+////////////////////////////////
+
+VulkanFence::VulkanFence(const VulkanDevice &vkDevice)
+{
+    // Keep the VkDevice handle (via VulkanDevice::operator VkDevice()).
+    device = vkDevice;
+
+    VkFenceCreateInfo fenceInfo{};
+    fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+    fenceInfo.pNext = nullptr;
+    fenceInfo.flags = 0; // no create flags set
+
+    VkResult vkStatus = vkCreateFence(device, &fenceInfo, nullptr, &fence);
+    // A creation failure is reported to the caller as std::runtime_error.
+    if (vkStatus != VK_SUCCESS)
+    {
+        throw std::runtime_error("Error: Failed create fence.");
+    }
+}
+// Destroys the fence created by the constructor.
+VulkanFence::~VulkanFence() { vkDestroyFence(device, fence, nullptr); }
+// Calls vkResetFences on this fence; the return value is ignored.
+void VulkanFence::reset() { vkResetFences(device, 1, &fence); }
+// Waits on this fence with a UINT64_MAX timeout; return value is ignored.
+void VulkanFence::wait()
+{
+    vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX);
+}
+
 ////////////////////////////////
 // VulkanQueue implementation //
 ////////////////////////////////
@@ -615,6 +648,22 @@ VulkanQueue::VulkanQueue(VkQueue vkQueue): m_vkQueue(vkQueue) {}
 
 VulkanQueue::~VulkanQueue() {}
 
+void VulkanQueue::submit(const VulkanCommandBuffer &commandBuffer,
+                         const std::shared_ptr<VulkanFence> &vkFence)
+{
+    VulkanCommandBufferList commandBufferList;
+    commandBufferList.add(commandBuffer);
+    // Single submission: no wait semaphores, one command buffer.
+    VkSubmitInfo vkSubmitInfo = {};
+    vkSubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    vkSubmitInfo.pNext = NULL;
+    vkSubmitInfo.waitSemaphoreCount = (uint32_t)0;
+    vkSubmitInfo.commandBufferCount = (uint32_t)commandBufferList.size();
+    vkSubmitInfo.pCommandBuffers = commandBufferList();
+    // NOTE(review): vkFence is used unchecked and the VkResult is dropped.
+    vkQueueSubmit(m_vkQueue, 1, &vkSubmitInfo, vkFence->fence);
+}
+
 void VulkanQueue::submit(const VulkanSemaphoreList &waitSemaphoreList,
                          const VulkanCommandBufferList &commandBufferList,
                          const VulkanSemaphoreList &signalSemaphoreList)
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_wrapper.hpp b/test_conformance/common/vulkan_wrapper/vulkan_wrapper.hpp
index 37925ee4a..af4782191 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_wrapper.hpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_wrapper.hpp
@@ -21,6 +21,7 @@
 #include "vulkan_wrapper_types.hpp"
 #include "vulkan_list_map.hpp"
 #include "vulkan_api_list.hpp"
+#include <memory>
 
 class VulkanInstance {
     friend const VulkanInstance &getVulkanInstance();
@@ -145,6 +146,20 @@ class VulkanDevice {
     operator VkDevice() const;
 };
 
+class VulkanFence { // owns one VkFence; the destructor destroys it
+    friend class VulkanQueue; // VulkanQueue::submit reads `fence` directly
+protected:
+    VkFence fence;
+    VkDevice device; // device the fence was created on
+public:
+    VulkanFence(const VulkanDevice &device);
+    VulkanFence(const VulkanFence &) = delete; // a copy would double-destroy
+    VulkanFence &operator=(const VulkanFence &) = delete;
+    virtual ~VulkanFence();
+    void reset(); // vkResetFences on this fence
+    void wait(); // vkWaitForFences on this fence, UINT64_MAX timeout
+};
+
 class VulkanQueue {
     friend class VulkanDevice;
 
@@ -157,6 +172,8 @@ class VulkanQueue {
 
 public:
     const VulkanQueueFamily &getQueueFamily();
+    void submit(const VulkanCommandBuffer &commandBuffer,
+                const std::shared_ptr<VulkanFence> &fence);
     void submit(const VulkanSemaphoreList &waitSemaphoreList,
                 const VulkanCommandBufferList &commandBufferList,
                 const VulkanSemaphoreList &signalSemaphoreList);
@@ -569,7 +586,6 @@ class VulkanSemaphore {
     operator VkSemaphore() const;
 };
 
-
 #define VK_FUNC_DECL(name) extern "C" PFN_##name _##name;
 VK_FUNC_LIST
 #if defined(_WIN32) || defined(_WIN64)
diff --git a/test_conformance/common/vulkan_wrapper/vulkan_wrapper_types.hpp b/test_conformance/common/vulkan_wrapper/vulkan_wrapper_types.hpp
index 2473a1d7b..fcd193732 100644
--- a/test_conformance/common/vulkan_wrapper/vulkan_wrapper_types.hpp
+++ b/test_conformance/common/vulkan_wrapper/vulkan_wrapper_types.hpp
@@ -169,7 +169,9 @@ enum VulkanExternalSemaphoreHandleType
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR,
     VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT =
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR
-        | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR
+        | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR,
+    VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD =
+        VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR
 };
 
 enum VulkanBufferUsage
diff --git a/test_conformance/commonfns/main.cpp b/test_conformance/commonfns/main.cpp
index 3e4b0b8e7..645d3f703 100644
--- a/test_conformance/commonfns/main.cpp
+++ b/test_conformance/commonfns/main.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -18,8 +18,10 @@
 #include <string.h>
 #include "procs.h"
 #include "test_base.h"
+#include "harness/kernelHelpers.h"
 
 std::map<size_t, std::string> BaseFunctionTest::type2name;
+cl_half_rounding_mode BaseFunctionTest::halfRoundingMode = CL_HALF_RTE;
 
 int g_arrVecSizes[kVectorSizeCount + kStrangeVectorSizeCount];
 int g_arrStrangeVectorSizes[kStrangeVectorSizeCount] = {3};
@@ -45,17 +47,38 @@ test_definition test_list[] = {
 
 const int test_num = ARRAY_SIZE( test_list );
 
-int main(int argc, const char *argv[])
+test_status InitCL(cl_device_id device)
 {
-    initVecSizes();
-
-    if (BaseFunctionTest::type2name.empty())
+    if (is_extension_available(device, "cl_khr_fp16"))
     {
-        BaseFunctionTest::type2name[sizeof(half)] = "half";
-        BaseFunctionTest::type2name[sizeof(float)] = "float";
-        BaseFunctionTest::type2name[sizeof(double)] = "double";
+        const cl_device_fp_config fpConfigHalf =
+            get_default_rounding_mode(device, CL_DEVICE_HALF_FP_CONFIG);
+        if ((fpConfigHalf & CL_FP_ROUND_TO_NEAREST) != 0)
+        {
+            BaseFunctionTest::halfRoundingMode = CL_HALF_RTE;
+        }
+        else if ((fpConfigHalf & CL_FP_ROUND_TO_ZERO) != 0)
+        {
+            BaseFunctionTest::halfRoundingMode = CL_HALF_RTZ;
+        }
+        else
+        {
+            log_error("Error while acquiring half rounding mode");
+            return TEST_FAIL;
+        }
     }
 
-    return runTestHarness(argc, argv, test_num, test_list, false, 0);
+    return TEST_PASS;
 }
 
+int main(int argc, const char *argv[])
+{
+    initVecSizes();
+    // Key the type-name table by element size: sizeof(T) -> name string.
+    BaseFunctionTest::type2name[sizeof(half)] = "half";
+    BaseFunctionTest::type2name[sizeof(float)] = "float";
+    BaseFunctionTest::type2name[sizeof(double)] = "double";
+    // InitCL is handed to the harness; presumably run per device -- confirm.
+    return runTestHarnessWithCheck(argc, argv, test_num, test_list, false, 0,
+                                   InitCL);
+}
diff --git a/test_conformance/commonfns/test_base.h b/test_conformance/commonfns/test_base.h
index 442910426..be36ed264 100644
--- a/test_conformance/commonfns/test_base.h
+++ b/test_conformance/commonfns/test_base.h
@@ -19,27 +19,23 @@
 #include <vector>
 #include <map>
 #include <memory>
+#include <cmath>
 
 #include <CL/cl_half.h>
 #include <CL/cl_ext.h>
 
-#include "harness/deviceInfo.h"
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
 
-
 template <typename T>
 using VerifyFuncBinary = int (*)(const T *const, const T *const, const T *const,
                                  const int num, const int vs, const int vp);
 
-
 template <typename T>
 using VerifyFuncUnary = int (*)(const T *const, const T *const, const int num);
 
-
 using half = cl_half;
 
-
 struct BaseFunctionTest
 {
     BaseFunctionTest(cl_device_id device, cl_context context,
@@ -61,9 +57,9 @@ struct BaseFunctionTest
     bool vecParam;
 
     static std::map<size_t, std::string> type2name;
+    static cl_half_rounding_mode halfRoundingMode;
 };
 
-
 struct MinTest : BaseFunctionTest
 {
     MinTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -74,7 +70,6 @@ struct MinTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct MaxTest : BaseFunctionTest
 {
     MaxTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -85,7 +80,6 @@ struct MaxTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct ClampTest : BaseFunctionTest
 {
     ClampTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -96,7 +90,6 @@ struct ClampTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct DegreesTest : BaseFunctionTest
 {
     DegreesTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -107,7 +100,6 @@ struct DegreesTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct RadiansTest : BaseFunctionTest
 {
     RadiansTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -118,7 +110,6 @@ struct RadiansTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct SignTest : BaseFunctionTest
 {
     SignTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -129,7 +120,6 @@ struct SignTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct SmoothstepTest : BaseFunctionTest
 {
     SmoothstepTest(cl_device_id device, cl_context context,
@@ -141,7 +131,6 @@ struct SmoothstepTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct StepTest : BaseFunctionTest
 {
     StepTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -152,7 +141,6 @@ struct StepTest : BaseFunctionTest
     cl_int Run() override;
 };
 
-
 struct MixTest : BaseFunctionTest
 {
     MixTest(cl_device_id device, cl_context context, cl_command_queue queue,
@@ -163,19 +151,71 @@ struct MixTest : BaseFunctionTest
     cl_int Run() override;
 };
 
+template <typename T> float UlpFn(const T &val, const double &r)
+{ // Picks the Ulp_Error* helper matching T and returns its result.
+    if (std::is_same<T, half>::value)
+    {
+        return Ulp_Error_Half(val, r);
+    }
+    else if (std::is_same<T, float>::value)
+    {
+        return Ulp_Error(val, r);
+    }
+    else if (std::is_same<T, double>::value)
+    {
+        return Ulp_Error_Double(val, r);
+    }
+    else
+    {
+        log_error("UlpFn: unsupported data type\n");
+    }
+    // Only reached for an unsupported T, after the error above is logged.
+    return -1.f; // sentinel: no helper matched T
+}
+
+template <typename T> inline double conv_to_dbl(const T &val)
+{ // Returns val as a double; half goes through cl_half_to_float first.
+    if (std::is_same<T, half>::value)
+        return (double)cl_half_to_float(val);
+    else
+        return (double)val; // plain numeric cast for every other T
+}
 
-template <typename... Args>
-std::string string_format(const std::string &format, Args... args)
+template <typename T> inline double conv_to_flt(const T &val)
 {
-    int sformat = std::snprintf(nullptr, 0, format.c_str(), args...) + 1;
-    if (sformat <= 0)
-        throw std::runtime_error("string_format: string processing error.");
-    auto format_size = static_cast<size_t>(sformat);
-    std::unique_ptr<char[]> buffer(new char[format_size]);
-    std::snprintf(buffer.get(), format_size, format.c_str(), args...);
-    return std::string(buffer.get(), buffer.get() + format_size - 1);
+    if (std::is_same<T, half>::value)
+        return (float)cl_half_to_float(val);
+    else
+        return (float)val;
 }
 
+template <typename T> inline half conv_to_half(const T &val)
+{ // Converts val to half using BaseFunctionTest::halfRoundingMode.
+    if (std::is_floating_point<T>::value)
+        return cl_half_from_float(val, BaseFunctionTest::halfRoundingMode);
+    return 0; // non-floating-point T: nothing is converted, result is 0
+}
+
+template <typename T> bool isfinite_fp(const T &v)
+{
+    if (std::is_same<T, half>::value)
+    {
+        // Extract the 5-bit exponent and 10-bit mantissa fields of the FP16.
+        uint16_t h_exp = (((half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
+        uint16_t h_mant = ((half)v) & 0x3FF;
+        // False only for exponent 0x1F with a zero mantissa, i.e. Inf.
+        // NOTE(review): NaN patterns return true here, unlike std::isfinite.
+        return !(h_exp == 0x1F && h_mant == 0);
+    }
+    else
+    {
+#if !defined(_WIN32)
+        return std::isfinite(v);
+#else
+        return isfinite(v);
+#endif
+    }
+}
 
 template <class T>
 int MakeAndRunTest(cl_device_id device, cl_context context,
diff --git a/test_conformance/commonfns/test_binary_fn.cpp b/test_conformance/commonfns/test_binary_fn.cpp
index 1eb12f730..a6c75647d 100644
--- a/test_conformance/commonfns/test_binary_fn.cpp
+++ b/test_conformance/commonfns/test_binary_fn.cpp
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -22,6 +22,7 @@
 
 #include "harness/deviceInfo.h"
 #include "harness/typeWrappers.h"
+#include "harness/stringHelpers.h"
 
 #include "procs.h"
 #include "test_base.h"
@@ -53,7 +54,6 @@ const char *binary_fn_code_pattern_v3_scalar =
 "    vstore3(%s(vload3(tid,x), y[tid] ), tid, dst);\n"
 "}\n";
 
-
 template <typename T>
 int test_binary_fn(cl_device_id device, cl_context context,
                    cl_command_queue queue, int n_elems,
@@ -105,6 +105,16 @@ int test_binary_fn(cl_device_id device, cl_context context,
             input_ptr[1][j] = get_random_double(-0x20000000, 0x20000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        const float fval = CL_HALF_MAX;
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (int j = 0; j < num_elements; j++)
+        {
+            input_ptr[0][j] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[1][j] = conv_to_half(get_random_float(-fval, fval, d));
+        }
+    }
 
     for (i = 0; i < 2; i++)
     {
@@ -125,22 +135,22 @@ int test_binary_fn(cl_device_id device, cl_context context,
             {
                 std::string str = binary_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), fnName.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), fnName.c_str());
             }
             else
             {
                 std::string str = binary_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), fnName.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), fnName.c_str());
             }
         }
         else
         {
             // do regular
             std::string str = binary_fn_code_pattern;
-            kernelSource = string_format(
+            kernelSource = str_sprintf(
                 str, pragma_str.c_str(), tname.c_str(), vecSizeNames[i],
                 tname.c_str(), vecSecParam ? vecSizeNames[i] : "",
                 tname.c_str(), vecSizeNames[i], fnName.c_str());
@@ -203,13 +213,20 @@ int max_verify(const T* const x, const T* const y, const T* const out,
         {
             int k = i * vecSize + j;
             int l = (k * vecParam + i * (1 - vecParam));
-            T v = (x[k] < y[l]) ? y[l] : x[k];
+            T v = (conv_to_dbl(x[k]) < conv_to_dbl(y[l])) ? y[l] : x[k];
             if (v != out[k])
             {
-                log_error(
-                    "x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. (index %d is "
-                    "vector %d, element %d, for vector size %d)\n",
-                    k, x[k], l, y[l], k, out[k], v, k, i, j, vecSize);
+                if (std::is_same<T, half>::value)
+                    log_error("x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. "
+                              "(index %d is "
+                              "vector %d, element %d, for vector size %d)\n",
+                              k, conv_to_flt(x[k]), l, conv_to_flt(y[l]), k,
+                              conv_to_flt(out[k]), v, k, i, j, vecSize);
+                else
+                    log_error("x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. "
+                              "(index %d is "
+                              "vector %d, element %d, for vector size %d)\n",
+                              k, x[k], l, y[l], k, out[k], v, k, i, j, vecSize);
                 return -1;
             }
         }
@@ -227,13 +244,20 @@ int min_verify(const T* const x, const T* const y, const T* const out,
         {
             int k = i * vecSize + j;
             int l = (k * vecParam + i * (1 - vecParam));
-            T v = (x[k] > y[l]) ? y[l] : x[k];
+            T v = (conv_to_dbl(x[k]) > conv_to_dbl(y[l])) ? y[l] : x[k];
             if (v != out[k])
             {
-                log_error(
-                    "x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. (index %d is "
-                    "vector %d, element %d, for vector size %d)\n",
-                    k, x[k], l, y[l], k, out[k], v, k, i, j, vecSize);
+                if (std::is_same<T, half>::value)
+                    log_error("x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. "
+                              "(index %d is "
+                              "vector %d, element %d, for vector size %d)\n",
+                              k, conv_to_flt(x[k]), l, conv_to_flt(y[l]), k,
+                              conv_to_flt(out[k]), v, k, i, j, vecSize);
+                else
+                    log_error("x[%d]=%g y[%d]=%g out[%d]=%g, expected %g. "
+                              "(index %d is "
+                              "vector %d, element %d, for vector size %d)\n",
+                              k, x[k], l, y[l], k, out[k], v, k, i, j, vecSize);
                 return -1;
             }
         }
@@ -246,6 +270,13 @@ int min_verify(const T* const x, const T* const y, const T* const out,
 cl_int MaxTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_binary_fn<cl_half>(device, context, queue, num_elems,
+                                        fnName.c_str(), vecParam,
+                                        max_verify<cl_half>);
+        test_error(error, "MaxTest::Run<cl_half> failed");
+    }
 
     error = test_binary_fn<float>(device, context, queue, num_elems,
                                   fnName.c_str(), vecParam, max_verify<float>);
@@ -265,6 +296,13 @@ cl_int MaxTest::Run()
 cl_int MinTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_binary_fn<cl_half>(device, context, queue, num_elems,
+                                        fnName.c_str(), vecParam,
+                                        min_verify<cl_half>);
+        test_error(error, "MinTest::Run<cl_half> failed");
+    }
 
     error = test_binary_fn<float>(device, context, queue, num_elems,
                                   fnName.c_str(), vecParam, min_verify<float>);
diff --git a/test_conformance/commonfns/test_clamp.cpp b/test_conformance/commonfns/test_clamp.cpp
index 0e96fb602..1bf406770 100644
--- a/test_conformance/commonfns/test_clamp.cpp
+++ b/test_conformance/commonfns/test_clamp.cpp
@@ -26,12 +26,10 @@
 #include "procs.h"
 #include "test_base.h"
 
-
 #ifndef M_PI
 #define M_PI 3.14159265358979323846264338327950288
 #endif
 
-
 #define CLAMP_KERNEL(type)                                                     \
     const char *clamp_##type##_kernel_code = EMIT_PRAGMA_DIRECTIVE             \
         "__kernel void test_clamp(__global " #type " *x, __global " #type      \
@@ -64,6 +62,14 @@
         "vload3(tid,maxval)), tid, dst);\n"                                    \
         "}\n";
 
+#define EMIT_PRAGMA_DIRECTIVE "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
+CLAMP_KERNEL(half)
+CLAMP_KERNEL_V(half, 2)
+CLAMP_KERNEL_V(half, 4)
+CLAMP_KERNEL_V(half, 8)
+CLAMP_KERNEL_V(half, 16)
+CLAMP_KERNEL_V3(half, 3)
+#undef EMIT_PRAGMA_DIRECTIVE
 
 #define EMIT_PRAGMA_DIRECTIVE " "
 CLAMP_KERNEL(float)
@@ -83,6 +89,10 @@ CLAMP_KERNEL_V(double, 16)
 CLAMP_KERNEL_V3(double, 3)
 #undef EMIT_PRAGMA_DIRECTIVE
 
+const char *clamp_half_codes[] = {
+    clamp_half_kernel_code,  clamp_half2_kernel_code,  clamp_half4_kernel_code,
+    clamp_half8_kernel_code, clamp_half16_kernel_code, clamp_half3_kernel_code
+};
 const char *clamp_float_codes[] = {
     clamp_float_kernel_code,   clamp_float2_kernel_code,
     clamp_float4_kernel_code,  clamp_float8_kernel_code,
@@ -96,21 +106,42 @@ const char *clamp_double_codes[] = {
 
 namespace {
 
-
 template <typename T>
 int verify_clamp(const T *const x, const T *const minval, const T *const maxval,
                  const T *const outptr, int n)
 {
-    T t;
-    for (int i = 0; i < n; i++)
+    if (std::is_same<T, half>::value)
+    {
+        float t;
+        for (int i = 0; i < n; i++)
+        {
+            t = std::min(
+                std::max(cl_half_to_float(x[i]), cl_half_to_float(minval[i])),
+                cl_half_to_float(maxval[i]));
+            if (t != cl_half_to_float(outptr[i]))
+            {
+                log_error(
+                    "%d) verification error: clamp( %a, %a, %a) = *%a vs. %a\n",
+                    i, cl_half_to_float(x[i]), cl_half_to_float(minval[i]),
+                    cl_half_to_float(maxval[i]), t,
+                    cl_half_to_float(outptr[i]));
+                return -1;
+            }
+        }
+    }
+    else
     {
-        t = std::min(std::max(x[i], minval[i]), maxval[i]);
-        if (t != outptr[i])
+        T t;
+        for (int i = 0; i < n; i++)
         {
-            log_error(
-                "%d) verification error: clamp( %a, %a, %a) = *%a vs. %a\n", i,
-                x[i], minval[i], maxval[i], t, outptr[i]);
-            return -1;
+            t = std::min(std::max(x[i], minval[i]), maxval[i]);
+            if (t != outptr[i])
+            {
+                log_error(
+                    "%d) verification error: clamp( %a, %a, %a) = *%a vs. %a\n",
+                    i, x[i], minval[i], maxval[i], t, outptr[i]);
+                return -1;
+            }
         }
     }
 
@@ -118,7 +149,6 @@ int verify_clamp(const T *const x, const T *const minval, const T *const maxval,
 }
 }
 
-
 template <typename T>
 int test_clamp_fn(cl_device_id device, cl_context context,
                   cl_command_queue queue, int n_elems)
@@ -169,6 +199,17 @@ int test_clamp_fn(cl_device_id device, cl_context context,
             input_ptr[2][j] = get_random_double(input_ptr[1][j], 0x20000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        const float fval = CL_HALF_MAX;
+        for (j = 0; j < num_elements; j++)
+        {
+            input_ptr[0][j] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[1][j] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[2][j] = conv_to_half(
+                get_random_float(conv_to_flt(input_ptr[1][j]), fval, d));
+        }
+    }
 
     for (i = 0; i < 3; i++)
     {
@@ -194,9 +235,16 @@ int test_clamp_fn(cl_device_id device, cl_context context,
                 "test_clamp");
             test_error(err, "Unable to create kernel");
         }
+        else if (std::is_same<T, half>::value)
+        {
+            err = create_single_kernel_helper(
+                context, &programs[i], &kernels[i], 1, &clamp_half_codes[i],
+                "test_clamp");
+            test_error(err, "Unable to create kernel");
+        }
 
-        log_info("Just made a program for float, i=%d, size=%d, in slot %d\n",
-                 i, g_arrVecSizes[i], i);
+        log_info("Just made a program for %s, i=%d, size=%d, in slot %d\n",
+                 tname.c_str(), i, g_arrVecSizes[i], i);
         fflush(stdout);
 
         for (j = 0; j < 4; j++)
@@ -239,10 +287,14 @@ int test_clamp_fn(cl_device_id device, cl_context context,
     return err;
 }
 
-
 cl_int ClampTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_clamp_fn<cl_half>(device, context, queue, num_elems);
+        test_error(error, "ClampTest::Run<cl_half> failed");
+    }
 
     error = test_clamp_fn<float>(device, context, queue, num_elems);
     test_error(error, "ClampTest::Run<float> failed");
@@ -256,7 +308,6 @@ cl_int ClampTest::Run()
     return error;
 }
 
-
 int test_clamp(cl_device_id device, cl_context context, cl_command_queue queue,
                int n_elems)
 {
diff --git a/test_conformance/commonfns/test_mix.cpp b/test_conformance/commonfns/test_mix.cpp
index 92c101005..2a06e43df 100644
--- a/test_conformance/commonfns/test_mix.cpp
+++ b/test_conformance/commonfns/test_mix.cpp
@@ -18,6 +18,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include "harness/stringHelpers.h"
+
 #include "procs.h"
 #include "test_base.h"
 
@@ -52,33 +54,42 @@ const char *mix_fn_code_pattern_v3_scalar =
     "    vstore3(mix(vload3(tid, x), vload3(tid, y), a[tid]), tid, dst);\n"
     "}\n";
 
-
 #define MAX_ERR 1e-3
 
 namespace {
 
-
 template <typename T>
 int verify_mix(const T *const inptrX, const T *const inptrY,
                const T *const inptrA, const T *const outptr, const int n,
                const int veclen, const bool vecParam)
 {
-    T r;
-    float delta = 0.0f;
+    double r, o;
+    float delta = 0.f, max_delta = 0.f;
     int i;
 
     if (vecParam)
     {
         for (i = 0; i < n * veclen; i++)
         {
-            r = inptrX[i] + ((inptrY[i] - inptrX[i]) * inptrA[i]);
-            delta = fabs(double(r - outptr[i])) / r;
-            if (delta > MAX_ERR)
+            r = conv_to_dbl(inptrX[i])
+                + ((conv_to_dbl(inptrY[i]) - conv_to_dbl(inptrX[i]))
+                   * conv_to_dbl(inptrA[i]));
+
+            o = conv_to_dbl(outptr[i]);
+            delta = fabs(double(r - o)) / r;
+            if (!std::is_same<T, half>::value)
+            {
+                if (delta > MAX_ERR)
+                {
+                    log_error("%d) verification error: mix(%a, %a, %a) = *%a "
+                              "vs. %a\n",
+                              i, inptrX[i], inptrY[i], inptrA[i], r, outptr[i]);
+                    return -1;
+                }
+            }
+            else
             {
-                log_error(
-                    "%d) verification error: mix(%a, %a, %a) = *%a vs. %a\n", i,
-                    inptrX[i], inptrY[i], inptrA[i], r, outptr[i]);
-                return -1;
+                max_delta = std::max(max_delta, delta);
             }
         }
     }
@@ -90,25 +101,40 @@ int verify_mix(const T *const inptrX, const T *const inptrY,
             int vi = i * veclen;
             for (int j = 0; j < veclen; ++j, ++vi)
             {
-                r = inptrX[vi] + ((inptrY[vi] - inptrX[vi]) * inptrA[i]);
-                delta = fabs(double(r - outptr[vi])) / r;
-                if (delta > MAX_ERR)
+                r = conv_to_dbl(inptrX[vi])
+                    + ((conv_to_dbl(inptrY[vi]) - conv_to_dbl(inptrX[vi]))
+                       * conv_to_dbl(inptrA[i]));
+                delta = fabs(double(r - conv_to_dbl(outptr[vi]))) / r;
+                if (!std::is_same<T, half>::value)
                 {
-                    log_error("{%d, element %d}) verification error: mix(%a, "
-                              "%a, %a) = *%a vs. %a\n",
-                              ii, j, inptrX[vi], inptrY[vi], inptrA[i], r,
-                              outptr[vi]);
-                    return -1;
+                    if (delta > MAX_ERR)
+                    {
+                        log_error(
+                            "{%d, element %d}) verification error: mix(%a, "
+                            "%a, %a) = *%a vs. %a\n",
+                            ii, j, inptrX[vi], inptrY[vi], inptrA[i], r,
+                            outptr[vi]);
+                        return -1;
+                    }
+                }
+                else
+                {
+                    max_delta = std::max(max_delta, delta);
                 }
             }
         }
     }
 
+    // The accuracy of mix for cl_khr_fp16 is implementation-defined, so for
+    // half this test only reports the maximum error observed; it does not
+    // compare it against an error threshold.
+    if (std::is_same<T, half>::value)
+        log_error("mix half verification result, max delta: %a\n", max_delta);
+
     return 0;
 }
 } // namespace
 
-
 template <typename T>
 int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
                 int n_elems, bool vecParam)
@@ -120,7 +146,7 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
     std::vector<clKernelWrapper> kernels;
 
     int err, i;
-    MTdataHolder d = MTdataHolder(gRandomSeed);
+    MTdataHolder d(gRandomSeed);
 
     assert(BaseFunctionTest::type2name.find(sizeof(T))
            != BaseFunctionTest::type2name.end());
@@ -142,19 +168,32 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
         test_error(err, "clCreateBuffer failed");
     }
 
-    for (i = 0; i < num_elements; i++)
-    {
-        input_ptr[0][i] = (T)genrand_real1(d);
-        input_ptr[1][i] = (T)genrand_real1(d);
-        input_ptr[2][i] = (T)genrand_real1(d);
-    }
-
     std::string pragma_str;
     if (std::is_same<T, double>::value)
     {
         pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
     }
 
+    if (std::is_same<T, half>::value)
+    {
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = conv_to_half((float)genrand_real1(d));
+            input_ptr[1][i] = conv_to_half((float)genrand_real1(d));
+            input_ptr[2][i] = conv_to_half((float)genrand_real1(d));
+        }
+    }
+    else
+    {
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = (T)genrand_real1(d);
+            input_ptr[1][i] = (T)genrand_real1(d);
+            input_ptr[2][i] = (T)genrand_real1(d);
+        }
+    }
+
     for (i = 0; i < 3; i++)
     {
         err = clEnqueueWriteBuffer(queue, streams[i], CL_TRUE, 0,
@@ -164,7 +203,6 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
     }
 
     char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
-
     for (i = 0; i < kTotalVecCount; i++)
     {
         std::string kernelSource;
@@ -174,15 +212,15 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
             {
                 std::string str = mix_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
             else
             {
                 std::string str = mix_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
         }
         else
@@ -190,10 +228,10 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
             // regular path
             std::string str = mix_fn_code_pattern;
             kernelSource =
-                string_format(str, pragma_str.c_str(), tname.c_str(),
-                              vecSizeNames[i], tname.c_str(), vecSizeNames[i],
-                              tname.c_str(), vecParam ? vecSizeNames[i] : "",
-                              tname.c_str(), vecSizeNames[i]);
+                str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                            vecSizeNames[i], tname.c_str(), vecSizeNames[i],
+                            tname.c_str(), vecParam ? vecSizeNames[i] : "",
+                            tname.c_str(), vecSizeNames[i]);
         }
         const char *programPtr = kernelSource.c_str();
         err =
@@ -242,10 +280,14 @@ int test_mix_fn(cl_device_id device, cl_context context, cl_command_queue queue,
     return err;
 }
 
-
 cl_int MixTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_mix_fn<half>(device, context, queue, num_elems, vecParam);
+        test_error(error, "MixTest::Run<cl_half> failed");
+    }
 
     error = test_mix_fn<float>(device, context, queue, num_elems, vecParam);
     test_error(error, "MixTest::Run<float> failed");
@@ -260,7 +302,6 @@ cl_int MixTest::Run()
     return error;
 }
 
-
 int test_mix(cl_device_id device, cl_context context, cl_command_queue queue,
              int n_elems)
 {
@@ -268,7 +309,6 @@ int test_mix(cl_device_id device, cl_context context, cl_command_queue queue,
                                    true);
 }
 
-
 int test_mixf(cl_device_id device, cl_context context, cl_command_queue queue,
               int n_elems)
 {
diff --git a/test_conformance/commonfns/test_smoothstep.cpp b/test_conformance/commonfns/test_smoothstep.cpp
index 31948d3fe..5afc2d0f2 100644
--- a/test_conformance/commonfns/test_smoothstep.cpp
+++ b/test_conformance/commonfns/test_smoothstep.cpp
@@ -18,10 +18,11 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include "harness/stringHelpers.h"
+
 #include "procs.h"
 #include "test_base.h"
 
-
 const char *smoothstep_fn_code_pattern =
     "%s\n" /* optional pragma */
     "__kernel void test_fn(__global %s%s *e0, __global %s%s *e1, __global %s%s "
@@ -53,38 +54,43 @@ const char *smoothstep_fn_code_pattern_v3_scalar =
     "    vstore3(smoothstep(e0[tid], e1[tid], vload3(tid,x)), tid, dst);\n"
     "}\n";
 
-
 #define MAX_ERR (1e-5f)
 
 namespace {
 
-
 template <typename T>
 int verify_smoothstep(const T *const edge0, const T *const edge1,
                       const T *const x, const T *const outptr, const int n,
                       const int veclen, const bool vecParam)
 {
-    T r, t;
-    float delta = 0;
+    double r, t;
+    float delta = 0, max_delta = 0;
 
     if (vecParam)
     {
         for (int i = 0; i < n * veclen; i++)
         {
-            t = (x[i] - edge0[i]) / (edge1[i] - edge0[i]);
-            if (t < 0.0f)
-                t = 0.0f;
-            else if (t > 1.0f)
-                t = 1.0f;
-            r = t * t * (3.0f - 2.0f * t);
-            delta = (float)fabs(r - outptr[i]);
-            if (delta > MAX_ERR)
+            t = (conv_to_dbl(x[i]) - conv_to_dbl(edge0[i]))
+                / (conv_to_dbl(edge1[i]) - conv_to_dbl(edge0[i]));
+            if (t < 0.0)
+                t = 0.0;
+            else if (t > 1.0)
+                t = 1.0;
+            r = t * t * (3.0 - 2.0 * t);
+            delta = (float)fabs(r - conv_to_dbl(outptr[i]));
+            if (!std::is_same<T, half>::value)
             {
-                log_error("%d) verification error: smoothstep(%a, %a, %a) = "
-                          "*%a vs. %a\n",
-                          i, x[i], edge0[i], edge1[i], r, outptr[i]);
-                return -1;
+                if (delta > MAX_ERR)
+                {
+                    log_error(
+                        "%d) verification error: smoothstep(%a, %a, %a) = "
+                        "*%a vs. %a\n",
+                        i, x[i], edge0[i], edge1[i], r, outptr[i]);
+                    return -1;
+                }
             }
+            else
+                max_delta = std::max(max_delta, delta);
         }
     }
     else
@@ -95,32 +101,48 @@ int verify_smoothstep(const T *const edge0, const T *const edge1,
             int vi = i * veclen;
             for (int j = 0; j < veclen; ++j, ++vi)
             {
-                t = (x[vi] - edge0[i]) / (edge1[i] - edge0[i]);
-                if (t < 0.0f)
-                    t = 0.0f;
-                else if (t > 1.0f)
-                    t = 1.0f;
-                r = t * t * (3.0f - 2.0f * t);
-                delta = (float)fabs(r - outptr[vi]);
-                if (delta > MAX_ERR)
+                t = (conv_to_dbl(x[vi]) - conv_to_dbl(edge0[i]))
+                    / (conv_to_dbl(edge1[i]) - conv_to_dbl(edge0[i]));
+                if (t < 0.0)
+                    t = 0.0;
+                else if (t > 1.0)
+                    t = 1.0;
+                r = t * t * (3.0 - 2.0 * t);
+                delta = (float)fabs(r - conv_to_dbl(outptr[vi]));
+
+                if (!std::is_same<T, half>::value)
                 {
-                    log_error("{%d, element %d}) verification error: "
-                              "smoothstep(%a, %a, %a) = *%a vs. %a\n",
-                              ii, j, x[vi], edge0[i], edge1[i], r, outptr[vi]);
-                    return -1;
+                    if (delta > MAX_ERR)
+                    {
+                        log_error("{%d, element %d}) verification error: "
+                                  "smoothstep(%a, %a, %a) = *%a vs. %a\n",
+                                  ii, j, x[vi], edge0[i], edge1[i], r,
+                                  outptr[vi]);
+                        return -1;
+                    }
                 }
+                else
+                    max_delta = std::max(max_delta, delta);
             }
         }
     }
+
+    // The accuracy of smoothstep for cl_khr_fp16 is implementation-defined,
+    // so for half this test only reports the maximum error observed; it does
+    // not compare it against an error threshold.
+    if (std::is_same<T, half>::value)
+        log_error("smoothstep half verification result, max delta: %a\n",
+                  max_delta);
+
     return 0;
 }
 
 }
 
-
 template <typename T>
 int test_smoothstep_fn(cl_device_id device, cl_context context,
-                       cl_command_queue queue, int n_elems, bool vecParam)
+                       cl_command_queue queue, const int n_elems,
+                       const bool vecParam)
 {
     clMemWrapper streams[4];
     std::vector<T> input_ptr[3], output_ptr;
@@ -170,6 +192,17 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
             input_ptr[2][i] = get_random_double(-0x20000000, 0x20000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = conv_to_half(get_random_float(-65503, 65503, d));
+            input_ptr[1][i] = conv_to_half(
+                get_random_float(conv_to_flt(input_ptr[0][i]), 65503, d));
+            input_ptr[2][i] = conv_to_half(get_random_float(-65503, 65503, d));
+        }
+    }
 
     for (i = 0; i < 3; i++)
     {
@@ -179,7 +212,7 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
         test_error(err, "Unable to write input buffer");
     }
 
-    char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
+    const char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
 
     for (i = 0; i < kTotalVecCount; i++)
     {
@@ -190,15 +223,15 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
             {
                 std::string str = smoothstep_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
             else
             {
                 std::string str = smoothstep_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str(), tname.c_str());
             }
         }
         else
@@ -206,11 +239,12 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
             // regular path
             std::string str = smoothstep_fn_code_pattern;
             kernelSource =
-                string_format(str, pragma_str.c_str(), tname.c_str(),
-                              vecParam ? vecSizeNames[i] : "", tname.c_str(),
-                              vecParam ? vecSizeNames[i] : "", tname.c_str(),
-                              vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
+                str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                            vecParam ? vecSizeNames[i] : "", tname.c_str(),
+                            vecParam ? vecSizeNames[i] : "", tname.c_str(),
+                            vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
         }
+
         const char *programPtr = kernelSource.c_str();
         err =
             create_single_kernel_helper(context, &programs[i], &kernels[i], 1,
@@ -259,10 +293,15 @@ int test_smoothstep_fn(cl_device_id device, cl_context context,
     return err;
 }
 
-
 cl_int SmoothstepTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_smoothstep_fn<half>(device, context, queue, num_elems,
+                                         vecParam);
+        test_error(error, "SmoothstepTest::Run<cl_half> failed");
+    }
 
     error =
         test_smoothstep_fn<float>(device, context, queue, num_elems, vecParam);
@@ -278,7 +317,6 @@ cl_int SmoothstepTest::Run()
     return error;
 }
 
-
 int test_smoothstep(cl_device_id device, cl_context context,
                     cl_command_queue queue, int n_elems)
 {
@@ -286,7 +324,6 @@ int test_smoothstep(cl_device_id device, cl_context context,
                                           "smoothstep", true);
 }
 
-
 int test_smoothstepf(cl_device_id device, cl_context context,
                      cl_command_queue queue, int n_elems)
 {
diff --git a/test_conformance/commonfns/test_step.cpp b/test_conformance/commonfns/test_step.cpp
index dc91766e9..1cfa96eab 100644
--- a/test_conformance/commonfns/test_step.cpp
+++ b/test_conformance/commonfns/test_step.cpp
@@ -18,10 +18,11 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include "harness/stringHelpers.h"
+
 #include "procs.h"
 #include "test_base.h"
 
-
 const char *step_fn_code_pattern = "%s\n" /* optional pragma */
                                    "__kernel void test_fn(__global %s%s *edge, "
                                    "__global %s%s *x, __global %s%s *dst)\n"
@@ -48,7 +49,6 @@ const char *step_fn_code_pattern_v3_scalar =
     "    vstore3(step(edge[tid], vload3(tid,x)), tid, dst);\n"
     "}\n";
 
-
 namespace {
 
 template <typename T>
@@ -62,8 +62,8 @@ int verify_step(const T *const inptrA, const T *const inptrB,
     {
         for (int i = 0; i < n * veclen; i++)
         {
-            r = (inptrB[i] < inptrA[i]) ? 0.0 : 1.0;
-            if (r != outptr[i]) return -1;
+            r = (conv_to_dbl(inptrB[i]) < conv_to_dbl(inptrA[i])) ? 0.0 : 1.0;
+            if (r != conv_to_dbl(outptr[i])) return -1;
         }
     }
     else
@@ -73,24 +73,31 @@ int verify_step(const T *const inptrA, const T *const inptrB,
             int ii = i / veclen;
             for (int j = 0; j < veclen && i < n; ++j, ++i)
             {
-                r = (inptrB[i] < inptrA[ii]) ? 0.0f : 1.0f;
-                if (r != outptr[i])
+                r = (conv_to_dbl(inptrB[i]) < conv_to_dbl(inptrA[ii])) ? 0.0f
+                                                                       : 1.0f;
+                if (r != conv_to_dbl(outptr[i]))
                 {
-                    log_error("Failure @ {%d, element %d}: step(%a,%a) -> *%a "
-                              "vs %a\n",
-                              ii, j, inptrA[ii], inptrB[i], r, outptr[i]);
+                    if (std::is_same<T, half>::value)
+                        log_error(
+                            "Failure @ {%d, element %d}: step(%a,%a) -> *%a "
+                            "vs %a\n",
+                            ii, j, conv_to_flt(inptrA[ii]),
+                            conv_to_flt(inptrB[i]), r, conv_to_flt(outptr[i]));
+                    else
+                        log_error(
+                            "Failure @ {%d, element %d}: step(%a,%a) -> *%a "
+                            "vs %a\n",
+                            ii, j, inptrA[ii], inptrB[i], r, outptr[i]);
                     return -1;
                 }
             }
         }
     }
-
     return 0;
 }
 
 }
 
-
 template <typename T>
 int test_step_fn(cl_device_id device, cl_context context,
                  cl_command_queue queue, int n_elems, bool vecParam)
@@ -140,6 +147,16 @@ int test_step_fn(cl_device_id device, cl_context context,
             input_ptr[1][i] = get_random_double(-0x40000000, 0x40000000, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        const float fval = CL_HALF_MAX;
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (i = 0; i < num_elements; i++)
+        {
+            input_ptr[0][i] = conv_to_half(get_random_float(-fval, fval, d));
+            input_ptr[1][i] = conv_to_half(get_random_float(-fval, fval, d));
+        }
+    }
 
     for (i = 0; i < 2; i++)
     {
@@ -160,15 +177,15 @@ int test_step_fn(cl_device_id device, cl_context context,
             {
                 std::string str = step_fn_code_pattern_v3;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str());
             }
             else
             {
                 std::string str = step_fn_code_pattern_v3_scalar;
                 kernelSource =
-                    string_format(str, pragma_str.c_str(), tname.c_str(),
-                                  tname.c_str(), tname.c_str());
+                    str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                tname.c_str(), tname.c_str());
             }
         }
         else
@@ -176,9 +193,9 @@ int test_step_fn(cl_device_id device, cl_context context,
             // regular path
             std::string str = step_fn_code_pattern;
             kernelSource =
-                string_format(str, pragma_str.c_str(), tname.c_str(),
-                              vecParam ? vecSizeNames[i] : "", tname.c_str(),
-                              vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
+                str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                            vecParam ? vecSizeNames[i] : "", tname.c_str(),
+                            vecSizeNames[i], tname.c_str(), vecSizeNames[i]);
         }
         const char *programPtr = kernelSource.c_str();
         err =
@@ -229,10 +246,14 @@ int test_step_fn(cl_device_id device, cl_context context,
     return err;
 }
 
-
 cl_int StepTest::Run()
 {
     cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_step_fn<half>(device, context, queue, num_elems, vecParam);
+        test_error(error, "StepTest::Run<cl_half> failed");
+    }
 
     error = test_step_fn<float>(device, context, queue, num_elems, vecParam);
     test_error(error, "StepTest::Run<float> failed");
@@ -247,7 +268,6 @@ cl_int StepTest::Run()
     return error;
 }
 
-
 int test_step(cl_device_id device, cl_context context, cl_command_queue queue,
               int n_elems)
 {
@@ -255,7 +275,6 @@ int test_step(cl_device_id device, cl_context context, cl_command_queue queue,
                                     true);
 }
 
-
 int test_stepf(cl_device_id device, cl_context context, cl_command_queue queue,
                int n_elems)
 {
diff --git a/test_conformance/commonfns/test_unary_fn.cpp b/test_conformance/commonfns/test_unary_fn.cpp
index fed4389d9..91b5c215b 100644
--- a/test_conformance/commonfns/test_unary_fn.cpp
+++ b/test_conformance/commonfns/test_unary_fn.cpp
@@ -21,6 +21,7 @@
 #include <vector>
 
 #include "harness/deviceInfo.h"
+#include "harness/stringHelpers.h"
 #include "harness/typeWrappers.h"
 
 #include "procs.h"
@@ -30,7 +31,6 @@
 #define M_PI 3.14159265358979323846264338327950288
 #endif
 
-
 // clang-format off
 const char *unary_fn_code_pattern =
 "%s\n" /* optional pragma */
@@ -51,23 +51,10 @@ const char *unary_fn_code_pattern_v3 =
 "}\n";
 // clang-format on
 
-
 #define MAX_ERR 2.0f
 
 namespace {
 
-
-template <typename T> float UlpFn(const T &val, const double &r)
-{
-    if (std::is_same<T, double>::value)
-        return Ulp_Error_Double(val, r);
-    else if (std::is_same<T, float>::value)
-        return Ulp_Error(val, r);
-    else if (std::is_same<T, half>::value)
-        return Ulp_Error(val, r);
-}
-
-
 template <typename T>
 int verify_degrees(const T *const inptr, const T *const outptr, int n)
 {
@@ -77,7 +64,11 @@ int verify_degrees(const T *const inptr, const T *const outptr, int n)
 
     for (int i = 0, j = 0; i < n; i++, j++)
     {
-        r = (180.0 / M_PI) * inptr[i];
+        r = (180.0 / M_PI) * conv_to_dbl(inptr[i]);
+
+        if (std::is_same<T, half>::value)
+            if (!isfinite_fp(conv_to_half(r)) && !isfinite_fp(outptr[i]))
+                continue;
 
         error = UlpFn(outptr[i], r);
 
@@ -88,21 +79,32 @@ int verify_degrees(const T *const inptr, const T *const outptr, int n)
             max_val = r;
             if (fabsf(error) > MAX_ERR)
             {
-                log_error("%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n",
-                          i, inptr[i], r, outptr[i], r, outptr[i], error);
+                if (std::is_same<T, half>::value)
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        conv_to_flt(inptr[i]), r, conv_to_flt(outptr[i]), r,
+                        conv_to_flt(outptr[i]), error);
+                else
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        inptr[i], r, outptr[i], r, outptr[i], error);
                 return 1;
             }
         }
     }
 
-    log_info("degrees: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
-             max_error, max_index, max_val, outptr[max_index], max_val,
-             outptr[max_index]);
+    if (std::is_same<T, half>::value)
+        log_info("degrees: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, conv_to_flt(outptr[max_index]),
+                 max_val, conv_to_flt(outptr[max_index]));
+    else
+        log_info("degrees: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, outptr[max_index], max_val,
+                 outptr[max_index]);
 
     return 0;
 }
 
-
 template <typename T>
 int verify_radians(const T *const inptr, const T *const outptr, int n)
 {
@@ -112,8 +114,14 @@ int verify_radians(const T *const inptr, const T *const outptr, int n)
 
     for (int i = 0, j = 0; i < n; i++, j++)
     {
-        r = (M_PI / 180.0) * inptr[i];
-        error = Ulp_Error(outptr[i], r);
+        r = (M_PI / 180.0) * conv_to_dbl(inptr[i]);
+
+        if (std::is_same<T, half>::value)
+            if (!isfinite_fp(conv_to_half(r)) && !isfinite_fp(outptr[i]))
+                continue;
+
+        error = UlpFn(outptr[i], r);
+
         if (fabsf(error) > max_error)
         {
             max_error = error;
@@ -121,41 +129,51 @@ int verify_radians(const T *const inptr, const T *const outptr, int n)
             max_val = r;
             if (fabsf(error) > MAX_ERR)
             {
-                log_error("%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n",
-                          i, inptr[i], r, outptr[i], r, outptr[i], error);
+                if (std::is_same<T, half>::value)
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        conv_to_flt(inptr[i]), r, conv_to_flt(outptr[i]), r,
+                        conv_to_flt(outptr[i]), error);
+                else
+                    log_error(
+                        "%d) Error @ %a: *%a vs %a  (*%g vs %g) ulps: %f\n", i,
+                        inptr[i], r, outptr[i], r, outptr[i], error);
                 return 1;
             }
         }
     }
 
-    log_info("radians: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
-             max_error, max_index, max_val, outptr[max_index], max_val,
-             outptr[max_index]);
+    if (std::is_same<T, half>::value)
+        log_info("radians: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, conv_to_flt(outptr[max_index]),
+                 max_val, conv_to_flt(outptr[max_index]));
+    else
+        log_info("radians: Max error %f ulps at %d: *%a vs %a  (*%g vs %g)\n",
+                 max_error, max_index, max_val, outptr[max_index], max_val,
+                 outptr[max_index]);
 
     return 0;
 }
 
-
 template <typename T>
 int verify_sign(const T *const inptr, const T *const outptr, int n)
 {
-    T r = 0;
+    double r = 0;
     for (int i = 0; i < n; i++)
     {
-        if (inptr[i] > 0.0f)
+        if (conv_to_dbl(inptr[i]) > 0.0f)
             r = 1.0;
-        else if (inptr[i] < 0.0f)
+        else if (conv_to_dbl(inptr[i]) < 0.0f)
             r = -1.0;
         else
             r = 0.0;
-        if (r != outptr[i]) return -1;
+        if (r != conv_to_dbl(outptr[i])) return -1;
     }
     return 0;
 }
 
 }
 
-
 template <typename T>
 int test_unary_fn(cl_device_id device, cl_context context,
                   cl_command_queue queue, int n_elems,
@@ -207,33 +225,38 @@ int test_unary_fn(cl_device_id device, cl_context context,
                 get_random_double(-100000.0 * M_PI, 100000.0 * M_PI, d);
         }
     }
+    else if (std::is_same<T, half>::value)
+    {
+        pragma_str = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        for (int j = 0; j < num_elements; j++)
+        {
+            input_ptr[j] = conv_to_half(get_random_float(
+                (float)(-10000.f * M_PI), (float)(10000.f * M_PI), d));
+        }
+    }
 
     err = clEnqueueWriteBuffer(queue, streams[0], true, 0,
                                sizeof(T) * num_elements, &input_ptr.front(), 0,
                                NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-        log_error("clEnqueueWriteBuffer failed\n");
-        return -1;
-    }
+    test_error(err, "clEnqueueWriteBuffer failed\n");
 
     for (i = 0; i < kTotalVecCount; i++)
     {
         std::string kernelSource;
-        char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
+        const char vecSizeNames[][3] = { "", "2", "4", "8", "16", "3" };
 
         if (i >= kVectorSizeCount)
         {
             std::string str = unary_fn_code_pattern_v3;
-            kernelSource = string_format(str, pragma_str.c_str(), tname.c_str(),
-                                         tname.c_str(), fnName.c_str());
+            kernelSource = str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                       tname.c_str(), fnName.c_str());
         }
         else
         {
             std::string str = unary_fn_code_pattern;
-            kernelSource = string_format(str, pragma_str.c_str(), tname.c_str(),
-                                         vecSizeNames[i], tname.c_str(),
-                                         vecSizeNames[i], fnName.c_str());
+            kernelSource = str_sprintf(str, pragma_str.c_str(), tname.c_str(),
+                                       vecSizeNames[i], tname.c_str(),
+                                       vecSizeNames[i], fnName.c_str());
         }
 
         /* Create kernels */
@@ -290,11 +313,18 @@ int test_unary_fn(cl_device_id device, cl_context context,
     return err;
 }
 
-
 cl_int DegreesTest::Run()
 {
-    cl_int error = test_unary_fn<float>(device, context, queue, num_elems,
-                                        fnName.c_str(), verify_degrees<float>);
+    cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_unary_fn<half>(device, context, queue, num_elems,
+                                    fnName.c_str(), verify_degrees<half>);
+        test_error(error, "DegreesTest::Run<cl_half> failed");
+    }
+
+    error = test_unary_fn<float>(device, context, queue, num_elems,
+                                 fnName.c_str(), verify_degrees<float>);
     test_error(error, "DegreesTest::Run<float> failed");
 
     if (is_extension_available(device, "cl_khr_fp64"))
@@ -307,11 +337,18 @@ cl_int DegreesTest::Run()
     return error;
 }
 
-
 cl_int RadiansTest::Run()
 {
-    cl_int error = test_unary_fn<float>(device, context, queue, num_elems,
-                                        fnName.c_str(), verify_radians<float>);
+    cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_unary_fn<half>(device, context, queue, num_elems,
+                                    fnName.c_str(), verify_radians<half>);
+        test_error(error, "RadiansTest::Run<cl_half> failed");
+    }
+
+    error = test_unary_fn<float>(device, context, queue, num_elems,
+                                 fnName.c_str(), verify_radians<float>);
     test_error(error, "RadiansTest::Run<float> failed");
 
     if (is_extension_available(device, "cl_khr_fp64"))
@@ -324,11 +361,18 @@ cl_int RadiansTest::Run()
     return error;
 }
 
-
 cl_int SignTest::Run()
 {
-    cl_int error = test_unary_fn<float>(device, context, queue, num_elems,
-                                        fnName.c_str(), verify_sign<float>);
+    cl_int error = CL_SUCCESS;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        error = test_unary_fn<half>(device, context, queue, num_elems,
+                                    fnName.c_str(), verify_sign<half>);
+        test_error(error, "SignTest::Run<cl_half> failed");
+    }
+
+    error = test_unary_fn<float>(device, context, queue, num_elems,
+                                 fnName.c_str(), verify_sign<float>);
     test_error(error, "SignTest::Run<float> failed");
 
     if (is_extension_available(device, "cl_khr_fp64"))
@@ -341,7 +385,6 @@ cl_int SignTest::Run()
     return error;
 }
 
-
 int test_degrees(cl_device_id device, cl_context context,
                  cl_command_queue queue, int n_elems)
 {
@@ -349,7 +392,6 @@ int test_degrees(cl_device_id device, cl_context context,
                                        "degrees");
 }
 
-
 int test_radians(cl_device_id device, cl_context context,
                  cl_command_queue queue, int n_elems)
 {
@@ -357,7 +399,6 @@ int test_radians(cl_device_id device, cl_context context,
                                        "radians");
 }
 
-
 int test_sign(cl_device_id device, cl_context context, cl_command_queue queue,
               int n_elems)
 {
diff --git a/test_conformance/compiler/test_compile.cpp b/test_conformance/compiler/test_compile.cpp
index f3ee43122..d250bdd47 100644
--- a/test_conformance/compiler/test_compile.cpp
+++ b/test_conformance/compiler/test_compile.cpp
@@ -462,7 +462,7 @@ int test_large_multiple_embedded_headers(cl_context context, cl_device_id device
         header_names[i] = _strdup(buffer);
 
         sprintf(buffer, composite_kernel_extern_template, i);
-        const char* line = _strdup(buffer);
+        const char *line = buffer;
         error = create_single_kernel_helper_create_program(context, &headers[i], 1, &line);
         if( headers[i] == NULL || error != CL_SUCCESS )
         {
diff --git a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
index d53af8dc7..89626b797 100644
--- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
+++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
@@ -76,6 +76,7 @@ const char *known_extensions[] = {
     "cl_khr_device_uuid",
     "cl_khr_pci_bus_info",
     "cl_khr_suggested_local_work_size",
+    "cl_khr_expect_assume",
     "cl_khr_spirv_linkonce_odr",
     "cl_khr_semaphore",
     "cl_khr_external_semaphore",
diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index dff9788c7..851696406 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -15,7 +15,6 @@
 //
 #include "harness/testHarness.h"
 #include "harness/compat.h"
-#include "harness/rounding_mode.h"
 #include "harness/ThreadPool.h"
 
 #if defined(__APPLE__)
@@ -102,6 +101,7 @@ MTdata gMTdata;
 const char **argList = NULL;
 int argCount = 0;
 
+
 double SubtractTime(uint64_t endTime, uint64_t startTime);
 
 cl_half_rounding_mode DataInitInfo::halfRoundingMode = CL_HALF_RTE;
@@ -264,6 +264,7 @@ std::vector<double> DataInitInfo::specialValuesDouble = {
 };
 // clang-format on
 
+
 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
 // (that's x87 default state).  This causes problems with the tests that
 // convert long and ulong to float and double or otherwise deal with values
@@ -351,6 +352,7 @@ int CalcRefValsPat<InType, OutType, InFP, OutFP>::check_result(void *test,
     return 0;
 }
 
+
 cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
 {
     if (0 == (x & (x - 1))) return x;
@@ -360,6 +362,7 @@ cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
     return x + x;
 }
 
+
 cl_int CustomConversionsTest::Run()
 {
     int startMinVectorSize = gMinVectorSize;
@@ -391,8 +394,7 @@ cl_int CustomConversionsTest::Run()
             continue;
         }
 
-
-        // skip double if we don't have it
+        // skip half if we don't have it
         if (!gTestHalfs && (inType == khalf || outType == khalf))
         {
             if (gHasHalfs)
@@ -400,7 +402,7 @@ cl_int CustomConversionsTest::Run()
                 vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
                            gTypeNames[outType], gSaturationNames[sat],
                            gRoundingModeNames[round], gTypeNames[inType]);
-                vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
+                vlog("\t\tcl_khr_fp16 enabled, but half testing turned "
                      "off.\n");
             }
             continue;
@@ -440,6 +442,7 @@ cl_int CustomConversionsTest::Run()
     return gFailCount;
 }
 
+
 ConversionsTest::ConversionsTest(cl_device_id device, cl_context context,
                                  cl_command_queue queue)
     : context(context), device(device), queue(queue), num_elements(0),
@@ -448,6 +451,7 @@ ConversionsTest::ConversionsTest(cl_device_id device, cl_context context,
                      cl_double(0), cl_ulong(0), cl_long(0) })
 {}
 
+
 cl_int ConversionsTest::Run()
 {
     IterOverTypes iter(typeIterator, *this);
@@ -457,6 +461,7 @@ cl_int ConversionsTest::Run()
     return gFailCount;
 }
 
+
 cl_int ConversionsTest::SetUp(int elements)
 {
     num_elements = elements;
@@ -474,7 +479,7 @@ cl_int ConversionsTest::SetUp(int elements)
             DataInitInfo::halfRoundingMode = CL_HALF_RTZ;
             ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTZ;
         }
-        else // CL_FP_ROUND_TO_INF ??
+        else
         {
             log_error("Error while acquiring half rounding mode");
             return TEST_FAIL;
@@ -542,7 +547,7 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
                 continue;
             }
 
-            // skip double if we don't have it
+            // skip half if we don't have it
             if (!gTestHalfs && (inType == khalf || outType == khalf))
             {
                 if (gHasHalfs)
@@ -550,7 +555,7 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
                     vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
                                gTypeNames[outType], gSaturationNames[sat],
                                gRoundingModeNames[round], gTypeNames[inType]);
-                    vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
+                    vlog("\t\tcl_khr_fp16 enabled, but half testing turned "
                          "off.\n");
                 }
                 continue;
@@ -587,7 +592,6 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
     cl_ulong wall_start = mach_absolute_time();
 #endif
 
-    uint64_t lastCase = 1ULL << (8 * gTypeSizes[inType]);
     cl_uint threads = GetThreadCount();
 
     DataInitInfo info = { 0, 0, outType, inType, sat, round, threads };
@@ -655,7 +659,9 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
 
     // Figure out how many elements are in a work block
     // we handle 64-bit types a bit differently.
-    if (8 * gTypeSizes[inType] > 32) lastCase = 0x100000000ULL;
+    uint64_t lastCase = (8 * gTypeSizes[inType] > 32)
+        ? 0x100000000ULL
+        : 1ULL << (8 * gTypeSizes[inType]);
 
     if (!gWimpyMode && gIsEmbedded)
         step = blockCount * EMBEDDED_REDUCTION_FACTOR;
@@ -965,6 +971,7 @@ static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
         allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
 }
 
+
 void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &ptr);
 
 void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
@@ -1005,6 +1012,7 @@ void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &info)
     // destroyed automatically soon after we exit.
 }
 
+
 void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
                                              void *data)
 {
@@ -1233,7 +1241,6 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
             if (inType == kfloat || outType == kfloat)
                 setAllowZ((uint8_t *)a, (uint32_t *)s, count);
         }
-
         if (gForceHalfFTZ)
         {
             if (inType == khalf || outType == khalf)
@@ -1499,6 +1506,8 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
     return program;
 }
 
+//
+
 int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount)
 {
     // The global dimensions are just the blockCount to execute since we haven't
@@ -1524,6 +1533,7 @@ int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount)
     return 0;
 }
 
+
 int GetTestCase(const char *name, Type *outType, Type *inType,
                 SaturationMode *sat, RoundingMode *round)
 {
diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index c4310646a..f5646fce0 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h
index 837b16772..2f408cf7e 100644
--- a/test_conformance/conversions/conversions_data_info.h
+++ b/test_conformance/conversions/conversions_data_info.h
@@ -53,7 +53,6 @@ typedef enum
     kSaturationModeCount
 } SaturationMode;
 
-
 struct DataInitInfo
 {
     cl_ulong start;
@@ -64,18 +63,15 @@ struct DataInitInfo
     RoundingMode round;
     cl_uint threads;
 
-
     static cl_half_rounding_mode halfRoundingMode;
     static std::vector<uint32_t> specialValuesUInt;
     static std::vector<float> specialValuesFloat;
     static std::vector<double> specialValuesDouble;
 };
 
-
 #define HFF(num) cl_half_from_float(num, DataInitInfo::halfRoundingMode)
 #define HTF(num) cl_half_to_float(num)
 
-
 struct DataInitBase : public DataInitInfo
 {
     virtual ~DataInitBase() = default;
@@ -86,7 +82,6 @@ struct DataInitBase : public DataInitInfo
     virtual void init(const cl_uint &, const cl_uint &) {}
 };
 
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 struct DataInfoSpec : public DataInitBase
 {
@@ -110,7 +105,6 @@ struct DataInfoSpec : public DataInitBase
 
     std::vector<MTdataHolder> mdv;
 
-
     constexpr bool is_in_half() const
     {
         return (std::is_same<InType, cl_half>::value && InFP);
@@ -135,7 +129,6 @@ struct DataInfoSpec : public DataInitBase
 
     void init(const cl_uint &, const cl_uint &) override;
     InType clamp(const InType &);
-
     inline float fclamp(float lo, float v, float hi)
     {
         v = v < lo ? lo : v;
@@ -175,16 +168,16 @@ DataInfoSpec<InType, OutType, InFP, OutFP>::DataInfoSpec(
     else if (std::is_same<cl_long, OutType>::value)
         ranges = std::make_pair(CL_LONG_MIN, CL_LONG_MAX);
 
-    InType outMin = ((InType)ranges.first);
-    InType outMax = ((InType)ranges.second);
-
     // clang-format off
     // for readability sake keep this section unformatted
     if (std::is_floating_point<InType>::value)
     { // from float/double
+        InType outMin = static_cast<InType>(ranges.first);
+        InType outMax = static_cast<InType>(ranges.second);
+
         InType eps = std::is_same<InType, cl_float>::value ? (InType) FLT_EPSILON : (InType) DBL_EPSILON;
         if (std::is_integral<OutType>::value)
-        { // to char/uchar/short/ushort/half/int/uint/long/ulong
+        { // to char/uchar/short/ushort/int/uint/long/ulong/half
             if (sizeof(OutType)<=sizeof(cl_short))
             { // to char/uchar/short/ushort/half
                 clamp_ranges=
@@ -449,7 +442,9 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
                                              // always convert to +0.0
             }
 #else
-            *out = (*in == 0 ? 0.0 : (OutType)*in);
+            // Use volatile to prevent optimization by Clang compiler
+            volatile InType vi = *in;
+            *out = (vi == 0 ? 0.0 : static_cast<OutType>(vi));
 #endif
         }
         else if (std::is_same<cl_float, OutType>::value || is_out_half())
@@ -510,14 +505,23 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
     else
     {
         if (std::is_same<cl_float, OutType>::value)
-            *out = (*in == 0 ? 0.f : *in); // Per IEEE-754-2008 5.4.1, 0's
-                                           // always convert to +0.0
+        {
+            // Use volatile to prevent optimization by Clang compiler
+            volatile InType vi = *in;
+            // Per IEEE-754-2008 5.4.1, 0 always converts to +0.0
+            *out = (vi == 0 ? 0.0f : vi);
+        }
         else if (std::is_same<cl_double, OutType>::value)
+        {
+            // Per IEEE-754-2008 5.4.1, 0 always converts to +0.0
             *out = (*in == 0 ? 0.0 : *in);
+        }
         else if (is_out_half())
             *out = static_cast<OutType>(HFF(*in == 0 ? 0.f : *in));
         else
+        {
             *out = (OutType)*in;
+        }
     }
 }
 
diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp
index 88dca69fd..b7d6b0715 100644
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -13,7 +13,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/rounding_mode.h"
 #include "harness/ThreadPool.h"
 #include "harness/testHarness.h"
 #include "harness/parseParameters.h"
@@ -119,7 +118,6 @@ const int test_num = ARRAY_SIZE(test_list);
 int main(int argc, const char **argv)
 {
     int error;
-    cl_uint seed = (cl_uint)time(NULL);
 
     argc = parseCustomParam(argc, argv);
     if (argc == -1)
@@ -146,8 +144,8 @@ int main(int argc, const char **argv)
 #endif
 
     vlog("===========================================================\n");
-    vlog("Random seed: %u\n", seed);
-    gMTdata = init_genrand(seed);
+    vlog("Random seed: %u\n", gRandomSeed);
+    gMTdata = init_genrand(gRandomSeed);
 
     const char *arg[] = { argv[0] };
     int ret =
@@ -523,8 +521,6 @@ test_status InitCL(cl_device_id device)
         }
     }
 
-    gMTdata = init_genrand(gRandomSeed);
-
     char c[1024];
     static const char *no_yes[] = { "NO", "YES" };
     vlog("\nCompute Device info:\n");
diff --git a/test_conformance/device_execution/enqueue_block.cpp b/test_conformance/device_execution/enqueue_block.cpp
index 29a6cec15..4ddd1db7f 100644
--- a/test_conformance/device_execution/enqueue_block.cpp
+++ b/test_conformance/device_execution/enqueue_block.cpp
@@ -27,561 +27,538 @@
 
 #ifdef CL_VERSION_2_0
 extern int gWimpyMode;
-static const char* enqueue_simple_block[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  res[tid] = mul * 7 - 21;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_simple_block(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "  res[tid] = -1;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
 
-static const char* enqueue_block_with_local_arg1[] =
-{
-    NL, "#define LOCAL_MEM_SIZE 10"
-    NL, ""
-    NL, "void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp)"
-    NL, "{"
-    NL, "  for(int i = 0; i < LOCAL_MEM_SIZE; i++)"
-    NL, "  {"
-    NL, "    tmp[i] = mul * 7 - 21;"
-    NL, "    res[tid] += tmp[i];"
-    NL, "  }"
-    NL, "  res[tid] += 2;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_local_arg1(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  void (^kernelBlock)(__local void*) = ^(__local void* buf){ block_fn_local_arg1(tid, multiplier, res, (local int*)buf); };"
-    NL, ""
-    NL, "  res[tid] = -2;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)));"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
+// clang-format off
+static const char* enqueue_simple_block[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      res[tid] = mul * 7 - 21;
+    }
 
-static const char* enqueue_block_with_local_arg2[] =
-{
-    NL, "#define LOCAL_MEM_SIZE 10"
-    NL, ""
-    NL, "void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp1, __local float4* tmp2)"
-    NL, "{"
-    NL, "  for(int i = 0; i < LOCAL_MEM_SIZE; i++)"
-    NL, "  {"
-    NL, "    tmp1[i]   = mul * 7 - 21;"
-    NL, "    tmp2[i].x = (float)(mul * 7 - 21);"
-    NL, "    tmp2[i].y = (float)(mul * 7 - 21);"
-    NL, "    tmp2[i].z = (float)(mul * 7 - 21);"
-    NL, "    tmp2[i].w = (float)(mul * 7 - 21);"
-    NL, ""
-    NL, "    res[tid] += tmp1[i];"
-    NL, "    res[tid] += (int)(tmp2[i].x+tmp2[i].y+tmp2[i].z+tmp2[i].w);"
-    NL, "  }"
-    NL, "  res[tid] += 2;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_local_arg2(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  void (^kernelBlock)(__local void*, __local void*) = ^(__local void* buf1, __local void* buf2)"
-    NL, "    { block_fn_local_arg1(tid, multiplier, res, (local int*)buf1, (local float4*)buf2); };"
-    NL, ""
-    NL, "  res[tid] = -2;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)), (uint)(LOCAL_MEM_SIZE*sizeof(float4)));"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
+    kernel void enqueue_simple_block(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
 
-static const char* enqueue_block_with_wait_list[] =
-{
-    NL, "#define BLOCK_SUBMITTED 1"
-    NL, "#define BLOCK_COMPLETED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_block_with_wait_list(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  clk_event_t block_evt;"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt,"
-    NL, "  ^{"
-    NL, "      res[tid] = BLOCK_COMPLETED;"
-    NL, "   });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  retain_event(block_evt);"
-    NL, "  release_event(block_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(user_evt);"
-    NL, "  release_event(block_evt);"
-    NL, "}"
-    NL
-};
+      void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
 
-static const char* enqueue_block_with_wait_list_and_local_arg[] =
-{
-    NL, "#define LOCAL_MEM_SIZE 10"
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define BLOCK_STARTED   3"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "void block_fn_local_arg(size_t tid, int mul, __global int* res, __local int* tmp)"
-    NL, "{"
-    NL, "  res[tid] = BLOCK_STARTED;"
-    NL, "  for(int i = 0; i < LOCAL_MEM_SIZE; i++)"
-    NL, "  {"
-    NL, "    tmp[i] = mul * 7 - 21;"
-    NL, "    res[tid] += tmp[i];"
-    NL, "  }"
-    NL, "  if(res[tid] == BLOCK_STARTED) res[tid] = BLOCK_COMPLETED;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_wait_list_and_local_arg(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  clk_event_t block_evt;"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt, "
-    NL, "    ^(__local void* buf) {"
-    NL, "       block_fn_local_arg(tid, multiplier, res, (__local int*)buf);"
-    NL, "     }, LOCAL_MEM_SIZE*sizeof(int));"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  retain_event(block_evt);"
-    NL, "  release_event(block_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(user_evt);"
-    NL, "  release_event(block_evt);"
-    NL, "}"
-    NL
-};
+      res[tid] = -1;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
 
-static const char* enqueue_block_get_kernel_work_group_size[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  res[tid] = mul * 7 - 21;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_get_kernel_work_group_size(__global int* res)"
-    NL, "{"
-    NL, "    int multiplier = 3;"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "    void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "    size_t local_work_size = get_kernel_work_group_size(kernelBlock);"
-    NL, "    if (local_work_size <= 0){ res[tid] = -1; return; }"
-    NL, "    size_t global_work_size = local_work_size * 4;"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t q1 = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-};
+static const char* enqueue_block_with_local_arg1[] = { R"(
+    #define LOCAL_MEM_SIZE 10
 
-static const char* enqueue_block_get_kernel_preferred_work_group_size_multiple[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  res[tid] = mul * 7 - 21;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_get_kernel_preferred_work_group_size_multiple(__global int* res)"
-    NL, "{"
-    NL, "    int multiplier = 3;"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "    void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "    size_t local_work_size = get_kernel_preferred_work_group_size_multiple(kernelBlock);"
-    NL, "    if (local_work_size <= 0){ res[tid] = -1; return; }"
-    NL, "    size_t global_work_size = local_work_size * 4;"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t q1 = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-};
+    void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp)
+    {
+      for (int i = 0; i < LOCAL_MEM_SIZE; i++)
+      {
+        tmp[i] = mul * 7 - 21;
+        res[tid] += tmp[i];
+      }
+      res[tid] += 2;
+    }
 
-static const char* enqueue_block_capture_event_profiling_info_after_execution[] =
-{
-    NL, "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS)
-    NL, ""
-    NL, "__global ulong value[MAX_GWS*2] = {0};"
-    NL, ""
-    NL, "void block_fn(size_t tid, __global int* res)"
-    NL, "{"
-    NL, "    res[tid] = -2;"
-    NL, "}"
-    NL, ""
-    NL, "void check_res(size_t tid, const clk_event_t evt, __global int* res)"
-    NL, "{"
-    NL, "    capture_event_profiling_info (evt, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);"
-    NL, ""
-    NL, "    if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;"
-    NL, "    else                                        res[tid] = -4;"
-    NL, "    release_event(evt);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_capture_event_profiling_info_after_execution(__global int* res)"
-    NL, "{"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t def_q = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(1);"
-    NL, "    clk_event_t block_evt1;"
-    NL, ""
-    NL, "    void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 0, NULL, &block_evt1, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "    void (^checkBlock) (void)  = ^{ check_res(tid, block_evt1, res);      };"
-    NL, ""
-    NL, "    enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, NULL, checkBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, "}"
-    NL
-};
+    kernel void enqueue_block_with_local_arg1(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
 
-static const char* enqueue_block_capture_event_profiling_info_before_execution[] =
-{
-    NL, "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS)
-    NL, ""
-    NL, "__global ulong value[MAX_GWS*2] = {0};"
-    NL, ""
-    NL, "void block_fn(size_t tid, __global int* res)"
-    NL, "{"
-    NL, "    res[tid] = -2;"
-    NL, "}"
-    NL, ""
-    NL, "void check_res(size_t tid, const ulong *value, __global int* res)"
-    NL, "{"
-    NL, "    if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;"
-    NL, "    else                                        res[tid] = -4;"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_capture_event_profiling_info_before_execution(__global int* res)"
-    NL, "{"
-    NL, "    int multiplier = 3;"
-    NL, "    size_t tid = get_global_id(0);"
-    NL, "    clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "    res[tid] = -1;"
-    NL, "    queue_t def_q = get_default_queue();"
-    NL, "    ndrange_t ndrange = ndrange_1D(1);"
-    NL, "    clk_event_t block_evt1;"
-    NL, "    clk_event_t block_evt2;"
-    NL, ""
-    NL, "    void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };"
-    NL, ""
-    NL, "    int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "    capture_event_profiling_info (block_evt1, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);"
-    NL, ""
-    NL, "    set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "    void (^checkBlock) (void)  = ^{ check_res(tid, &value, res);      };"
-    NL, ""
-    NL, "    enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, &block_evt2, checkBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "    release_event(user_evt);"
-    NL, "    release_event(block_evt1);"
-    NL, "    release_event(block_evt2);"
-    NL, "}"
-    NL
-};
+      void (^kernelBlock)(__local void*) = ^(__local void* buf){ block_fn_local_arg1(tid, multiplier, res, (local int*)buf); };
 
-static const char* enqueue_block_with_barrier[] =
-{
-    NL, "void block_fn(size_t tid, int mul, __global int* res)"
-    NL, "{"
-    NL, "  if(mul > 0) barrier(CLK_GLOBAL_MEM_FENCE);"
-    NL, "  res[tid] = mul * 7 -21;"
-    NL, "}"
-    NL, ""
-    NL, "void loop_fn(size_t tid, int n, __global int* res)"
-    NL, "{"
-    NL, "  while(n > 0)"
-    NL, "  {"
-    NL, "    barrier(CLK_GLOBAL_MEM_FENCE);"
-    NL, "    res[tid] = 0;"
-    NL, "    --n;"
-    NL, "  }"
-    NL, "}"
-    NL, ""
-    NL, "kernel void enqueue_block_with_barrier(__global int* res)"
-    NL, "{"
-    NL, "  int multiplier = 3;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  res[tid] = -1;"
-    NL, "  size_t n = 256;"
-    NL, ""
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };"
-    NL, ""
-    NL, "  ndrange_t ndrange = ndrange_1D(n);"
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  void (^loopBlock)(void) = ^{ loop_fn(tid, n, res); };"
-    NL, ""
-    NL, "  enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, loopBlock);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "}"
-    NL
-};
+      res[tid] = -2;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)));
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
 
-static const char* enqueue_marker_with_block_event[] =
-{
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_marker_with_block_event(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, ""
-    NL, "  clk_event_t block_evt1;"
-    NL, "  clk_event_t marker_evt;"
-    NL, ""
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1,"
-    NL, "  ^{"
-    NL, "     res[tid] = BLOCK_COMPLETED;"
-    NL, "   });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -2; return; }"
-    NL, ""
-    NL, "  enq_res = enqueue_marker(def_q, 1, &block_evt1, &marker_evt);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "  retain_event(marker_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(block_evt1);"
-    NL, "  release_event(marker_evt);"
-    NL, "  release_event(user_evt);"
-    NL, "}"
-    NL
-};
+static const char* enqueue_block_with_local_arg2[] = { R"(
+    #define LOCAL_MEM_SIZE 10
 
-static const char* enqueue_marker_with_user_event[] =
-{
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_marker_with_user_event(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  uint multiplier = 7;"
-    NL, ""
-    NL, "  clk_event_t user_evt = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, ""
-    NL, "  clk_event_t marker_evt;"
-    NL, "  clk_event_t block_evt;"
-    NL, ""
-    NL, "  int enq_res = enqueue_marker(def_q, 1, &user_evt, &marker_evt);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  retain_event(marker_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, ""
-    NL, "  enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &block_evt, "
-    NL, "  ^{"
-    NL, "     if(res[tid] == BLOCK_SUBMITTED) res[tid] = CHECK_SUCCESS;"
-    NL, "   });"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] != BLOCK_SUBMITTED)  { res[tid] = -2; return; }"
-    NL, ""
-    NL, "  set_user_event_status(user_evt, CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(block_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, "  release_event(user_evt);"
-    NL, "}"
-    NL
-};
+    void block_fn_local_arg1(size_t tid, int mul, __global int* res, __local int* tmp1, __local float4* tmp2)
+    {
+      for (int i = 0; i < LOCAL_MEM_SIZE; i++)
+      {
+        tmp1[i]   = mul * 7 - 21;
+        tmp2[i].x = (float)(mul * 7 - 21);
+        tmp2[i].y = (float)(mul * 7 - 21);
+        tmp2[i].z = (float)(mul * 7 - 21);
+        tmp2[i].w = (float)(mul * 7 - 21);
+
+        res[tid] += tmp1[i];
+        res[tid] += (int)(tmp2[i].x+tmp2[i].y+tmp2[i].z+tmp2[i].w);
+      }
+      res[tid] += 2;
+    }
 
-static const char* enqueue_marker_with_mixed_events[] =
-{
-    NL, "#define BLOCK_COMPLETED 1"
-    NL, "#define BLOCK_SUBMITTED 2"
-    NL, "#define CHECK_SUCCESS   0"
-    NL, ""
-    NL, "kernel void enqueue_marker_with_mixed_events(__global int* res)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, ""
-    NL, "  clk_event_t mix_ev[2];"
-    NL, "  mix_ev[0] = create_user_event();"
-    NL, ""
-    NL, "  res[tid] = BLOCK_SUBMITTED;"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, ""
-    NL, "  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1],"
-    NL, "  ^{"
-    NL, "     res[tid] = BLOCK_COMPLETED;"
-    NL, "   });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -2; return; }"
-    NL, ""
-    NL, "  clk_event_t marker_evt;"
-    NL, ""
-    NL, "  enq_res = enqueue_marker(def_q, 2, mix_ev, &marker_evt);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "  retain_event(marker_evt);"
-    NL, "  release_event(marker_evt);"
-    NL, ""
-    NL, "  //check block is not started"
-    NL, "  if(res[tid] == BLOCK_SUBMITTED)"
-    NL, "  {"
-    NL, "    clk_event_t my_evt;"
-    NL, "    enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt, "
-    NL, "    ^{"
-    NL, "       //check block is completed"
-    NL, "       if(res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;"
-    NL, "     });"
-    NL, "    release_event(my_evt);"
-    NL, "  }"
-    NL, ""
-    NL, "  set_user_event_status(mix_ev[0], CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(mix_ev[1]);"
-    NL, "  release_event(marker_evt);"
-    NL, "  release_event(mix_ev[0]);"
-    NL, "}"
-    NL
-};
+    kernel void enqueue_block_with_local_arg2(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
 
-static const char* enqueue_block_with_mixed_events[] =
-{
-    NL, "kernel void enqueue_block_with_mixed_events(__global int* res)"
-    NL, "{"
-    NL, "  int enq_res;"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  clk_event_t mix_ev[3];"
-    NL, "  mix_ev[0] = create_user_event();"
-    NL, "  queue_t def_q = get_default_queue();"
-    NL, "  ndrange_t ndrange = ndrange_1D(1);"
-    NL, "  res[tid] = -2;"
-    NL, ""
-    NL, "  enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1], ^{ res[tid]++; });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, ""
-    NL, "  enq_res = enqueue_marker(def_q, 1, &mix_ev[1], &mix_ev[2]);"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -3; return; }"
-    NL, ""
-    NL, "  enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, sizeof(mix_ev)/sizeof(mix_ev[0]), mix_ev, NULL, ^{ res[tid]++; });"
-    NL, "  if(enq_res != CLK_SUCCESS) { res[tid] = -4; return; }"
-    NL, ""
-    NL, "  set_user_event_status(mix_ev[0], CL_COMPLETE);"
-    NL, ""
-    NL, "  release_event(mix_ev[0]);"
-    NL, "  release_event(mix_ev[1]);"
-    NL, "  release_event(mix_ev[2]);"
-    NL, "}"
-    NL
-};
+      void (^kernelBlock)(__local void*, __local void*) = ^(__local void* buf1, __local void* buf2)
+        { block_fn_local_arg1(tid, multiplier, res, (local int*)buf1, (local float4*)buf2); };
+
+      res[tid] = -2;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock, (uint)(LOCAL_MEM_SIZE*sizeof(int)), (uint)(LOCAL_MEM_SIZE*sizeof(float4)));
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+
+static const char* enqueue_block_with_wait_list[] = { R"(
+    #define BLOCK_SUBMITTED 1
+    #define BLOCK_COMPLETED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_block_with_wait_list(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      clk_event_t block_evt;
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt,
+      ^{
+          res[tid] = BLOCK_COMPLETED;
+       });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      retain_event(block_evt);
+      release_event(block_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(user_evt);
+      release_event(block_evt);
+    }
+)" };
+
+static const char* enqueue_block_with_wait_list_and_local_arg[] = { R"(
+    #define LOCAL_MEM_SIZE 10
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define BLOCK_STARTED   3
+    #define CHECK_SUCCESS   0
+
+    void block_fn_local_arg(size_t tid, int mul, __global int* res, __local int* tmp)
+    {
+      res[tid] = BLOCK_STARTED;
+      for (int i = 0; i < LOCAL_MEM_SIZE; i++)
+      {
+        tmp[i] = mul * 7 - 21;
+        res[tid] += tmp[i];
+      }
+      if (res[tid] == BLOCK_STARTED) res[tid] = BLOCK_COMPLETED;
+    }
+
+    kernel void enqueue_block_with_wait_list_and_local_arg(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      clk_event_t block_evt;
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt,
+        ^(__local void* buf) {
+           block_fn_local_arg(tid, multiplier, res, (__local int*)buf);
+         }, LOCAL_MEM_SIZE*sizeof(int));
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      retain_event(block_evt);
+      release_event(block_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(user_evt);
+      release_event(block_evt);
+    }
+)" };
+
+static const char* enqueue_block_get_kernel_work_group_size[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      res[tid] = mul * 7 - 21;
+    }
+
+    kernel void enqueue_block_get_kernel_work_group_size(__global int* res)
+    {
+        int multiplier = 3;
+        size_t tid = get_global_id(0);
+
+        void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
+
+        size_t local_work_size = get_kernel_work_group_size(kernelBlock);
+        if (local_work_size <= 0){ res[tid] = -1; return; }
+        size_t global_work_size = local_work_size * 4;
+
+        res[tid] = -1;
+        queue_t q1 = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);
+
+        int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+
+static const char* enqueue_block_get_kernel_preferred_work_group_size_multiple[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      res[tid] = mul * 7 - 21;
+    }
+
+    kernel void enqueue_block_get_kernel_preferred_work_group_size_multiple(__global int* res)
+    {
+        int multiplier = 3;
+        size_t tid = get_global_id(0);
+
+        void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
+
+        size_t local_work_size = get_kernel_preferred_work_group_size_multiple(kernelBlock);
+        if (local_work_size <= 0){ res[tid] = -1; return; }
+        size_t global_work_size = local_work_size * 4;
+
+        res[tid] = -1;
+        queue_t q1 = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(global_work_size, local_work_size);
+
+        int enq_res = enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+
+static const char* enqueue_block_capture_event_profiling_info_after_execution[] = {
+    "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS) "\n"
+    , R"(
+    __global ulong value[MAX_GWS*2] = {0};
+
+    void block_fn(size_t tid, __global int* res)
+    {
+        res[tid] = -2;
+    }
+
+    void check_res(size_t tid, const clk_event_t evt, __global int* res)
+    {
+        capture_event_profiling_info (evt, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);
+
+        if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;
+        else                                        res[tid] = -4;
+        release_event(evt);
+    }
+
+    kernel void enqueue_block_capture_event_profiling_info_after_execution(__global int* res)
+    {
+        size_t tid = get_global_id(0);
+
+        res[tid] = -1;
+        queue_t def_q = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(1);
+        clk_event_t block_evt1;
+
+        void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };
+
+        int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 0, NULL, &block_evt1, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+        void (^checkBlock) (void)  = ^{ check_res(tid, block_evt1, res);      };
+
+        enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, NULL, checkBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+    }
+)" };
+
+static const char* enqueue_block_capture_event_profiling_info_before_execution[] = {
+    "#define MAX_GWS " STRINGIFY_VALUE(MAX_GWS) "\n"
+    , R"(
+    __global ulong value[MAX_GWS*2] = {0};
+
+    void block_fn(size_t tid, __global int* res)
+    {
+        res[tid] = -2;
+    }
+
+    void check_res(size_t tid, const ulong *value, __global int* res)
+    {
+        if (value[tid*2] > 0 && value[tid*2+1] > 0) res[tid] =  0;
+        else                                        res[tid] = -4;
+    }
+
+    kernel void enqueue_block_capture_event_profiling_info_before_execution(__global int* res)
+    {
+        int multiplier = 3;
+        size_t tid = get_global_id(0);
+        clk_event_t user_evt = create_user_event();
+
+        res[tid] = -1;
+        queue_t def_q = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(1);
+        clk_event_t block_evt1;
+        clk_event_t block_evt2;
+
+        void (^kernelBlock)(void)  = ^{ block_fn (tid, res);                   };
+
+        int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1, kernelBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+        capture_event_profiling_info (block_evt1, CLK_PROFILING_COMMAND_EXEC_TIME, &value[tid*2]);
+
+        set_user_event_status(user_evt, CL_COMPLETE);
+
+        void (^checkBlock) (void)  = ^{ check_res(tid, &value, res);      };
+
+        enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &block_evt1, &block_evt2, checkBlock);
+        if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+        release_event(user_evt);
+        release_event(block_evt1);
+        release_event(block_evt2);
+    }
+)" };
+
+static const char* enqueue_block_with_barrier[] = { R"(
+    void block_fn(size_t tid, int mul, __global int* res)
+    {
+      if (mul > 0) barrier(CLK_GLOBAL_MEM_FENCE);
+      res[tid] = mul * 7 -21;
+    }
+
+    void loop_fn(size_t tid, int n, __global int* res)
+    {
+      while (n > 0)
+      {
+        barrier(CLK_GLOBAL_MEM_FENCE);
+        res[tid] = 0;
+        --n;
+      }
+    }
+
+    kernel void enqueue_block_with_barrier(__global int* res)
+    {
+      int multiplier = 3;
+      size_t tid = get_global_id(0);
+      queue_t def_q = get_default_queue();
+      res[tid] = -1;
+      size_t n = 256;
+
+      void (^kernelBlock)(void) = ^{ block_fn(tid, multiplier, res); };
+
+      ndrange_t ndrange = ndrange_1D(n);
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      void (^loopBlock)(void) = ^{ loop_fn(tid, n, res); };
+
+      enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, loopBlock);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+    }
+)" };
+
+static const char* enqueue_marker_with_block_event[] = { R"(
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_marker_with_block_event(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+
+      clk_event_t block_evt1;
+      clk_event_t marker_evt;
+
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &user_evt, &block_evt1,
+      ^{
+         res[tid] = BLOCK_COMPLETED;
+       });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -2; return; }
+
+      enq_res = enqueue_marker(def_q, 1, &block_evt1, &marker_evt);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+      retain_event(marker_evt);
+      release_event(marker_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(block_evt1);
+      release_event(marker_evt);
+      release_event(user_evt);
+    }
+)" };
+
+static const char* enqueue_marker_with_user_event[] = { R"(
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_marker_with_user_event(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+      uint multiplier = 7;
+
+      clk_event_t user_evt = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+
+      clk_event_t marker_evt;
+      clk_event_t block_evt;
+
+      int enq_res = enqueue_marker(def_q, 1, &user_evt, &marker_evt);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      retain_event(marker_evt);
+      release_event(marker_evt);
+
+      enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &block_evt,
+      ^{
+         if (res[tid] == BLOCK_SUBMITTED) res[tid] = CHECK_SUCCESS;
+       });
+
+      //check block is not started
+      if (res[tid] != BLOCK_SUBMITTED)  { res[tid] = -2; return; }
+
+      set_user_event_status(user_evt, CL_COMPLETE);
+
+      release_event(block_evt);
+      release_event(marker_evt);
+      release_event(user_evt);
+    }
+)" };
+
+static const char* enqueue_marker_with_mixed_events[] = { R"(
+    #define BLOCK_COMPLETED 1
+    #define BLOCK_SUBMITTED 2
+    #define CHECK_SUCCESS   0
+
+    kernel void enqueue_marker_with_mixed_events(__global int* res)
+    {
+      size_t tid = get_global_id(0);
+
+      clk_event_t mix_ev[2];
+      mix_ev[0] = create_user_event();
+
+      res[tid] = BLOCK_SUBMITTED;
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1],
+      ^{
+         res[tid] = BLOCK_COMPLETED;
+       });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -2; return; }
+
+      clk_event_t marker_evt;
+
+      enq_res = enqueue_marker(def_q, 2, mix_ev, &marker_evt);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+      retain_event(marker_evt);
+      release_event(marker_evt);
+
+      //check block is not started
+      if (res[tid] == BLOCK_SUBMITTED)
+      {
+        clk_event_t my_evt;
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &marker_evt, &my_evt,
+        ^{
+           //check block is completed
+           if (res[tid] == BLOCK_COMPLETED) res[tid] = CHECK_SUCCESS;
+         });
+        release_event(my_evt);
+      }
+
+      set_user_event_status(mix_ev[0], CL_COMPLETE);
+
+      release_event(mix_ev[1]);
+      release_event(marker_evt);
+      release_event(mix_ev[0]);
+    }
+)" };
+
+static const char* enqueue_block_with_mixed_events[] = { R"(
+    kernel void enqueue_block_with_mixed_events(__global int* res)
+    {
+      int enq_res;
+      size_t tid = get_global_id(0);
+      clk_event_t mix_ev[3];
+      mix_ev[0] = create_user_event();
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(1);
+      res[tid] = -2;
+
+      enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 1, &mix_ev[0], &mix_ev[1], ^{ res[tid]++; });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -1; return; }
+
+      enq_res = enqueue_marker(def_q, 1, &mix_ev[1], &mix_ev[2]);
+      if (enq_res != CLK_SUCCESS) { res[tid] = -3; return; }
+
+      enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, sizeof(mix_ev)/sizeof(mix_ev[0]), mix_ev, NULL, ^{ res[tid]++; });
+      if (enq_res != CLK_SUCCESS) { res[tid] = -4; return; }
+
+      set_user_event_status(mix_ev[0], CL_COMPLETE);
+
+      release_event(mix_ev[0]);
+      release_event(mix_ev[1]);
+      release_event(mix_ev[2]);
+    }
+)" };
+// clang-format on
 
 static const kernel_src sources_enqueue_block[] =
 {
diff --git a/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt b/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt
index 4b9968c39..098fb5be6 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt
+++ b/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt
@@ -14,6 +14,7 @@ set(${MODULE_NAME}_SOURCES
     command_buffer_test_copy.cpp
     command_buffer_test_barrier.cpp
     command_buffer_test_event_info.cpp
+    command_buffer_finalize.cpp
 )
 
 include(../../CMakeCommon.txt)
diff --git a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
index 43734da0a..6c02f9f78 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
@@ -201,14 +201,33 @@ struct BasicEnqueueTest : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_int> output_data(num_elements);
+        std::vector<cl_int> output_data_1(num_elements);
         error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < num_elements; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern, output_data_1[i], i);
+        }
+
+        const cl_int new_pattern = 12;
+        error = clEnqueueFillBuffer(queue, in_mem, &new_pattern, sizeof(cl_int),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_int> output_data_2(num_elements);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
+                                    output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(new_pattern, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt
index e06258335..0d4dd0399 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/CMakeLists.txt
@@ -3,6 +3,12 @@ set(MODULE_NAME CL_KHR_MUTABLE_DISPATCH)
 set(${MODULE_NAME}_SOURCES
     main.cpp
     mutable_command_info.cpp
+    mutable_command_image_arguments.cpp
+    mutable_command_arguments.cpp
+    mutable_command_out_of_order.cpp
+    mutable_command_global_size.cpp
+    mutable_command_local_size.cpp
+    mutable_command_global_offset.cpp
     ../basic_command_buffer.cpp
 )
 
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp
index 97075792b..a2fae4974 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/main.cpp
@@ -26,6 +26,18 @@ test_definition test_list[] = {
     ADD_TEST(mutable_command_info_global_work_offset),
     ADD_TEST(mutable_command_info_local_work_size),
     ADD_TEST(mutable_command_info_global_work_size),
+    ADD_TEST(mutable_dispatch_image_1d_arguments),
+    ADD_TEST(mutable_dispatch_image_2d_arguments),
+    ADD_TEST(mutable_dispatch_out_of_order),
+    ADD_TEST(mutable_dispatch_simultaneous_out_of_order),
+    ADD_TEST(mutable_dispatch_global_size),
+    ADD_TEST(mutable_dispatch_local_size),
+    ADD_TEST(mutable_dispatch_global_offset),
+    ADD_TEST(mutable_dispatch_svm_arguments),
+    ADD_TEST(mutable_dispatch_local_arguments),
+    ADD_TEST(mutable_dispatch_global_arguments),
+    ADD_TEST(mutable_dispatch_pod_arguments),
+    ADD_TEST(mutable_dispatch_null_arguments),
 };
 
 int main(int argc, const char *argv[])
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp
new file mode 100644
index 000000000..5c8291f05
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp
@@ -0,0 +1,847 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "testHarness.h"
+#include "imageHelpers.h"
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+////////////////////////////////////////////////////////////////////////////////
+// Mutable dispatch tests which handle the following cases for
+// CL_MUTABLE_DISPATCH_ARGUMENTS_KHR:
+// - __global arguments
+// - __local arguments
+// - plain-old-data arguments
+// - NULL arguments
+// - SVM arguments
+
+// Verifies that a __global buffer kernel argument can be mutably updated.
+struct MutableDispatchGlobalArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchGlobalArguments(cl_device_id device, cl_context context,
+                                   cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    virtual cl_int SetUp(int elements) override
+    {
+        // Report a failed base-fixture set-up instead of discarding it.
+        return BasicMutableCommandBufferTest::SetUp(elements);
+    }
+
+    cl_int Run() override
+    {
+        cl_int error;
+
+        // Create kernel
+
+        const char *sample_const_arg_kernel =
+            R"(
+            __kernel void sample_test(__constant int *src, __global int *dst)
+            {
+                size_t  tid = get_global_id(0);
+                dst[tid] = src[tid];
+            })";
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        // Create and initialize buffers
+
+        MTdataHolder d(gRandomSeed);
+
+        std::vector<cl_int> srcData(num_elements);
+        for (size_t i = 0; i < num_elements; i++)
+            srcData[i] = (cl_int)genrand_int32(d);
+
+        clMemWrapper srcBuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                             num_elements * sizeof(cl_int),
+                                             srcData.data(), &error);
+        test_error(error, "Creating src buffer");
+
+        clMemWrapper dstBuf0 =
+            clCreateBuffer(context, CL_MEM_READ_WRITE,
+                           num_elements * sizeof(cl_int), nullptr, &error);
+        test_error(error, "Creating initial dst buffer failed");
+
+        clMemWrapper dstBuf1 =
+            clCreateBuffer(context, CL_MEM_READ_WRITE,
+                           num_elements * sizeof(cl_int), nullptr, &error);
+        test_error(error, "Creating updated dst buffer failed");
+
+        // Build and execute the command buffer for the initial execution
+
+        error = clSetKernelArg(kernel, 0, sizeof(srcBuf), &srcBuf);
+        test_error(error, "Unable to set src kernel arguments");
+
+        error = clSetKernelArg(kernel, 1, sizeof(dstBuf0), &dstBuf0);
+        test_error(error, "Unable to set initial dst kernel argument");
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Check the results of the initial execution
+
+        std::vector<cl_int> dstData0(num_elements);
+        error = clEnqueueReadBuffer(queue, dstBuf0, CL_TRUE, 0,
+                                    num_elements * sizeof(cl_int),
+                                    dstData0.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer for initial dst failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (srcData[i] != dstData0[i])
+            {
+                log_error("Initial data failed to verify: src[%zu]=%d != "
+                          "dst[%zu]=%d\n",
+                          i, srcData[i], i, dstData0[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        // Modify and execute the command buffer: retarget argument 1 to dstBuf1
+
+        cl_mutable_dispatch_arg_khr arg{ 1, sizeof(dstBuf1), &dstBuf1 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            &arg /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Check the results of the modified execution
+
+        std::vector<cl_int> dstData1(num_elements);
+        error = clEnqueueReadBuffer(queue, dstBuf1, CL_TRUE, 0,
+                                    num_elements * sizeof(cl_int),
+                                    dstData1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer for modified dst failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (srcData[i] != dstData1[i])
+            {
+                log_error("Modified data failed to verify: src[%zu]=%d != "
+                          "dst[%zu]=%d\n",
+                          i, srcData[i], i, dstData1[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+};
+
+// Exercises updating a __local kernel argument of a recorded dispatch.
+struct MutableDispatchLocalArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchLocalArguments(cl_device_id device, cl_context context,
+                                  cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    virtual cl_int SetUp(int elements) override
+    {
+        return BasicMutableCommandBufferTest::SetUp(elements);
+    }
+
+    cl_int Run() override
+    {
+        const char *sample_const_arg_kernel =
+            R"(
+            __kernel void sample_test(__constant int *src1, __local int
+            *src, __global int *dst)
+            {
+                size_t  tid = get_global_id(0);
+                src[tid] = src1[tid];
+                dst[tid] = src[tid];
+            })";
+
+        cl_int error;
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        size_t threads[1], localThreads[1];
+        std::vector<cl_int> constantData;
+        std::vector<cl_int> resultData;
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        MTdataHolder d(gRandomSeed);
+
+        size_t sizeToAllocate =
+            ((size_t)max_size / sizeof(cl_int)) * sizeof(cl_int);
+        size_t numberOfInts = sizeToAllocate / sizeof(cl_int);
+        constantData.resize(sizeToAllocate / sizeof(cl_int));
+        resultData.resize(sizeToAllocate / sizeof(cl_int));
+
+        for (size_t i = 0; i < numberOfInts; i++)
+            constantData[i] = (cl_int)genrand_int32(d);
+
+        clMemWrapper streams[2];
+        streams[0] =
+            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeToAllocate,
+                           constantData.data(), &error);
+        test_error(error, "Creating test array failed");
+        streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate,
+                                    nullptr, &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &streams[0]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error =
+            clSetKernelArg(kernel, 1, numberOfInts * sizeof(cl_int), nullptr);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &streams[1]);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        threads[0] = numberOfInts;
+        localThreads[0] = 1;
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, threads,
+            localThreads, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(cl_mem), nullptr };
+        cl_mutable_dispatch_arg_khr args[] = { arg_1 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        // NOTE(review): the updated command buffer is never re-enqueued here.
+        error =
+            clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate,
+                                resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < numberOfInts; i++)
+            if (constantData[i] != resultData[i])
+            {
+                log_error("Data failed to verify: constantData[%zu]=%d != "
+                          "resultData[%zu]=%d\n",
+                          i, constantData[i], i, resultData[i]);
+                return TEST_FAIL;
+            }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+    const cl_ulong max_size = 16;
+};
+
+// Exercises updating a plain-old-data (int) kernel argument of a dispatch.
+struct MutableDispatchPODArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchPODArguments(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    cl_int Run() override
+    {
+        const char *sample_const_arg_kernel =
+            R"(
+                __kernel void sample_test(__constant int *src, int dst)
+            {
+                size_t  tid = get_global_id(0);
+                dst = src[tid];
+            })";
+
+        cl_int error;
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        size_t threads[1], localThreads[1];
+        std::vector<cl_int> constantData;
+        std::vector<cl_int> resultData;
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        MTdataHolder d(gRandomSeed);
+
+        size_t sizeToAllocate =
+            ((size_t)max_size / sizeof(cl_int)) * sizeof(cl_int);
+        size_t numberOfInts = sizeToAllocate / sizeof(cl_int);
+        constantData.resize(sizeToAllocate / sizeof(cl_int));
+        resultData.resize(sizeToAllocate / sizeof(cl_int));
+
+        for (size_t i = 0; i < numberOfInts; i++)
+            constantData[i] = (cl_int)genrand_int32(d);
+
+        clMemWrapper stream;
+        stream = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeToAllocate,
+                                constantData.data(), &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &stream);
+        test_error(error, "Unable to set indexed kernel arguments");
+        cl_int intarg = 10;
+        error = clSetKernelArg(kernel, 1, sizeof(cl_int), &intarg);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        threads[0] = numberOfInts;
+        localThreads[0] = 1;
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, threads,
+            localThreads, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        intarg = 20;
+        cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(cl_int), &intarg };
+        cl_mutable_dispatch_arg_khr args[] = { arg_1 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+        // NOTE(review): reads back the kernel input; the update is never re-run.
+        error = clEnqueueReadBuffer(queue, stream, CL_TRUE, 0, sizeToAllocate,
+                                    resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < numberOfInts; i++)
+            if (constantData[i] != resultData[i])
+            {
+                log_error("Data failed to verify: constantData[%zu]=%d != "
+                          "resultData[%zu]=%d\n",
+                          i, constantData[i], i, resultData[i]);
+                return TEST_FAIL;
+            }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+    const cl_ulong max_size = 16;
+};
+
+// Verifies that a buffer kernel argument can be mutably updated to NULL.
+struct MutableDispatchNullArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchNullArguments(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    cl_int Run() override
+    {
+        cl_int error;
+
+        // Create kernel: writes 12345 to dst when src is a null pointer
+
+        const char *sample_const_arg_kernel =
+            R"(
+            __kernel void sample_test(__constant int *src, __global int *dst)
+            {
+                size_t  tid = get_global_id(0);
+                dst[tid] = src ? src[tid] : 12345;
+            })";
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        MTdataHolder d(gRandomSeed);
+
+        std::vector<cl_int> srcData(num_elements);
+        for (size_t i = 0; i < num_elements; i++)
+            srcData[i] = (cl_int)genrand_int32(d);
+
+        clMemWrapper srcBuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                             num_elements * sizeof(cl_int),
+                                             srcData.data(), &error);
+        test_error(error, "Creating src buffer");
+
+        clMemWrapper dstBuf =
+            clCreateBuffer(context, CL_MEM_READ_WRITE,
+                           num_elements * sizeof(cl_int), nullptr, &error);
+        test_error(error, "Creating dst buffer failed");
+
+        // Build and execute the command buffer for the initial execution
+
+        error = clSetKernelArg(kernel, 0, sizeof(srcBuf), &srcBuf);
+        test_error(error, "Unable to set src kernel arguments");
+
+        error = clSetKernelArg(kernel, 1, sizeof(dstBuf), &dstBuf);
+        test_error(error, "Unable to set initial dst kernel argument");
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Check the results of the initial execution
+
+        std::vector<cl_int> dstData0(num_elements);
+        error = clEnqueueReadBuffer(queue, dstBuf, CL_TRUE, 0,
+                                    num_elements * sizeof(cl_int),
+                                    dstData0.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer for initial dst failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (srcData[i] != dstData0[i])
+            {
+                log_error("Initial data failed to verify: src[%zu]=%d != "
+                          "dst[%zu]=%d\n",
+                          i, srcData[i], i, dstData0[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        // Modify and execute the command buffer: argument 0 (src) becomes NULL
+
+        cl_mutable_dispatch_arg_khr arg{ 0, sizeof(cl_mem), nullptr };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            &arg /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Check the results of the modified execution
+
+        std::vector<cl_int> dstData1(num_elements);
+        error = clEnqueueReadBuffer(queue, dstBuf, CL_TRUE, 0,
+                                    num_elements * sizeof(cl_int),
+                                    dstData1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer for modified dst failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (12345 != dstData1[i])
+            {
+                log_error("Modified data failed to verify: %d != dst[%zu]=%d\n",
+                          12345, i, dstData1[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+};
+
+// Verifies updating an SVM pointer argument and its exec info in a dispatch.
+struct MutableDispatchSVMArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchSVMArguments(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    bool Skip() override
+    {
+        cl_device_svm_capabilities svm_caps;
+        bool svm_capabilities =
+            !clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES,
+                             sizeof(svm_caps), &svm_caps, nullptr)
+            && svm_caps != 0;
+
+        return !svm_capabilities || BasicMutableCommandBufferTest::Skip();
+    }
+
+    virtual cl_int SetUp(int elements) override
+    {
+        cl_int error = BasicMutableCommandBufferTest::SetUp(elements);
+        test_error(error, "BasicMutableCommandBufferTest::SetUp failed");
+
+        const char *svm_arguments_kernel =
+            R"(
+            typedef struct {
+                global int* ptr;
+            } wrapper;
+            __kernel void test_svm_arguments(__global wrapper* pWrapper)
+            {
+                size_t i = get_global_id(0);
+                pWrapper->ptr[i]++;
+            })";
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &svm_arguments_kernel,
+                                            "test_svm_arguments");
+        test_error(error, "Creating kernel failed");
+
+        return CL_SUCCESS;
+    }
+
+    cl_int Run() override
+    {
+        const cl_int zero = 0;
+        cl_int error;
+        const size_t buffer_bytes = num_elements * sizeof(cl_int);
+
+        // Allocate and initialize SVM for initial execution
+
+        cl_int *initWrapper = (cl_int *)clSVMAlloc(context, CL_MEM_READ_WRITE,
+                                                   sizeof(cl_int *), 0);
+        cl_int *initBuffer = (cl_int *)clSVMAlloc(context, CL_MEM_READ_WRITE,
+                                                  buffer_bytes, 0);
+        test_assert_error(initWrapper != nullptr && initBuffer != nullptr,
+                          "clSVMAlloc failed for initial execution");
+
+        error = clEnqueueSVMMemcpy(queue, CL_TRUE, initWrapper, &initBuffer,
+                                   sizeof(cl_int *), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemcpy failed for initWrapper");
+
+        error = clEnqueueSVMMemFill(queue, initBuffer, &zero, sizeof(zero),
+                                    buffer_bytes, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemFill failed for initBuffer");
+
+        // Allocate and initialize SVM for modified execution
+
+        cl_int *newWrapper = (cl_int *)clSVMAlloc(context, CL_MEM_READ_WRITE,
+                                                  sizeof(cl_int *), 0);
+        cl_int *newBuffer = (cl_int *)clSVMAlloc(context, CL_MEM_READ_WRITE,
+                                                 buffer_bytes, 0);
+        test_assert_error(newWrapper != nullptr && newBuffer != nullptr,
+                          "clSVMAlloc failed for modified execution");
+
+        error = clEnqueueSVMMemcpy(queue, CL_TRUE, newWrapper, &newBuffer,
+                                   sizeof(cl_int *), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemcpy failed for newWrapper");
+
+        error = clEnqueueSVMMemFill(queue, newBuffer, &zero, sizeof(zero),
+                                    buffer_bytes, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMemFill failed for newBuffer");
+
+        // Build and execute the command buffer for the initial execution
+
+        error = clSetKernelArgSVMPointer(kernel, 0, initWrapper);
+        test_error(error, "clSetKernelArgSVMPointer failed for initWrapper");
+
+        error = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_SVM_PTRS,
+                                    sizeof(initBuffer), &initBuffer);
+        test_error(error, "clSetKernelExecInfo failed for initBuffer");
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR
+                | CL_MUTABLE_DISPATCH_EXEC_INFO_KHR,
+            0
+        };
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        // Check the results of the initial execution
+
+        error = clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_READ, initBuffer,
+                                buffer_bytes, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMap failed for initBuffer");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (initBuffer[i] != 1)
+            {
+                log_error("Initial verification failed at index %zu: Got %d, "
+                          "wanted 1\n",
+                          i, initBuffer[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        error = clEnqueueSVMUnmap(queue, initBuffer, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMUnmap failed for initBuffer");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        // Modify and execute the command buffer
+
+        cl_mutable_dispatch_arg_khr arg_svm{};
+        arg_svm.arg_index = 0;
+        arg_svm.arg_value = newWrapper;
+
+        cl_mutable_dispatch_exec_info_khr exec_info{};
+        exec_info.param_name = CL_KERNEL_EXEC_INFO_SVM_PTRS;
+        exec_info.param_value_size = sizeof(newBuffer);
+        exec_info.param_value = &newBuffer;
+
+        cl_mutable_dispatch_config_khr dispatch_config{};
+        dispatch_config.type = CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR;
+        dispatch_config.command = command;
+        dispatch_config.num_svm_args = 1;
+        dispatch_config.arg_svm_list = &arg_svm;
+        dispatch_config.num_exec_infos = 1;
+        dispatch_config.exec_info_list = &exec_info;
+
+        cl_mutable_base_config_khr mutable_config{};
+        mutable_config.type = CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR;
+        mutable_config.num_mutable_dispatch = 1;
+        mutable_config.mutable_dispatch_list = &dispatch_config;
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        // Check the results of the modified execution
+
+        error = clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_READ, newBuffer,
+                                buffer_bytes, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMMap failed for newBuffer");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            if (newBuffer[i] != 1)
+            {
+                log_error("Modified verification failed at index %zu: Got %d, "
+                          "wanted 1\n",
+                          i, newBuffer[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        error = clEnqueueSVMUnmap(queue, newBuffer, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueSVMUnmap failed for newBuffer");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        // Clean up
+        // NOTE(review): the early error returns above skip these frees.
+        clSVMFree(context, initWrapper);
+        clSVMFree(context, initBuffer);
+        clSVMFree(context, newWrapper);
+        clSVMFree(context, newBuffer);
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+};
+
+
+// Runs the MutableDispatchLocalArguments fixture through MakeAndRunTest.
+int test_mutable_dispatch_local_arguments(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    using Fixture = MutableDispatchLocalArguments;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+// Runs the MutableDispatchGlobalArguments fixture through MakeAndRunTest.
+int test_mutable_dispatch_global_arguments(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    using Fixture = MutableDispatchGlobalArguments;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+int test_mutable_dispatch_pod_arguments(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    using Fixture = MutableDispatchPODArguments; // fixture under test
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+// Runs the MutableDispatchNullArguments fixture through MakeAndRunTest.
+int test_mutable_dispatch_null_arguments(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    using Fixture = MutableDispatchNullArguments;
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+int test_mutable_dispatch_svm_arguments(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    using Fixture = MutableDispatchSVMArguments; // fixture under test
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
\ No newline at end of file
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
index 966695834..c88c14d1c 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_basic.h
@@ -19,6 +19,17 @@
 #include "../basic_command_buffer.h"
 #include "../command_buffer_test_base.h"
 
+// Loads FUNC from `platform` (must be in scope); returns TEST_FAIL on failure.
+#define GET_EXTENSION_ADDRESS(FUNC)                                            \
+    FUNC = reinterpret_cast<FUNC##_fn>(                                        \
+        clGetExtensionFunctionAddressForPlatform(platform, #FUNC));            \
+    if (FUNC == nullptr)                                                       \
+    {                                                                          \
+        log_error("ERROR: clGetExtensionFunctionAddressForPlatform failed"     \
+                  " with " #FUNC "\n");                                        \
+        return TEST_FAIL;                                                      \
+    }
+
 struct BasicMutableCommandBufferTest : BasicCommandBufferTest
 {
     BasicMutableCommandBufferTest(cl_device_id device, cl_context context,
@@ -84,24 +95,52 @@ struct BasicMutableCommandBufferTest : BasicCommandBufferTest
                             &platform, nullptr);
         test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
 
-        // If it is supported get the addresses of all the APIs here.
-#define GET_EXTENSION_ADDRESS(FUNC)                                            \
-    FUNC = reinterpret_cast<FUNC##_fn>(                                        \
-        clGetExtensionFunctionAddressForPlatform(platform, #FUNC));            \
-    if (FUNC == nullptr)                                                       \
-    {                                                                          \
-        log_error("ERROR: clGetExtensionFunctionAddressForPlatform failed"     \
-                  " with " #FUNC "\n");                                        \
-        return TEST_FAIL;                                                      \
+        GET_EXTENSION_ADDRESS(clUpdateMutableCommandsKHR);
+
+        return CL_SUCCESS;
     }
+
+    clUpdateMutableCommandsKHR_fn clUpdateMutableCommandsKHR = nullptr;
+
+    const char* kernelString = "__kernel void empty() {}";
+    const size_t global_work_size = 4 * 16;
+};
+
+struct InfoMutableCommandBufferTest : BasicMutableCommandBufferTest
+{
+    InfoMutableCommandBufferTest(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Runs the base SetUp, then resolves this fixture's extension functions.
+    virtual cl_int SetUp(int elements) override
+    {
+        // NOTE(review): the base SetUp() result is discarded - confirm intent.
+        BasicMutableCommandBufferTest::SetUp(elements);
+        cl_int error = init_extension_functions();
+        test_error(error, "Unable to initialise extension functions");
+        return CL_SUCCESS;
+    }
+
+    // Resolves clGetMutableCommandInfoKHR for the platform owning `device`.
+    cl_int init_extension_functions()
+    {
+        // NOTE(review): any result of the base initialiser is not checked here.
+        BasicCommandBufferTest::init_extension_functions();
+        cl_platform_id platform;
+        cl_int error =
+            clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
+                            &platform, nullptr);
+        test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
         GET_EXTENSION_ADDRESS(clGetMutableCommandInfoKHR);
 
         return CL_SUCCESS;
     }
 
     clGetMutableCommandInfoKHR_fn clGetMutableCommandInfoKHR = nullptr;
-    const char* kernelString = "__kernel void empty() {}";
-    const size_t global_work_size = 4 * sizeof(cl_int);
 };
 
-#endif // CL_KHR_MUTABLE_COMMAND_BASIC_H
+#undef GET_EXTENSION_ADDRESS
+
+#endif //_CL_KHR_MUTABLE_COMMAND_BASIC_H
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp
new file mode 100644
index 000000000..80bc015a3
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_offset.cpp
@@ -0,0 +1,170 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <extensionHelpers.h>
+#include "imageHelpers.h"
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases:
+//
+// CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR
+
+// Records an ND-range kernel with no global offset, runs it once, then sets a
+// global work offset through clUpdateMutableCommandsKHR and verifies both the
+// clGetMutableCommandInfoKHR query and the data written by the second run.
+struct MutableDispatchGlobalOffset : InfoMutableCommandBufferTest
+{
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
+
+    MutableDispatchGlobalOffset(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Skip unless the device reports the global offset as mutable.
+    bool Skip() override
+    {
+        cl_mutable_dispatch_fields_khr mutable_capabilities;
+
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR;
+
+        return !mutable_support || InfoMutableCommandBufferTest::Skip();
+    }
+
+    cl_int Run() override
+    {
+        const char *global_offset_kernel =
+            R"(
+                __kernel void sample_test(__global int *dst)
+            {
+                size_t tid = get_global_id(0);
+                dst[tid] = get_global_offset(0);
+            })";
+
+        cl_int error =
+            create_single_kernel_helper(context, &program, &kernel, 1,
+                                        &global_offset_kernel, "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clMemWrapper stream = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                             sizeToAllocate, nullptr, &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &stream);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &global_work_size, nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            0 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            nullptr /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            &update_global_offset /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clGetMutableCommandInfoKHR(
+            command, CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR,
+            sizeof(info_global_offset), &info_global_offset, nullptr);
+        test_error(error, "clGetMutableCommandInfoKHR failed");
+
+        if (info_global_offset != update_global_offset)
+        {
+            log_error("ERROR: Wrong global work offset returned from "
+                      "clGetMutableCommandInfoKHR.\n");
+            return TEST_FAIL;
+        }
+
+        std::vector<cl_int> resultData(num_elements);
+
+        error = clEnqueueReadBuffer(queue, stream, CL_TRUE, 0, sizeToAllocate,
+                                    resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // Elements below the new offset still hold the 0 written by the first
+        // run; all others were written by the second run with the new offset.
+        const cl_int new_offset = static_cast<cl_int>(update_global_offset);
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            const cl_int expected = i < update_global_offset ? 0 : new_offset;
+            if (expected != resultData[i])
+            {
+                log_error("Data failed to verify: expected %d != "
+                          "resultData[%zu]=%d\n",
+                          expected, i, resultData[i]);
+                return TEST_FAIL;
+            }
+        }
+        return CL_SUCCESS;
+    }
+
+    size_t info_global_offset = 0;
+    const size_t update_global_offset = 3;
+    const size_t sizeToAllocate =
+        (global_work_size + update_global_offset) * sizeof(cl_int);
+    const size_t num_elements = sizeToAllocate / sizeof(cl_int);
+    cl_mutable_command_khr command = nullptr;
+};
+
+int test_mutable_dispatch_global_offset(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
+{
+    // Delegates to MakeAndRunTest with the MutableDispatchGlobalOffset fixture.
+    return MakeAndRunTest<MutableDispatchGlobalOffset>(device, context, queue,
+                                                       num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_size.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_size.cpp
new file mode 100644
index 000000000..091f0c8d3
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_global_size.cpp
@@ -0,0 +1,167 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <extensionHelpers.h>
+#include "imageHelpers.h"
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases:
+//
+// CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR
+
+// Shrinks the global work size of a recorded ND-range kernel through
+// clUpdateMutableCommandsKHR and verifies the query result and written data.
+struct MutableDispatchGlobalSize : public InfoMutableCommandBufferTest
+{
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
+
+    MutableDispatchGlobalSize(cl_device_id device, cl_context context,
+                              cl_command_queue queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
+    {}
+
+    // Skip unless the device reports the global size as mutable.
+    bool Skip() override
+    {
+        cl_mutable_dispatch_fields_khr mutable_capabilities;
+
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR;
+
+        return !mutable_support || InfoMutableCommandBufferTest::Skip();
+    }
+
+    cl_int Run() override
+    {
+        const char *global_size_kernel =
+            R"(
+                __kernel void sample_test(__global int *dst)
+            {
+                size_t tid = get_global_id(0);
+                dst[tid] = get_global_size(0);
+            })";
+
+        cl_int error = create_single_kernel_helper(
+            context, &program, &kernel, 1, &global_size_kernel, "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clMemWrapper stream = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                             sizeToAllocate, nullptr, &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &stream);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &global_work_size, nullptr, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            0 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            nullptr /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            &update_global_size /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clGetMutableCommandInfoKHR(
+            command, CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR,
+            sizeof(info_global_size), &info_global_size, nullptr);
+        test_error(error, "clGetMutableCommandInfoKHR failed");
+
+        if (info_global_size != update_global_size)
+        {
+            log_error("ERROR: Wrong size returned from "
+                      "clGetMutableCommandInfoKHR.\n");
+            return TEST_FAIL;
+        }
+
+        std::vector<cl_int> resultData(num_elements);
+
+        error = clEnqueueReadBuffer(queue, stream, CL_TRUE, 0, sizeToAllocate,
+                                    resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // Every element holds the original global size from the first run; the
+        // second run overwrites only the first update_global_size elements.
+        const cl_int initial = static_cast<cl_int>(global_work_size);
+        const cl_int updated = static_cast<cl_int>(update_global_size);
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            const cl_int expected = i < update_global_size ? updated : initial;
+            if (expected != resultData[i])
+            {
+                log_error("Data failed to verify: expected %d != "
+                          "resultData[%zu]=%d\n",
+                          expected, i, resultData[i]);
+                return TEST_FAIL;
+            }
+        }
+        return CL_SUCCESS;
+    }
+
+    size_t info_global_size = 0;
+    const size_t update_global_size = 3;
+    // One cl_int per work-item of the initial (largest) dispatch.
+    const size_t sizeToAllocate = global_work_size * sizeof(cl_int);
+    const size_t num_elements = sizeToAllocate / sizeof(cl_int);
+    cl_mutable_command_khr command = nullptr;
+};
+
+int test_mutable_dispatch_global_size(cl_device_id device, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{
+    using Fixture = MutableDispatchGlobalSize; // test case to instantiate
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp
new file mode 100644
index 000000000..b1ce25ec1
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp
@@ -0,0 +1,427 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <vector>
+#include "imageHelpers.h"
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases for
+// CL_MUTABLE_DISPATCH_ARGUMENTS_KHR:
+// - image arguments
+
+// Replaces the destination image (kernel argument 2) of a recorded 1D image
+// copy kernel through clUpdateMutableCommandsKHR and checks the new image.
+struct MutableDispatchImage1DArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchImage1DArguments(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    virtual cl_int SetUp(int elements) override
+    {
+        BasicMutableCommandBufferTest::SetUp(elements);
+        return CL_SUCCESS;
+    }
+
+    bool Skip() override
+    {
+        cl_bool image_support;
+
+        cl_int error =
+            clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT,
+                            sizeof(image_support), &image_support, nullptr);
+        test_error(error, "clGetDeviceInfo for CL_DEVICE_IMAGE_SUPPORT failed");
+
+        cl_mutable_dispatch_fields_khr mutable_capabilities;
+
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR;
+
+        return (!mutable_support || !image_support)
+            || BasicMutableCommandBufferTest::Skip();
+    }
+
+    cl_int Run() override
+    {
+        const char *sample_const_arg_kernel =
+            R"(__kernel void sample_test( read_only image1d_t source, sampler_t
+            sampler, write_only image1d_t dest)
+            {
+               int offset = get_global_id(0);
+
+               int4 color = read_imagei( source, sampler, offset );
+
+               write_imagei( dest, offset, color );
+            })";
+
+        cl_int error;
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+
+        cl_image_desc image_desc;
+        memset(&image_desc, 0x0, sizeof(cl_image_desc));
+        image_desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+        image_desc.image_width = 4;
+        image_desc.image_row_pitch = 0;
+        image_desc.num_mip_levels = 0;
+
+        const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
+
+        image_descriptor imageInfo = { 0 };
+        imageInfo.type = CL_MEM_OBJECT_IMAGE1D;
+        imageInfo.format = &formats;
+        imageInfo.width = 4;
+
+        BufferOwningPtr<char> imageValues_input, imageValues_output, outputData;
+        MTdataHolder d(gRandomSeed);
+        generate_random_image_data(&imageInfo, imageValues_input, d);
+        generate_random_image_data(&imageInfo, imageValues_output, d);
+        generate_random_image_data(&imageInfo, outputData, d);
+
+        char *host_ptr_input = (char *)imageValues_input;
+        char *host_ptr_output = (char *)imageValues_output;
+
+        clMemWrapper src_image = create_image_1d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, 0, host_ptr_input, nullptr, &error);
+        test_error(error, "create_image_1d failed");
+
+        clMemWrapper dst_image = create_image_1d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, 0, host_ptr_output, nullptr, &error);
+        test_error(error, "create_image_1d failed");
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clSamplerWrapper sampler = clCreateSampler(
+            context, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error);
+        test_error(error, "Unable to create sampler");
+
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_image);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clSetKernelArg(kernel, 1, sizeof(cl_sampler), &sampler);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_image);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        size_t globalDim[3] = { 4, 1, 1 }, localDim[3] = { 1, 1, 1 };
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 1, nullptr, globalDim,
+            localDim, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        clMemWrapper new_image = create_image_1d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, 0, host_ptr_output, nullptr, &error);
+        test_error(error, "create_image_1d failed");
+
+        cl_mutable_dispatch_arg_khr arg_2{ 2, sizeof(cl_mem), &new_image };
+        cl_mutable_dispatch_arg_khr args[] = { arg_2 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        size_t origin[3] = { 0, 0, 0 };
+        size_t region[3] = { image_desc.image_width, 1, 1 };
+
+        error = clEnqueueReadImage(queue, new_image, CL_TRUE, origin, region, 0,
+                                   0, outputData, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+        // NOTE(review): only the first imageInfo.width bytes are compared.
+        for (size_t i = 0; i < imageInfo.width; ++i)
+        {
+            if (imageValues_input[i] != outputData[i])
+            {
+                log_error("Data failed to verify: imageValues[%zu]=%d != "
+                          "outputData[%zu]=%d\n",
+                          i, imageValues_input[i], i, outputData[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+};
+
+// Replaces the destination image (kernel argument 2) of a recorded 2D image
+// copy kernel through clUpdateMutableCommandsKHR and checks the new image.
+struct MutableDispatchImage2DArguments : public BasicMutableCommandBufferTest
+{
+    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+
+    MutableDispatchImage2DArguments(cl_device_id device, cl_context context,
+                                    cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue)
+    {}
+
+    virtual cl_int SetUp(int elements) override
+    {
+        BasicMutableCommandBufferTest::SetUp(elements);
+        return CL_SUCCESS;
+    }
+
+    bool Skip() override
+    {
+        cl_bool image_support;
+
+        cl_int error =
+            clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT,
+                            sizeof(image_support), &image_support, nullptr);
+        test_error(error, "clGetDeviceInfo for CL_DEVICE_IMAGE_SUPPORT failed");
+
+        cl_mutable_dispatch_fields_khr mutable_capabilities;
+
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR;
+
+        return (!mutable_support || !image_support)
+            || BasicMutableCommandBufferTest::Skip();
+    }
+
+    cl_int Run() override
+    {
+        const char *sample_const_arg_kernel =
+            R"(__kernel void sample_test( read_only image2d_t source, sampler_t
+            sampler, write_only image2d_t dest)
+            {
+               int x = get_global_id(0);
+               int y = get_global_id(1);
+
+               int4 color = read_imagei( source, sampler, (int2) (x, y) );
+
+               write_imagei( dest, (int2) (x, y), color );
+            })";
+
+        cl_int error;
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+
+        cl_image_desc image_desc;
+        memset(&image_desc, 0x0, sizeof(cl_image_desc));
+        image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+        image_desc.image_width = 4;
+        image_desc.image_height = 4;
+        image_desc.image_row_pitch = 0;
+        image_desc.num_mip_levels = 0;
+
+        size_t data_size =
+            image_desc.image_width * image_desc.image_height * sizeof(cl_int);
+
+        const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
+
+        image_descriptor imageInfo = { 0 };
+        imageInfo.type = CL_MEM_OBJECT_IMAGE2D;
+        imageInfo.width = 4;
+        imageInfo.height = 4;
+        imageInfo.format = &formats;
+
+        BufferOwningPtr<char> imageValues_input, imageValues_output;
+
+        MTdataHolder d(gRandomSeed);
+        generate_random_image_data(&imageInfo, imageValues_input, d);
+        generate_random_image_data(&imageInfo, imageValues_output, d);
+
+        char *host_ptr_input = (char *)imageValues_input;
+        char *host_ptr_output = (char *)imageValues_output;
+        std::vector<char> outputData(data_size);
+
+        clMemWrapper src_image =
+            create_image_2d(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                            &formats, image_desc.image_width,
+                            image_desc.image_height, 0, host_ptr_input, &error);
+        test_error(error, "create_image_2d failed");
+
+        clMemWrapper dst_image = create_image_2d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, image_desc.image_height, 0, host_ptr_output,
+            &error);
+        test_error(error, "create_image_2d failed");
+
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &sample_const_arg_kernel,
+                                            "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clSamplerWrapper sampler = clCreateSampler(
+            context, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error);
+        test_error(error, "Unable to create sampler");
+
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_image);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clSetKernelArg(kernel, 1, sizeof(cl_sampler), &sampler);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_image);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        size_t globalDim[3] = { 4, 4, 1 }, localDim[3] = { 1, 1, 1 };
+
+        cl_ndrange_kernel_command_properties_khr props[] = {
+            CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+            CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
+        };
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, props, kernel, 2, nullptr, globalDim,
+            localDim, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        clMemWrapper new_image = create_image_2d(
+            context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &formats,
+            image_desc.image_width, image_desc.image_height, 0,
+            imageValues_output, &error);
+        test_error(error, "create_image_2d failed");
+
+        cl_mutable_dispatch_arg_khr arg_2{ 2, sizeof(cl_mem), &new_image };
+        cl_mutable_dispatch_arg_khr args[] = { arg_2 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        size_t origin[3] = { 0, 0, 0 };
+        size_t region[3] = { image_desc.image_width, image_desc.image_height,
+                             1 };
+
+        error = clEnqueueReadImage(queue, new_image, CL_TRUE, origin, region, 0,
+                                   0, outputData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+        // Compare the whole image now that both dimensions are dispatched.
+        for (size_t i = 0; i < outputData.size(); ++i)
+        {
+            if (imageValues_input[i] != outputData[i])
+            {
+                log_error("Data failed to verify: imageValues[%zu]=%d != "
+                          "outputData[%zu]=%d\n",
+                          i, imageValues_input[i], i, outputData[i]);
+                return TEST_FAIL;
+            }
+        }
+
+        return TEST_PASS;
+    }
+
+    cl_mutable_command_khr command = nullptr;
+};
+
+int test_mutable_dispatch_image_1d_arguments(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{
+    using Fixture = MutableDispatchImage1DArguments; // test case to instantiate
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
+
+int test_mutable_dispatch_image_2d_arguments(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{
+    using Fixture = MutableDispatchImage2DArguments; // test case to instantiate
+    return MakeAndRunTest<Fixture>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
index cc425a4d6..61600dc90 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
@@ -42,13 +42,13 @@
 // CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR
 // CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR
 
-struct InfoDeviceQuery : public BasicMutableCommandBufferTest
+struct InfoDeviceQuery : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoDeviceQuery(cl_device_id device, cl_context context,
                     cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -71,12 +71,12 @@ struct InfoDeviceQuery : public BasicMutableCommandBufferTest
     }
 };
 
-struct InfoBuffer : public BasicMutableCommandBufferTest
+struct InfoBuffer : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoBuffer(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -108,13 +108,13 @@ struct InfoBuffer : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct PropertiesArray : public BasicMutableCommandBufferTest
+struct PropertiesArray : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     PropertiesArray(cl_device_id device, cl_context context,
                     cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -140,7 +140,7 @@ struct PropertiesArray : public BasicMutableCommandBufferTest
         if (size != sizeof(props) || test_props[0] != props[0]
             || test_props[1] != props[1])
         {
-            log_error("ERROR: Incorrect command buffer returned from "
+            log_error("ERROR: Incorrect properties returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -154,12 +154,12 @@ struct PropertiesArray : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct Kernel : public BasicMutableCommandBufferTest
+struct Kernel : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     Kernel(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -181,7 +181,7 @@ struct Kernel : public BasicMutableCommandBufferTest
         // opaque object.
         if (test_kernel != kernel)
         {
-            log_error("ERROR: Incorrect command buffer returned from "
+            log_error("ERROR: Incorrect kernel returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -195,12 +195,12 @@ struct Kernel : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct Dimensions : public BasicMutableCommandBufferTest
+struct Dimensions : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     Dimensions(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -210,8 +210,7 @@ struct Dimensions : public BasicMutableCommandBufferTest
             &global_work_size, nullptr, 0, nullptr, nullptr, &command);
         test_error(error, "clCommandNDRangeKernelKHR failed");
 
-        size_t test_dimensions;
-
+        cl_uint test_dimensions = 0;
         error = clGetMutableCommandInfoKHR(
             command, CL_MUTABLE_DISPATCH_DIMENSIONS_KHR,
             sizeof(test_dimensions), &test_dimensions, nullptr);
@@ -219,7 +218,7 @@ struct Dimensions : public BasicMutableCommandBufferTest
 
         if (test_dimensions != dimensions)
         {
-            log_error("ERROR: Incorrect command buffer returned from "
+            log_error("ERROR: Incorrect dimensions returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -234,12 +233,12 @@ struct Dimensions : public BasicMutableCommandBufferTest
     const size_t dimensions = 3;
 };
 
-struct InfoType : public BasicMutableCommandBufferTest
+struct InfoType : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoType(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -271,12 +270,12 @@ struct InfoType : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct InfoQueue : public BasicMutableCommandBufferTest
+struct InfoQueue : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoQueue(cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -308,13 +307,13 @@ struct InfoQueue : public BasicMutableCommandBufferTest
     cl_mutable_command_khr command = nullptr;
 };
 
-struct InfoGlobalWorkOffset : public BasicMutableCommandBufferTest
+struct InfoGlobalWorkOffset : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoGlobalWorkOffset(cl_device_id device, cl_context context,
                          cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -330,7 +329,7 @@ struct InfoGlobalWorkOffset : public BasicMutableCommandBufferTest
 
         if (test_global_work_offset != global_work_offset)
         {
-            log_error("ERROR: Wrong size returned from "
+            log_error("ERROR: Wrong global work offset returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -346,13 +345,13 @@ struct InfoGlobalWorkOffset : public BasicMutableCommandBufferTest
     size_t test_global_work_offset = 0;
 };
 
-struct InfoGlobalWorkSize : public BasicMutableCommandBufferTest
+struct InfoGlobalWorkSize : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoGlobalWorkSize(cl_device_id device, cl_context context,
                        cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -368,7 +367,7 @@ struct InfoGlobalWorkSize : public BasicMutableCommandBufferTest
 
         if (test_global_work_size != global_work_size)
         {
-            log_error("ERROR: Wrong size returned from "
+            log_error("ERROR: Wrong global work size returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
@@ -383,13 +382,13 @@ struct InfoGlobalWorkSize : public BasicMutableCommandBufferTest
     size_t test_global_work_size = 0;
 };
 
-struct InfoLocalWorkSize : public BasicMutableCommandBufferTest
+struct InfoLocalWorkSize : public InfoMutableCommandBufferTest
 {
-    using BasicMutableCommandBufferTest::BasicMutableCommandBufferTest;
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
 
     InfoLocalWorkSize(cl_device_id device, cl_context context,
                       cl_command_queue queue)
-        : BasicMutableCommandBufferTest(device, context, queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
     {}
 
     cl_int Run() override
@@ -405,7 +404,7 @@ struct InfoLocalWorkSize : public BasicMutableCommandBufferTest
 
         if (test_local_work_size != local_work_size)
         {
-            log_error("ERROR: Wrong size returned from "
+            log_error("ERROR: Wrong local work size returned from "
                       "clGetMutableCommandInfoKHR.");
             return TEST_FAIL;
         }
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_local_size.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_local_size.cpp
new file mode 100644
index 000000000..22a9da6d5
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_local_size.cpp
@@ -0,0 +1,174 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <extensionHelpers.h>
+#include "typeWrappers.h"
+#include "procs.h"
+#include "testHarness.h"
+#include "mutable_command_basic.h"
+#include <vector>
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases:
+//
+// CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR
+
+// Records an ND-range with a local size of 8, then mutates the global and
+// local work sizes and checks both the queried value and the kernel output.
+struct MutableDispatchLocalSize : public InfoMutableCommandBufferTest
+{
+    using InfoMutableCommandBufferTest::InfoMutableCommandBufferTest;
+
+    MutableDispatchLocalSize(cl_device_id device, cl_context context,
+                             cl_command_queue queue)
+        : InfoMutableCommandBufferTest(device, context, queue)
+    {}
+
+    bool Skip() override
+    {
+        cl_mutable_dispatch_fields_khr mutable_capabilities;
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR;
+
+        return !mutable_support || InfoMutableCommandBufferTest::Skip();
+    }
+
+    cl_int Run() override
+    {
+        const char *local_size_kernel =
+            R"(
+                __kernel void sample_test(__global int *dst)
+            {
+                size_t tid = get_global_id(0);
+                dst[tid] = get_local_size(0);
+            })";
+
+        cl_int error = create_single_kernel_helper(
+            context, &program, &kernel, 1, &local_size_kernel, "sample_test");
+        test_error(error, "Creating kernel failed");
+
+        clMemWrapper stream = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                             sizeToAllocate, nullptr, &error);
+        test_error(error, "Creating test array failed");
+
+        /* Set the arguments */
+        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &stream);
+        test_error(error, "Unable to set indexed kernel arguments");
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &global_work_size, &local_work_size, 0, nullptr, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed.");
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            0 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            nullptr /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            &update_global_size /* global_work_size */,
+            &update_local_size /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(command_buffer, &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clGetMutableCommandInfoKHR(
+            command, CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR,
+            sizeof(info_local_size), &info_local_size, nullptr);
+        test_error(error, "clGetMutableCommandInfoKHR failed");
+
+        if (info_local_size != update_local_size)
+        {
+            log_error("ERROR: Wrong size returned from "
+                      "clGetMutableCommandInfoKHR.");
+            return TEST_FAIL;
+        }
+
+        std::vector<cl_int> resultData(num_elements);
+
+        error = clEnqueueReadBuffer(queue, stream, CL_TRUE, 0, sizeToAllocate,
+                                    resultData.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // Only the first update_global_size items ran with the new local size.
+        for (size_t i = 0; i < num_elements; i++)
+            if (i < update_global_size && update_local_size != resultData[i])
+            {
+                log_error("Data failed to verify: update_local_size != "
+                          "resultData[%zu]=%d\n",
+                          i, resultData[i]);
+                return TEST_FAIL;
+            }
+            else if (i >= update_global_size
+                     && local_work_size != resultData[i])
+            {
+                log_error("Data failed to verify: local_work_size != "
+                          "resultData[%zu]=%d\n",
+                          i, resultData[i]);
+                return TEST_FAIL;
+            }
+
+        return CL_SUCCESS;
+    }
+
+    size_t info_local_size = 0;
+    const size_t global_work_size = 16;
+    const size_t local_work_size = 8;
+    const size_t update_global_size = 8;
+    const size_t update_local_size = 4;
+    const size_t sizeToAllocate = 64;
+    const size_t num_elements = sizeToAllocate / sizeof(cl_int);
+
+    cl_mutable_command_khr command = nullptr;
+};
+
+int test_mutable_dispatch_local_size(cl_device_id device, cl_context context,
+                                     cl_command_queue queue, int num_elements)
+{
+    using Test = MutableDispatchLocalSize; // fixture under test
+    return MakeAndRunTest<Test>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_out_of_order.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_out_of_order.cpp
new file mode 100644
index 000000000..d507dadfa
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_out_of_order.cpp
@@ -0,0 +1,454 @@
+//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <extensionHelpers.h>
+#include <vector>
+#include "mutable_command_basic.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+////////////////////////////////////////////////////////////////////////////////
+// mutable dispatch tests which handle following cases:
+// - out-of-order queue use
+// - out-of-order queue with simultaneous use
+
+namespace {
+
+// Exercises mutable dispatch argument updates on an out-of-order queue;
+// simultaneous_request additionally asks for simultaneous-use passes.
+template <bool simultaneous_request>
+struct OutOfOrderTest : public BasicMutableCommandBufferTest
+{
+    OutOfOrderTest(cl_device_id device, cl_context context,
+                   cl_command_queue queue)
+        : BasicMutableCommandBufferTest(device, context, queue),
+          out_of_order_queue(nullptr), out_of_order_command_buffer(this),
+          user_event(nullptr), wait_pass_event(nullptr), kernel_fill(nullptr),
+          program_fill(nullptr)
+    {
+        simultaneous_use_requested = simultaneous_request;
+        if (simultaneous_request) buffer_size_multiplier = 2;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int SetUpKernel() override
+    {
+        cl_int error = BasicMutableCommandBufferTest::SetUpKernel();
+        test_error(error, "BasicMutableCommandBufferTest::SetUpKernel failed");
+
+        // create additional kernel to properly prepare output buffer for test
+        const char* kernel_str =
+            R"(
+          __kernel void fill(int pattern, __global int* out, __global int*
+        offset)
+          {
+              size_t id = get_global_id(0);
+              size_t ind = offset[0] + id ;
+              out[ind] = pattern;
+          })";
+
+        error = create_single_kernel_helper_create_program(
+            context, &program_fill, 1, &kernel_str);
+        test_error(error, "Failed to create program with source");
+
+        error =
+            clBuildProgram(program_fill, 1, &device, nullptr, nullptr, nullptr);
+        test_error(error, "Failed to build program");
+
+        kernel_fill = clCreateKernel(program_fill, "fill", &error);
+        test_error(error, "Failed to create fill kernel");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int SetUpKernelArgs() override
+    {
+        cl_int error = BasicMutableCommandBufferTest::SetUpKernelArgs();
+        test_error(error,
+                   "BasicMutableCommandBufferTest::SetUpKernelArgs failed");
+
+        error = clSetKernelArg(kernel_fill, 0, sizeof(cl_int),
+                               &overwritten_pattern);
+        test_error(error, "clSetKernelArg failed");
+
+        error = clSetKernelArg(kernel_fill, 1, sizeof(out_mem), &out_mem);
+        test_error(error, "clSetKernelArg failed");
+
+        error = clSetKernelArg(kernel_fill, 2, sizeof(off_mem), &off_mem);
+        test_error(error, "clSetKernelArg failed");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int SetUp(int elements) override
+    {
+        cl_int error = BasicMutableCommandBufferTest::SetUp(elements);
+        test_error(error, "BasicMutableCommandBufferTest::SetUp failed");
+
+        error = SetUpKernel();
+        test_error(error, "SetUpKernel failed");
+
+        out_of_order_queue = clCreateCommandQueue(
+            context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
+        test_error(error, "Unable to create command queue to test with");
+
+        // Simultaneous passes re-enqueue and update the command buffer while
+        // it is still pending, which requires the simultaneous-use flag.
+        cl_command_buffer_properties_khr flags = CL_COMMAND_BUFFER_MUTABLE_KHR;
+        if (simultaneous_use_support)
+            flags |= CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
+        cl_command_buffer_properties_khr properties[3] = {
+            CL_COMMAND_BUFFER_FLAGS_KHR, flags, 0
+        };
+
+        out_of_order_command_buffer = clCreateCommandBufferKHR(
+            1, &out_of_order_queue, properties, &error);
+        test_error(error, "clCreateCommandBufferKHR failed");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    bool Skip() override
+    {
+        cl_mutable_dispatch_fields_khr mutable_capabilities;
+        bool mutable_support =
+            !clGetDeviceInfo(
+                device, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
+                sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
+            && mutable_capabilities & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR;
+
+        return !out_of_order_support
+            || (simultaneous_use_requested && !simultaneous_use_support)
+            || !mutable_support || BasicMutableCommandBufferTest::Skip();
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int Run() override
+    {
+        cl_int error = CL_SUCCESS;
+        if (simultaneous_use_support)
+        {
+            // enqueue simultaneous command-buffers with out-of-order calls
+            error = RunSimultaneous();
+            test_error(error, "RunSimultaneous failed");
+        }
+        else
+        {
+            // enqueue single command-buffer with out-of-order calls
+            error = RunSingle();
+            test_error(error, "RunSingle failed");
+        }
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int RecordCommandBuffer()
+    {
+        cl_sync_point_khr sync_points[2];
+        const cl_int pattern = pattern_pri;
+        cl_int error =
+            clCommandFillBufferKHR(out_of_order_command_buffer, nullptr, in_mem,
+                                   &pattern, sizeof(cl_int), 0, data_size(), 0,
+                                   nullptr, &sync_points[0], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        error = clCommandFillBufferKHR(out_of_order_command_buffer, nullptr,
+                                       out_mem, &overwritten_pattern,
+                                       sizeof(cl_int), 0, data_size(), 0,
+                                       nullptr, &sync_points[1], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        error = clCommandNDRangeKernelKHR(
+            out_of_order_command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &num_elements, nullptr, 2, sync_points, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(out_of_order_command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int RunSingle()
+    {
+        cl_int error = RecordCommandBuffer();
+        test_error(error, "RecordCommandBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 0, nullptr, &single_event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_int> output_data(num_elements);
+        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
+                                    data_size(), output_data.data(), 1,
+                                    &single_event, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_pri, output_data[i], i);
+        }
+
+        clMemWrapper new_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                                  sizeof(cl_int) * num_elements
+                                                      * buffer_size_multiplier,
+                                                  nullptr, &error);
+        test_error(error, "clCreateBuffer failed");
+
+        cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(new_out_mem),
+                                           &new_out_mem };
+        cl_mutable_dispatch_arg_khr args[] = { arg_1 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(out_of_order_command_buffer,
+                                           &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 0, nullptr, &single_event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clEnqueueReadBuffer(out_of_order_queue, new_out_mem, CL_TRUE, 0,
+                                    data_size(), output_data.data(), 1,
+                                    &single_event, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_pri, output_data[i], i);
+        }
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int RecordSimultaneousCommandBuffer()
+    {
+        cl_sync_point_khr sync_points[2];
+        // for both simultaneous passes this call will fill entire in_mem buffer
+        cl_int error = clCommandFillBufferKHR(
+            out_of_order_command_buffer, nullptr, in_mem, &pattern_pri,
+            sizeof(cl_int), 0, data_size() * buffer_size_multiplier, 0, nullptr,
+            &sync_points[0], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        // to avoid overwriting the entire result buffer instead of filling
+        // only relevant part this additional kernel was introduced
+        error = clCommandNDRangeKernelKHR(out_of_order_command_buffer, nullptr,
+                                          nullptr, kernel_fill, 1, nullptr,
+                                          &num_elements, nullptr, 0, nullptr,
+                                          &sync_points[1], &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clCommandNDRangeKernelKHR(
+            out_of_order_command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &num_elements, nullptr, 2, sync_points, nullptr, &command);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(out_of_order_command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    struct SimulPassData
+    {
+        cl_int offset;
+        std::vector<cl_int> output_buffer;
+        // 0:user event, 1:offset-buffer fill event, 2:kernel done event
+        clEventWrapper wait_events[3];
+    };
+
+    //--------------------------------------------------------------------------
+    cl_int EnqueueSimultaneousPass(SimulPassData& pd)
+    {
+        cl_int error = CL_SUCCESS;
+        if (!user_event)
+        {
+            user_event = clCreateUserEvent(context, &error);
+            test_error(error, "clCreateUserEvent failed");
+        }
+
+        pd.wait_events[0] = user_event;
+
+        // filling offset buffer must wait for previous pass completion
+        error = clEnqueueFillBuffer(
+            out_of_order_queue, off_mem, &pd.offset, sizeof(cl_int), 0,
+            sizeof(cl_int), (wait_pass_event != nullptr ? 1 : 0),
+            (wait_pass_event != nullptr ? &wait_pass_event : nullptr),
+            &pd.wait_events[1]);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        // command buffer execution must wait for two wait-events
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 2, &pd.wait_events[0],
+            &pd.wait_events[2]);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_FALSE,
+                                    pd.offset * sizeof(cl_int), data_size(),
+                                    pd.output_buffer.data(), 1,
+                                    &pd.wait_events[2], nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        clMemWrapper new_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                                  sizeof(cl_int) * num_elements
+                                                      * buffer_size_multiplier,
+                                                  nullptr, &error);
+        test_error(error, "clCreateBuffer failed");
+
+        cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(new_out_mem),
+                                           &new_out_mem };
+        cl_mutable_dispatch_arg_khr args[] = { arg_1 };
+
+        cl_mutable_dispatch_config_khr dispatch_config{
+            CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
+            nullptr,
+            command,
+            1 /* num_args */,
+            0 /* num_svm_arg */,
+            0 /* num_exec_infos */,
+            0 /* work_dim - 0 means no change to dimensions */,
+            args /* arg_list */,
+            nullptr /* arg_svm_list - nullptr means no change*/,
+            nullptr /* exec_info_list */,
+            nullptr /* global_work_offset */,
+            nullptr /* global_work_size */,
+            nullptr /* local_work_size */
+        };
+        cl_mutable_base_config_khr mutable_config{
+            CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1,
+            &dispatch_config
+        };
+
+        error = clUpdateMutableCommandsKHR(out_of_order_command_buffer,
+                                           &mutable_config);
+        test_error(error, "clUpdateMutableCommandsKHR failed");
+
+        // command buffer execution must wait for two wait-events
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 2, &pd.wait_events[0],
+            &pd.wait_events[2]);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clEnqueueReadBuffer(out_of_order_queue, new_out_mem, CL_FALSE,
+                                    pd.offset * sizeof(cl_int), data_size(),
+                                    pd.output_buffer.data(), 1,
+                                    &pd.wait_events[2], nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    cl_int RunSimultaneous()
+    {
+        cl_int error = RecordSimultaneousCommandBuffer();
+        test_error(error, "RecordSimultaneousCommandBuffer failed");
+
+        cl_int offset = static_cast<cl_int>(num_elements);
+
+        std::vector<SimulPassData> simul_passes = {
+            { 0, std::vector<cl_int>(num_elements) },
+            { offset, std::vector<cl_int>(num_elements) }
+        };
+
+        for (auto&& pass : simul_passes)
+        {
+            error = EnqueueSimultaneousPass(pass);
+            test_error(error, "EnqueueSimultaneousPass failed");
+
+            wait_pass_event = pass.wait_events[2];
+        }
+
+        error = clSetUserEventStatus(user_event, CL_COMPLETE);
+        test_error(error, "clSetUserEventStatus failed");
+
+        error = clFinish(out_of_order_queue);
+        test_error(error, "clFinish failed");
+
+        // verify the result buffers
+        for (auto&& pass : simul_passes)
+        {
+            auto& res_data = pass.output_buffer;
+            for (size_t i = 0; i < num_elements; i++)
+            {
+                CHECK_VERIFICATION_ERROR(pattern_pri, res_data[i], i);
+            }
+        }
+
+        return CL_SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    clCommandQueueWrapper out_of_order_queue;
+    clCommandBufferWrapper out_of_order_command_buffer;
+
+    clEventWrapper user_event;
+    clEventWrapper single_event;
+    clEventWrapper wait_pass_event;
+
+    clKernelWrapper kernel_fill;
+    clProgramWrapper program_fill;
+
+    cl_mutable_command_khr command = nullptr;
+
+    const cl_int overwritten_pattern = 0xACDC;
+    const cl_int pattern_pri = 42;
+};
+
+} // anonymous namespace
+
+int test_mutable_dispatch_out_of_order(cl_device_id device, cl_context context,
+                                       cl_command_queue queue, int num_elements)
+{
+    using Test = OutOfOrderTest<false>; // no simultaneous use requested
+    return MakeAndRunTest<Test>(device, context, queue, num_elements);
+}
+
+int test_mutable_dispatch_simultaneous_out_of_order(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements)
+{
+    using Test = OutOfOrderTest<true>; // simultaneous use requested
+    return MakeAndRunTest<Test>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
index 4b6dacb69..1db48917f 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/procs.h
@@ -59,4 +59,51 @@ extern int test_mutable_command_info_global_work_size(cl_device_id device,
                                                       cl_context context,
                                                       cl_command_queue queue,
                                                       int num_elements);
-#endif // CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H
+extern int test_mutable_dispatch_image_1d_arguments(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
+extern int test_mutable_dispatch_image_2d_arguments(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
+extern int test_mutable_dispatch_global_arguments(cl_device_id device,
+                                                  cl_context context,
+                                                  cl_command_queue queue,
+                                                  int num_elements);
+extern int test_mutable_dispatch_local_arguments(cl_device_id device,
+                                                 cl_context context,
+                                                 cl_command_queue queue,
+                                                 int num_elements);
+extern int test_mutable_dispatch_pod_arguments(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements);
+extern int test_mutable_dispatch_null_arguments(cl_device_id device,
+                                                cl_context context,
+                                                cl_command_queue queue,
+                                                int num_elements);
+extern int test_mutable_dispatch_svm_arguments(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements);
+extern int test_mutable_dispatch_out_of_order(cl_device_id device,
+                                              cl_context context,
+                                              cl_command_queue queue,
+                                              int num_elements);
+extern int test_mutable_dispatch_simultaneous_out_of_order(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_mutable_dispatch_global_size(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_mutable_dispatch_local_size(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue,
+                                            int num_elements);
+extern int test_mutable_dispatch_global_offset(cl_device_id device,
+                                               cl_context context,
+                                               cl_command_queue queue,
+                                               int num_elements);
+#endif /*_CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_PROCS_H*/
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_finalize.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_finalize.cpp
new file mode 100644
index 000000000..bd669165c
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_finalize.cpp
@@ -0,0 +1,85 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "basic_command_buffer.h"
+#include "procs.h"
+
+namespace {
+
+// Test that finalizing a command-buffer that has already been finalized returns
+// the correct error code.
+struct FinalizeInvalid : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    cl_int Run() override
+    {
+        cl_int error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        // Finalizing an already finalized command-buffer must return
+        // CL_INVALID_OPERATION
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_failure_error_ret(
+            error, CL_INVALID_OPERATION,
+            "clFinalizeCommandBufferKHR should return CL_INVALID_OPERATION",
+            TEST_FAIL);
+
+        return CL_SUCCESS;
+    }
+};
+
+// Check that an empty command-buffer can be finalized and then executed.
+struct FinalizeEmpty : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    cl_int Run() override
+    {
+        // Finalize an empty command-buffer
+        cl_int error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        // Execute empty command-buffer and then wait to complete
+        clEventWrapper event;
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, &event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clWaitForEvents(1, &event);
+        test_error(error, "clWaitForEvents failed");
+
+        return CL_SUCCESS;
+    }
+};
+} // anonymous namespace
+
+int test_finalize_invalid(cl_device_id device, cl_context context,
+                          cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<FinalizeInvalid>(device, context, queue,
+                                           num_elements);
+}
+
+int test_finalize_empty(cl_device_id device, cl_context context,
+                        cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<FinalizeEmpty>(device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp
index d46b28887..2ad77dbe0 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_get_command_buffer_info.cpp
@@ -26,6 +26,7 @@ enum class CombufInfoTestMode
     CITM_REF_COUNT,
     CITM_STATE,
     CITM_PROP_ARRAY,
+    CITM_CONTEXT,
 };
 
 namespace {
@@ -38,6 +39,7 @@ namespace {
 // -test case for CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR query
 // -test case for CL_COMMAND_BUFFER_STATE_KHR query
 // -test case for CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR query
+// -test case for CL_COMMAND_BUFFER_CONTEXT_KHR query
 
 template <CombufInfoTestMode test_mode>
 struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
@@ -70,6 +72,10 @@ struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
                 error = RunPropArrayInfoTest();
                 test_error(error, "RunPropArrayInfoTest failed");
                 break;
+            case CombufInfoTestMode::CITM_CONTEXT:
+                error = RunContextInfoTest();
+                test_error(error, "RunContextInfoTest failed");
+                break;
         }
 
         return CL_SUCCESS;
@@ -205,8 +211,7 @@ struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
 
         // lambda to verify given state
         auto verify_state = [&](const cl_command_buffer_state_khr &expected) {
-            cl_command_buffer_state_khr state =
-                CL_COMMAND_BUFFER_STATE_INVALID_KHR;
+            cl_command_buffer_state_khr state = ~cl_command_buffer_state_khr(0);
 
             cl_int error = clGetCommandBufferInfoKHR(
                 command_buffer, CL_COMMAND_BUFFER_STATE_KHR, sizeof(state),
@@ -323,6 +328,46 @@ struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
         return TEST_FAIL;
     }
 
+    cl_int RunContextInfoTest()
+    {
+        cl_int error = TEST_PASS;
+
+        // record command buffers
+        error = RecordCommandBuffer();
+        test_error(error, "RecordCommandBuffer failed");
+
+        size_t ret_value_size = 0;
+        error = clGetCommandBufferInfoKHR(command_buffer,
+                                          CL_COMMAND_BUFFER_CONTEXT_KHR, 0,
+                                          nullptr, &ret_value_size);
+        test_error(error, "clGetCommandBufferInfoKHR failed");
+
+        test_assert_error(
+            ret_value_size == sizeof(cl_context),
+            "Unexpected result of CL_COMMAND_BUFFER_CONTEXT_KHR query!");
+
+        cl_context ret_context = nullptr;
+        error = clGetCommandBufferInfoKHR(
+            command_buffer, CL_COMMAND_BUFFER_CONTEXT_KHR, sizeof(cl_context),
+            &ret_context, nullptr);
+        test_error(error, "clGetCommandBufferInfoKHR failed");
+        test_assert_error(
+            ret_context != nullptr,
+            "Unexpected result of CL_COMMAND_BUFFER_CONTEXT_KHR query!");
+
+        cl_context expected_context = nullptr;
+        error =
+            clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context),
+                                  &expected_context, nullptr);
+        test_error(error, "clGetCommandQueueInfo failed");
+
+        test_assert_error(
+            ret_context == expected_context,
+            "Unexpected result of CL_COMMAND_BUFFER_CONTEXT_KHR query!");
+
+        return TEST_PASS;
+    }
+
     const cl_int pattern = 0xE;
 };
 
@@ -360,3 +405,11 @@ int test_info_prop_array(cl_device_id device, cl_context context,
         CommandBufferGetCommandBufferInfo<CombufInfoTestMode::CITM_PROP_ARRAY>>(
         device, context, queue, num_elements);
 }
+
+int test_info_context(cl_device_id device, cl_context context,
+                      cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<
+        CommandBufferGetCommandBufferInfo<CombufInfoTestMode::CITM_CONTEXT>>(
+        device, context, queue, num_elements);
+}
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp
index d73fc9ce7..82ff16f0e 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_barrier.cpp
@@ -70,15 +70,47 @@ struct BarrierWithWaitListKHR : public BasicCommandBufferTest
             0, nullptr, out_of_order_command_buffer, 0, nullptr, &event);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_int> output_data(num_elements);
+        std::vector<cl_int> output_data_1(num_elements);
         error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
-                                    data_size(), output_data.data(), 1, &event,
-                                    nullptr);
+                                    data_size(), output_data_1.data(), 1,
+                                    &event, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < num_elements; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error =
+            clEnqueueFillBuffer(queue, in_mem, &zero_pattern, sizeof(cl_int), 0,
+                                data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error =
+            clEnqueueFillBuffer(queue, out_mem, &zero_pattern, sizeof(cl_int),
+                                0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        // The resets were submitted to `queue`, which has no ordering
+        // guarantee against out_of_order_queue; drain it before the replay.
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        error = clEnqueueCommandBufferKHR(
+            0, nullptr, out_of_order_command_buffer, 0, nullptr, &event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_int> output_data_2(num_elements);
+        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
+                                    data_size(), output_data_2.data(), 1,
+                                    &event, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -106,6 +133,7 @@ struct BarrierWithWaitListKHR : public BasicCommandBufferTest
     }
 
     const cl_int pattern = 0x16;
+    const cl_int zero_pattern = 0x0;
     clCommandQueueWrapper out_of_order_queue;
     clCommandBufferWrapper out_of_order_command_buffer;
     clEventWrapper event;
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp
index 102ae761e..7a1f0e6d5 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_copy.cpp
@@ -38,7 +38,7 @@ struct CopyImageKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillImageKHR(command_buffer, nullptr, src_image,
-                                             fill_color, origin, region, 0,
+                                             fill_color_1, origin, region, 0,
                                              nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillImageKHR failed");
@@ -56,13 +56,40 @@ struct CopyImageKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
-        error = clEnqueueReadImage(queue, dst_image, CL_TRUE, origin, region, 0,
-                                   0, output_data.data(), 0, nullptr, nullptr);
+        std::vector<cl_char> output_data_1(data_size);
+        error =
+            clEnqueueReadImage(queue, dst_image, CL_TRUE, origin, region, 0, 0,
+                               output_data_1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillImage(queue, src_image, fill_color_2, origin,
+                                   region, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueFillImage(queue, dst_image, fill_color_2, origin,
+                                   region, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+        error =
+            clEnqueueReadImage(queue, dst_image, CL_TRUE, origin, region, 0, 0,
+                               output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -97,8 +122,12 @@ struct CopyImageKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_uint pattern = 0x05;
-    const cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+    const cl_uint pattern_1 = 0x05;
+    const cl_uint fill_color_1[4] = { pattern_1, pattern_1, pattern_1,
+                                      pattern_1 };
+    const cl_uint pattern_2 = 0x1;
+    const cl_uint fill_color_2[4] = { pattern_2, pattern_2, pattern_2,
+                                      pattern_2 };
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
     clMemWrapper src_image;
     clMemWrapper dst_image;
@@ -111,7 +140,7 @@ struct CopyBufferKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, in_mem, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, in_mem, &pattern_1, sizeof(cl_char), 0,
             data_size(), 0, nullptr, nullptr, nullptr);
         test_error(error, "clCommandFillBufferKHR failed");
 
@@ -127,20 +156,45 @@ struct CopyBufferKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size());
+        std::vector<cl_char> output_data_1(data_size());
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
+                                    output_data_1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueFillBuffer(queue, out_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size());
         error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_2.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size(); i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
     }
 
-    const cl_char pattern = 0x14;
+    const cl_char pattern_1 = 0x14;
+    const cl_char pattern_2 = 0x28;
 };
 
 struct CopyBufferToImageKHR : public BasicCommandBufferTest
@@ -150,7 +204,7 @@ struct CopyBufferToImageKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, buffer, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, buffer, &pattern_1, sizeof(cl_char), 0,
             data_size, 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillBufferKHR failed");
@@ -168,15 +222,40 @@ struct CopyBufferToImageKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
 
         error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
-                                   output_data.data(), 0, nullptr, nullptr);
+                                   output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadImage failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillBuffer(queue, buffer, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueFillImage(queue, image, fill_color_2, origin, region,
+                                   0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+
+        error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
+                                   output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -211,7 +290,14 @@ struct CopyBufferToImageKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_char pattern = 0x11;
+    const cl_char pattern_1 = 0x11;
+    const cl_char pattern_2 = 0x22;
+
+    const cl_uint fill_color_2[4] = { static_cast<cl_uint>(pattern_2),
+                                      static_cast<cl_uint>(pattern_2),
+                                      static_cast<cl_uint>(pattern_2),
+                                      static_cast<cl_uint>(pattern_2) };
+
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
 
     clMemWrapper buffer;
@@ -225,7 +311,7 @@ struct CopyImageToBufferKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error =
-            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color,
+            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color_1,
                                   origin, region, 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillImageKHR failed");
@@ -243,16 +329,42 @@ struct CopyImageToBufferKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
 
         error = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, data_size,
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_1.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern),
-                                     output_data[i], i);
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillImage(queue, image, fill_color_2, origin, region,
+                                   0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueFillBuffer(queue, buffer, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+
+        error = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, data_size,
+                                    output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -287,8 +396,12 @@ struct CopyImageToBufferKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_uint pattern = 0x12;
-    const cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+    const cl_uint pattern_1 = 0x12;
+    const cl_uint fill_color_1[4] = { pattern_1, pattern_1, pattern_1,
+                                      pattern_1 };
+    const cl_uint pattern_2 = 0x24;
+    const cl_uint fill_color_2[4] = { pattern_2, pattern_2, pattern_2,
+                                      pattern_2 };
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
 
     clMemWrapper image;
@@ -302,7 +415,7 @@ struct CopyBufferRectKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, in_mem, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, in_mem, &pattern_1, sizeof(cl_char), 0,
             data_size, 0, nullptr, nullptr, nullptr);
         test_error(error, "clCommandFillBufferKHR failed");
 
@@ -319,14 +432,38 @@ struct CopyBufferRectKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size,
+                                    output_data_1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillBuffer(queue, in_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueFillBuffer(queue, out_mem, &pattern_2, sizeof(cl_char),
+                                    0, data_size, 0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillBuffer failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
         error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size,
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_2.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -353,7 +490,8 @@ struct CopyBufferRectKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_char pattern = 0x13;
+    const cl_char pattern_1 = 0x13;
+    const cl_char pattern_2 = 0x26;
 
     clMemWrapper in_mem;
     clMemWrapper out_mem;
diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp
index 88e97a271..0ba8055a1 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_fill.cpp
@@ -35,7 +35,7 @@ struct FillImageKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error =
-            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color,
+            clCommandFillImageKHR(command_buffer, nullptr, image, fill_color_1,
                                   origin, region, 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillImageKHR failed");
@@ -47,14 +47,36 @@ struct FillImageKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size);
+        std::vector<cl_char> output_data_1(data_size);
         error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
-                                   output_data.data(), 0, nullptr, nullptr);
+                                   output_data_1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
 
         for (size_t i = 0; i < data_size; i++)
         {
-            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern),
-                                     output_data[i], i);
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        error = clEnqueueFillImage(queue, image, fill_color_2, origin, region,
+                                   0, nullptr, nullptr);
+        test_error(error, "clEnqueueFillImage failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size);
+        error = clEnqueueReadImage(queue, image, CL_TRUE, origin, region, 0, 0,
+                                   output_data_2.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadImage failed");
+
+        for (size_t i = 0; i < data_size; i++)
+        {
+            CHECK_VERIFICATION_ERROR(static_cast<cl_char>(pattern_1),
+                                     output_data_2[i], i);
         }
 
         return CL_SUCCESS;
@@ -85,8 +105,12 @@ struct FillImageKHR : public BasicCommandBufferTest
     const size_t data_size = img_width * img_height * 4 * sizeof(cl_char);
     const size_t origin[3] = { 0, 0, 0 },
                  region[3] = { img_width, img_height, 1 };
-    const cl_uint pattern = 0x10;
-    const cl_uint fill_color[4] = { pattern, pattern, pattern, pattern };
+    const cl_uint pattern_1 = 0x10;
+    const cl_uint fill_color_1[4] = { pattern_1, pattern_1, pattern_1,
+                                      pattern_1 };
+    const cl_uint pattern_2 = 0x20;
+    const cl_uint fill_color_2[4] = { pattern_2, pattern_2, pattern_2,
+                                      pattern_2 };
     const cl_image_format formats = { CL_RGBA, CL_UNSIGNED_INT8 };
 
     clMemWrapper image;
@@ -99,7 +123,7 @@ struct FillBufferKHR : public BasicCommandBufferTest
     cl_int Run() override
     {
         cl_int error = clCommandFillBufferKHR(
-            command_buffer, nullptr, in_mem, &pattern, sizeof(cl_char), 0,
+            command_buffer, nullptr, in_mem, &pattern_1, sizeof(cl_char), 0,
             data_size(), 0, nullptr, nullptr, nullptr);
 
         test_error(error, "clCommandFillBufferKHR failed");
@@ -111,20 +135,40 @@ struct FillBufferKHR : public BasicCommandBufferTest
                                           nullptr, nullptr);
         test_error(error, "clEnqueueCommandBufferKHR failed");
 
-        std::vector<cl_char> output_data(data_size());
+        std::vector<cl_char> output_data_1(data_size());
+        error = clEnqueueReadBuffer(queue, in_mem, CL_TRUE, 0, data_size(),
+                                    output_data_1.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < data_size(); i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_1[i], i);
+        }
+
+        /* Check second enqueue of command buffer */
+
+        clEnqueueFillBuffer(queue, in_mem, &pattern_2, sizeof(cl_char), 0,
+                            data_size(), 0, nullptr, nullptr);
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        std::vector<cl_char> output_data_2(data_size());
         error = clEnqueueReadBuffer(queue, in_mem, CL_TRUE, 0, data_size(),
-                                    output_data.data(), 0, nullptr, nullptr);
+                                    output_data_2.data(), 0, nullptr, nullptr);
         test_error(error, "clEnqueueReadBuffer failed");
 
         for (size_t i = 0; i < data_size(); i++)
         {
-            CHECK_VERIFICATION_ERROR(pattern, output_data[i], i);
+            CHECK_VERIFICATION_ERROR(pattern_1, output_data_2[i], i);
         }
 
         return CL_SUCCESS;
     }
 
-    const char pattern = 0x15;
+    const char pattern_1 = 0x15;
+    const char pattern_2 = 0x30;
 };
 
 };
diff --git a/test_conformance/extensions/cl_khr_command_buffer/main.cpp b/test_conformance/extensions/cl_khr_command_buffer/main.cpp
index 4eefc8ab1..3e923f6cd 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/main.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/main.cpp
@@ -26,6 +26,7 @@ test_definition test_list[] = {
     ADD_TEST(info_ref_count),
     ADD_TEST(info_state),
     ADD_TEST(info_prop_array),
+    ADD_TEST(info_context),
     ADD_TEST(basic_profiling),
     ADD_TEST(simultaneous_profiling),
     ADD_TEST(regular_wait_for_command_buffer),
@@ -58,7 +59,9 @@ test_definition test_list[] = {
     ADD_TEST(event_info_command_queue),
     ADD_TEST(event_info_execution_status),
     ADD_TEST(event_info_context),
-    ADD_TEST(event_info_reference_count)
+    ADD_TEST(event_info_reference_count),
+    ADD_TEST(finalize_invalid),
+    ADD_TEST(finalize_empty)
 };
 
 int main(int argc, const char *argv[])
diff --git a/test_conformance/extensions/cl_khr_command_buffer/procs.h b/test_conformance/extensions/cl_khr_command_buffer/procs.h
index 53a7d9349..cd839cbb0 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/procs.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/procs.h
@@ -41,6 +41,8 @@ extern int test_info_state(cl_device_id device, cl_context context,
                            cl_command_queue queue, int num_elements);
 extern int test_info_prop_array(cl_device_id device, cl_context context,
                                 cl_command_queue queue, int num_elements);
+extern int test_info_context(cl_device_id device, cl_context context,
+                             cl_command_queue queue, int num_elements);
 extern int test_basic_set_kernel_arg(cl_device_id device, cl_context context,
                                      cl_command_queue queue, int num_elements);
 extern int test_pending_set_kernel_arg(cl_device_id device, cl_context context,
@@ -130,5 +132,9 @@ extern int test_event_info_reference_count(cl_device_id device,
                                            cl_context context,
                                            cl_command_queue queue,
                                            int num_elements);
+extern int test_finalize_invalid(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, int num_elements);
+extern int test_finalize_empty(cl_device_id device, cl_context context,
+                               cl_command_queue queue, int num_elements);
 
 #endif // CL_KHR_COMMAND_BUFFER_PROCS_H
diff --git a/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp b/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp
index a7ed307ee..89ab17b38 100644
--- a/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp
+++ b/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp
@@ -120,9 +120,11 @@ int test_external_semaphores_queries(cl_device_id deviceID, cl_context context,
     SEMAPHORE_PARAM_TEST(CL_SEMAPHORE_TYPE_KHR, cl_semaphore_type_khr,
                          CL_SEMAPHORE_TYPE_BINARY_KHR);
 
-    SEMAPHORE_PARAM_TEST(CL_DEVICE_HANDLE_LIST_KHR, cl_uint, 1);
+    SEMAPHORE_PARAM_TEST(CL_DEVICE_HANDLE_LIST_KHR, cl_device_id, deviceID);
 
-    SEMAPHORE_PARAM_TEST(CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR, cl_uint, 1);
+    SEMAPHORE_PARAM_TEST(
+        CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR, cl_uint,
+        getCLSemaphoreTypeFromVulkanType(vkExternalSemaphoreHandleType));
 
     // Confirm that querying CL_SEMAPHORE_CONTEXT_KHR returns the right context
     SEMAPHORE_PARAM_TEST(CL_SEMAPHORE_CONTEXT_KHR, cl_context, context);
@@ -290,7 +292,7 @@ static int semaphore_external_cross_queue_helper(cl_device_id deviceID,
                                      nullptr, 0, nullptr, &wait_event);
     test_error(err, "Could not wait semaphore");
 
-    // Finish queue_1 and queue_2
+    // Finish queue_1 and queue_2
     err = clFinish(queue_1);
     test_error(err, "Could not finish queue");
 
@@ -304,7 +306,7 @@ static int semaphore_external_cross_queue_helper(cl_device_id deviceID,
     return TEST_PASS;
 }
 
-// Confirm that a signal followed by a wait will complete successfully
+// Confirm that a signal followed by a wait will complete successfully
 int test_external_semaphores_simple_1(cl_device_id deviceID, cl_context context,
                                       cl_command_queue defaultQueue,
                                       int num_elements)
@@ -931,420 +933,3 @@ int test_external_semaphores_multi_wait(cl_device_id deviceID,
 
     return TEST_PASS;
 }
-
-// Confirm that it is possible to enqueue a signal of wait and signal in any
-// order as soon as the submission order (after deferred dependencies) is
-// correct. Case: first one deferred wait, then one non deferred signal.
-int test_external_semaphores_order_1(cl_device_id deviceID, cl_context context,
-                                     cl_command_queue defaultQueue,
-                                     int num_elements)
-{
-    if (!is_extension_available(deviceID, "cl_khr_external_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    if (init_vuikan_device())
-    {
-        log_info("Cannot initialise Vulkan. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    VulkanDevice vkDevice;
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-
-    const std::vector<VulkanExternalMemoryHandleType>
-        vkExternalMemoryHandleTypeList =
-            getSupportedVulkanExternalMemoryHandleTypeList();
-    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
-        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
-    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
-
-    clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                 vkExternalSemaphoreHandleType, deviceID);
-
-    cl_int err = CL_SUCCESS;
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create user event
-    clEventWrapper user_event = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Wait semaphore (dependency on user_event)
-    clEventWrapper wait_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                     nullptr, 1, &user_event, &wait_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Signal semaphore
-    clEventWrapper signal_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 0, nullptr, &signal_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure signal event is completed while wait event is not
-    test_assert_event_complete(signal_event);
-    test_assert_event_inprogress(wait_event);
-
-    // Complete user_event
-    err = clSetUserEventStatus(user_event, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Finish
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    // Ensure all events are completed
-    test_assert_event_complete(signal_event);
-    test_assert_event_complete(wait_event);
-
-    return TEST_PASS;
-}
-
-// Confirm that it is possible to enqueue a signal of wait and signal in any
-// order as soon as the submission order (after deferred dependencies) is
-// correct. Case: first two deferred signals, then one deferred wait. Unblock
-// signal, then unblock wait. When wait completes, unblock the other signal.
-int test_external_semaphores_order_2(cl_device_id deviceID, cl_context context,
-                                     cl_command_queue defaultQueue,
-                                     int num_elements)
-{
-    if (!is_extension_available(deviceID, "cl_khr_external_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    if (init_vuikan_device())
-    {
-        log_info("Cannot initialise Vulkan. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    VulkanDevice vkDevice;
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-
-    const std::vector<VulkanExternalMemoryHandleType>
-        vkExternalMemoryHandleTypeList =
-            getSupportedVulkanExternalMemoryHandleTypeList();
-    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
-        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
-    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
-
-    clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                 vkExternalSemaphoreHandleType, deviceID);
-
-    cl_int err = CL_SUCCESS;
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create user events
-    clEventWrapper user_event_1 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_2 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_3 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Signal semaphore (dependency on user_event_1)
-    clEventWrapper signal_1_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 1, &user_event_1,
-                                       &signal_1_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Signal semaphore (dependency on user_event_2)
-    clEventWrapper signal_2_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 1, &user_event_2,
-                                       &signal_2_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Wait semaphore (dependency on user_event_3)
-    clEventWrapper wait_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                     nullptr, 1, &user_event_3, &wait_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Complete user_event_1
-    err = clSetUserEventStatus(user_event_1, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Complete user_event_3
-    err = clSetUserEventStatus(user_event_3, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure all events are completed except for second signal
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_inprogress(signal_2_event);
-    test_assert_event_complete(wait_event);
-
-    // Complete user_event_2
-    err = clSetUserEventStatus(user_event_2, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Finish
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    // Ensure all events are completed
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_complete(signal_2_event);
-    test_assert_event_complete(wait_event);
-
-    return TEST_PASS;
-}
-
-// Confirm that it is possible to enqueue a signal of wait and signal in any
-// order as soon as the submission order (after deferred dependencies) is
-// correct. Case: first two deferred signals, then two deferred waits. Unblock
-// one signal and one wait (both blocked by the same user event). When wait
-// completes, unblock the other signal. Then unblock the other wait.
-int test_external_semaphores_order_3(cl_device_id deviceID, cl_context context,
-                                     cl_command_queue defaultQueue,
-                                     int num_elements)
-{
-    if (!is_extension_available(deviceID, "cl_khr_external_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    if (init_vuikan_device())
-    {
-        log_info("Cannot initialise Vulkan. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    VulkanDevice vkDevice;
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-
-    const std::vector<VulkanExternalMemoryHandleType>
-        vkExternalMemoryHandleTypeList =
-            getSupportedVulkanExternalMemoryHandleTypeList();
-    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
-        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
-    VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
-
-    clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                 vkExternalSemaphoreHandleType, deviceID);
-
-    cl_int err = CL_SUCCESS;
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create user events
-    clEventWrapper user_event_1 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_2 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_3 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Signal semaphore (dependency on user_event_1)
-    clEventWrapper signal_1_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 1, &user_event_1,
-                                       &signal_1_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Signal semaphore (dependency on user_event_2)
-    clEventWrapper signal_2_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                       nullptr, 1, &user_event_2,
-                                       &signal_2_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Wait semaphore (dependency on user_event_3)
-    clEventWrapper wait_1_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                     nullptr, 1, &user_event_3, &wait_1_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Wait semaphore (dependency on user_event_2)
-    clEventWrapper wait_2_event;
-    err = clEnqueueWaitSemaphoresKHR(queue, 1, &sema_ext.getCLSemaphore(),
-                                     nullptr, 1, &user_event_2, &wait_2_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Complete user_event_2
-    err = clSetUserEventStatus(user_event_2, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure only second signal and second wait completed
-    cl_event event_list[] = { signal_2_event, wait_2_event };
-    err = clWaitForEvents(2, event_list);
-    test_error(err, "Could not wait for events");
-
-    test_assert_event_inprogress(signal_1_event);
-    test_assert_event_inprogress(wait_1_event);
-
-    // Complete user_event_1
-    err = clSetUserEventStatus(user_event_1, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Complete user_event_3
-    err = clSetUserEventStatus(user_event_3, CL_COMPLETE);
-    test_error(err, "Could not set user event to CL_COMPLETE");
-
-    // Finish
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    // Ensure all events are completed
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_complete(signal_2_event);
-    test_assert_event_complete(wait_1_event);
-    test_assert_event_complete(wait_2_event);
-
-    return TEST_PASS;
-}
-
-// Test that an invalid semaphore command results in the invalidation of the
-// command's event and the dependencies' events
-int test_external_semaphores_invalid_command(cl_device_id deviceID,
-                                             cl_context context,
-                                             cl_command_queue defaultQueue,
-                                             int num_elements)
-{
-    if (!is_extension_available(deviceID, "cl_khr_external_semaphore"))
-    {
-        log_info("cl_khr_semaphore is not supported on this platoform. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    if (init_vuikan_device())
-    {
-        log_info("Cannot initialise Vulkan. "
-                 "Skipping test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-
-    VulkanDevice vkDevice;
-
-    // Obtain pointers to semaphore's API
-    GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
-    GET_PFN(deviceID, clEnqueueWaitSemaphoresKHR);
-
-    const std::vector<VulkanExternalMemoryHandleType>
-        vkExternalMemoryHandleTypeList =
-            getSupportedVulkanExternalMemoryHandleTypeList();
-    VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType =
-        getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
-    VulkanSemaphore vkVk2CLSemaphore1(vkDevice, vkExternalSemaphoreHandleType);
-    VulkanSemaphore vkVk2CLSemaphore2(vkDevice, vkExternalSemaphoreHandleType);
-
-    clExternalSemaphore sema_ext_1(vkVk2CLSemaphore1, context,
-                                   vkExternalSemaphoreHandleType, deviceID);
-    clExternalSemaphore sema_ext_2(vkVk2CLSemaphore2, context,
-                                   vkExternalSemaphoreHandleType, deviceID);
-
-    cl_int err = CL_SUCCESS;
-
-    // Create ooo queue
-    clCommandQueueWrapper queue = clCreateCommandQueue(
-        context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    test_error(err, "Could not create command queue");
-
-    // Create user events
-    clEventWrapper user_event_1 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    clEventWrapper user_event_2 = clCreateUserEvent(context, &err);
-    test_error(err, "Could not create user event");
-
-    // Signal semaphore_1 (dependency on user_event_1)
-    clEventWrapper signal_1_event;
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext_1.getCLSemaphore(),
-                                       nullptr, 1, &user_event_1,
-                                       &signal_1_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Wait semaphore_1 and semaphore_2 (dependency on user_event_1)
-    clEventWrapper wait_event;
-    cl_semaphore_khr sema_list[] = { sema_ext_1.getCLSemaphore(),
-                                     sema_ext_2.getCLSemaphore() };
-    err = clEnqueueWaitSemaphoresKHR(queue, 2, sema_list, nullptr, 1,
-                                     &user_event_1, &wait_event);
-    test_error(err, "Could not wait semaphore");
-
-    // Signal semaphore_1 (dependency on wait_event and user_event_2)
-    clEventWrapper signal_2_event;
-    cl_event wait_list[] = { user_event_2, wait_event };
-    err = clEnqueueSignalSemaphoresKHR(queue, 1, &sema_ext_1.getCLSemaphore(),
-                                       nullptr, 2, wait_list, &signal_2_event);
-    test_error(err, "Could not signal semaphore");
-
-    // Flush and delay
-    err = clFlush(queue);
-    test_error(err, "Could not flush queue");
-    std::this_thread::sleep_for(std::chrono::seconds(FLUSH_DELAY_S));
-
-    // Ensure all events are not completed
-    test_assert_event_inprogress(signal_1_event);
-    test_assert_event_inprogress(signal_2_event);
-    test_assert_event_inprogress(wait_event);
-
-    // Complete user_event_1 (expect failure as waiting on semaphore_2 is not
-    // allowed (unsignaled)
-    err = clSetUserEventStatus(user_event_1, CL_COMPLETE);
-    test_assert_error(err != CL_SUCCESS,
-                      "signal_2_event completed unexpectedly");
-
-    // Ensure signal_1 is completed while others failed (the second signal
-    // should fail as it depends on wait)
-    err = clFinish(queue);
-    test_error(err, "Could not finish queue");
-
-    test_assert_event_complete(signal_1_event);
-    test_assert_event_terminated(wait_event);
-    test_assert_event_terminated(signal_2_event);
-
-    return TEST_PASS;
-}
diff --git a/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
index 56d15808d..887c9dca7 100644
--- a/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
+++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
@@ -48,8 +48,10 @@ static inline size_t get_format_size(cl_context context,
     cl_image_desc image_desc = { 0 };
     image_desc.image_type = imageType;
 
-    /* Size 1 only to query element size */
-    image_desc.image_width = 1;
+    /* We use a width of 4 to query element size, as this is
+       the smallest possible value that satisfies the requirements
+       of all image formats (including extensions). */
+    image_desc.image_width = 4;
     if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType
         && CL_MEM_OBJECT_IMAGE1D != imageType)
     {
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index 74c5a160a..8d4234087 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -189,12 +189,11 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         // Get that moving
         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        FPU_mode_type oldMode;
+        FPU_mode_type oldMode = 0;
         RoundingMode oldRoundMode = kRoundToNearestEven;
         if (isFract)
         {
             // Calculate the correctly rounded reference result
-            memset(&oldMode, 0, sizeof(oldMode));
             if (ftz || relaxedMode) ForceFTZ(&oldMode);
 
             // Set the rounding mode to match the device
diff --git a/test_conformance/relationals/test_comparisons_fp.cpp b/test_conformance/relationals/test_comparisons_fp.cpp
index c3d8f67a3..73ff3dd9e 100644
--- a/test_conformance/relationals/test_comparisons_fp.cpp
+++ b/test_conformance/relationals/test_comparisons_fp.cpp
@@ -22,6 +22,8 @@
 #include <stdexcept>
 #include <vector>
 
+#include "harness/stringHelpers.h"
+
 #include <CL/cl_half.h>
 
 #include "test_comparisons_fp.h"
@@ -83,29 +85,6 @@ extension,
 // clang-format on
 
 
-std::string concat_kernel(const char* sstr[], int num)
-{
-    std::string res;
-    for (int i = 0; i < num; i++) res += std::string(sstr[i]);
-    return res;
-}
-
-template <typename... Args>
-std::string string_format(const std::string& format, Args... args)
-{
-    int size_s = std::snprintf(nullptr, 0, format.c_str(), args...)
-        + 1; // Extra space for '\0'
-    if (size_s <= 0)
-    {
-        throw std::runtime_error("Error during formatting.");
-    }
-    auto size = static_cast<size_t>(size_s);
-    std::unique_ptr<char[]> buf(new char[size]);
-    std::snprintf(buf.get(), size, format.c_str(), args...);
-    return std::string(buf.get(),
-                       buf.get() + size - 1); // We don't want the '\0' inside
-}
-
 template <typename T, typename F> bool verify(const T& A, const T& B)
 {
     return F()(A, B);
@@ -226,14 +205,14 @@ int RelationalsFPTest::test_equiv_kernel(unsigned int vecSize,
             auto str =
                 concat_kernel(equivTestKerPat_3,
                               sizeof(equivTestKerPat_3) / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str(), opName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str(), opName.c_str());
         }
         else
         {
             auto str = concat_kernel(equivTestKerPatLessGreater_3,
                                      sizeof(equivTestKerPatLessGreater_3)
                                          / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str());
         }
     }
     else
@@ -243,14 +222,14 @@ int RelationalsFPTest::test_equiv_kernel(unsigned int vecSize,
             auto str =
                 concat_kernel(equivTestKernPat,
                               sizeof(equivTestKernPat) / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str(), opName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str(), opName.c_str());
         }
         else
         {
             auto str = concat_kernel(equivTestKernPatLessGreater,
                                      sizeof(equivTestKernPatLessGreater)
                                          / sizeof(const char*));
-            kernelSource = string_format(str, fnName.c_str());
+            kernelSource = str_sprintf(str, fnName.c_str());
         }
     }
 
diff --git a/test_conformance/select/test_select.cpp b/test_conformance/select/test_select.cpp
index b0cda09fd..8a0567c34 100644
--- a/test_conformance/select/test_select.cpp
+++ b/test_conformance/select/test_select.cpp
@@ -14,11 +14,14 @@
 // limitations under the License.
 //
 #include "harness/compat.h"
+#include "harness/typeWrappers.h"
 
 #include <assert.h>
 #include <stdio.h>
 #include <time.h>
 #include <string.h>
+#include <vector>
+
 #if ! defined( _WIN32)
 #if defined(__APPLE__)
 #include <sys/sysctl.h>
@@ -66,6 +69,16 @@ static void printUsage( void );
 #define BUFFER_SIZE (1024*1024)
 #define KPAGESIZE 4096
 
+#define test_error_count(errCode, msg)                                         \
+    {                                                                          \
+        auto errCodeResult = errCode; /* evaluate errCode only once */         \
+        if (errCodeResult != CL_SUCCESS)                                       \
+        {                                                                      \
+            gFailCount++;                                                      \
+            print_error(errCodeResult, msg);                                   \
+            return errCodeResult;                                              \
+        }                                                                      \
+    }
 
 // When we indicate non wimpy mode, the types that are 32 bits value will
 // test their entire range and 64 bits test will test the 32 bit
@@ -74,12 +87,6 @@ static void printUsage( void );
 static bool  s_wimpy_mode = false;
 static int s_wimpy_reduction_factor = 256;
 
-// Tests are broken into the major test which is based on the
-// src and cmp type and their corresponding vector types and
-// sub tests which is for each individual test.  The following
-// tracks the subtests
-int s_test_cnt = 0;
-
 //-----------------------------------------
 // Static helper functions
 //-----------------------------------------
@@ -237,6 +244,9 @@ static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context cont
     if (srctype == kdouble)
         strcpy( extension, "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" );
 
+    if (srctype == khalf)
+        strcpy(extension, "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
+
     // create type name and testname
     switch( vec_len )
     {
@@ -288,25 +298,14 @@ static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context cont
     return program;
 }
 
-
 #define VECTOR_SIZE_COUNT   6
 
 static int doTest(cl_command_queue queue, cl_context context, Type stype, Type cmptype, cl_device_id device)
 {
     int err = CL_SUCCESS;
-    int s_test_fail = 0;
-    MTdataHolder d;
+    MTdataHolder d(gRandomSeed);
     const size_t element_count[VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 };
-    cl_mem src1 = NULL;
-    cl_mem src2 = NULL;
-    cl_mem cmp = NULL;
-    cl_mem dest = NULL;
-    void *ref = NULL;
-    void *sref = NULL;
-    void *src1_host = NULL;
-    void *src2_host = NULL;
-    void *cmp_host = NULL;
-    void *dest_host = NULL;
+    clMemWrapper src1, src2, cmp, dest;
 
     cl_ulong blocks = type_size[stype] * 0x100000000ULL / BUFFER_SIZE;
     size_t block_elements = BUFFER_SIZE / type_size[stype];
@@ -315,16 +314,22 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
 
     // It is more efficient to create the tests all at once since we
     // use the same test data on each of the vector sizes
-    int vecsize;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel  kernels[VECTOR_SIZE_COUNT];
+    clProgramWrapper programs[VECTOR_SIZE_COUNT];
+    clKernelWrapper kernels[VECTOR_SIZE_COUNT];
 
-    if(stype == kdouble && ! is_extension_available( device, "cl_khr_fp64" ))
+    if (stype == kdouble && !is_extension_available(device, "cl_khr_fp64"))
     {
         log_info("Skipping double because cl_khr_fp64 extension is not supported.\n");
         return 0;
     }
 
+    if (stype == khalf && !is_extension_available(device, "cl_khr_fp16"))
+    {
+        log_info(
+            "Skipping half because cl_khr_fp16 extension is not supported.\n");
+        return 0;
+    }
+
     if (gIsEmbedded)
     {
        if (( stype == klong || stype == kulong ) && ! is_extension_available( device, "cles_khr_int64" ))
@@ -340,54 +345,41 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
        }
     }
 
-    for (vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
-    {
-        programs[vecsize] = makeSelectProgram(&kernels[vecsize], context, stype, cmptype, element_count[vecsize] );
-        if (!programs[vecsize] || !kernels[vecsize]) {
-            ++s_test_fail;
-            ++s_test_cnt;
-            return -1;
-        }
-    }
-
-    ref = malloc( BUFFER_SIZE );
-    if( NULL == ref ){ log_error("Error: could not allocate ref buffer\n" ); goto exit; }
-    sref = malloc( BUFFER_SIZE );
-    if( NULL == sref ){ log_error("Error: could not allocate ref buffer\n" ); goto exit; }
     src1 = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate src1 buffer\n" );  ++s_test_fail; goto exit; }
+    test_error_count(err, "Error: could not allocate src1 buffer\n");
     src2 = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate src2 buffer\n" );  ++s_test_fail; goto exit; }
+    test_error_count(err, "Error: could not allocate src2 buffer\n");
     cmp = clCreateBuffer( context, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate cmp buffer\n" );  ++s_test_fail; goto exit; }
+    test_error_count(err, "Error: could not allocate cmp buffer\n");
     dest = clCreateBuffer( context, CL_MEM_WRITE_ONLY, BUFFER_SIZE, NULL, &err );
-    if( err ) { log_error( "Error: could not allocate dest buffer\n" );  ++s_test_fail; goto exit; }
+    test_error_count(err, "Error: could not allocate dest buffer\n");
 
-    src1_host = malloc(BUFFER_SIZE);
-    if (NULL == src1_host)
-    {
-        log_error("Error: could not allocate src1_host buffer\n");
-        goto exit;
-    }
-    src2_host = malloc(BUFFER_SIZE);
-    if (NULL == src2_host)
-    {
-        log_error("Error: could not allocate src2_host buffer\n");
-        goto exit;
-    }
-    cmp_host = malloc(BUFFER_SIZE);
-    if (NULL == cmp_host)
+    for (int vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
     {
-        log_error("Error: could not allocate cmp_host buffer\n");
-        goto exit;
-    }
-    dest_host = malloc(BUFFER_SIZE);
-    if (NULL == dest_host)
-    {
-        log_error("Error: could not allocate dest_host buffer\n");
-        goto exit;
+        programs[vecsize] = makeSelectProgram(&kernels[vecsize], context, stype,
+                                              cmptype, element_count[vecsize]);
+        if (!programs[vecsize] || !kernels[vecsize])
+        {
+            return -1;
+        }
+
+        err = clSetKernelArg(kernels[vecsize], 0, sizeof dest, &dest);
+        test_error_count(err, "Error: Cannot set kernel arg dest!\n");
+        err = clSetKernelArg(kernels[vecsize], 1, sizeof src1, &src1);
+        test_error_count(err, "Error: Cannot set kernel arg src1!\n");
+        err = clSetKernelArg(kernels[vecsize], 2, sizeof src2, &src2);
+        test_error_count(err, "Error: Cannot set kernel arg src2!\n");
+        err = clSetKernelArg(kernels[vecsize], 3, sizeof cmp, &cmp);
+        test_error_count(err, "Error: Cannot set kernel arg cmp!\n");
     }
 
+    std::vector<char> ref(BUFFER_SIZE);
+    std::vector<char> sref(BUFFER_SIZE);
+    std::vector<char> src1_host(BUFFER_SIZE);
+    std::vector<char> src2_host(BUFFER_SIZE);
+    std::vector<char> cmp_host(BUFFER_SIZE);
+    std::vector<char> dest_host(BUFFER_SIZE);
+
     // We block the test as we are running over the range of compare values
     // "block the test" means "break the test into blocks"
     if( type_size[stype] == 4 )
@@ -396,111 +388,63 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
         cmp_stride = block_elements * step * (0xffffffffffffffffULL / 0x100000000ULL + 1);
 
     log_info("Testing...");
-    d = MTdataHolder(gRandomSeed);
     uint64_t i;
+
     for (i=0; i < blocks; i+=step)
     {
-        void *s1 = clEnqueueMapBuffer( queue, src1, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-        if( err ){ log_error( "Error: Could not map src1" ); goto exit; }
-        // Setup the input data to change for each block
-        initSrcBuffer( s1, stype, d);
-
-        void *s2 = clEnqueueMapBuffer( queue, src2, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-        if( err ){ log_error( "Error: Could not map src2" ); goto exit; }
-        // Setup the input data to change for each block
-        initSrcBuffer( s2, stype, d);
-
-        void *s3 = clEnqueueMapBuffer( queue, cmp, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-        if( err ){ log_error( "Error: Could not map cmp" ); goto exit; }
-        // Setup the input data to change for each block
-        initCmpBuffer(s3, cmptype, i * cmp_stride, block_elements);
-
-        if( (err = clEnqueueUnmapMemObject( queue, src1, s1, 0, NULL, NULL )))
-        { log_error( "Error: coult not unmap src1\n" );  ++s_test_fail; goto exit; }
-        if( (err = clEnqueueUnmapMemObject( queue, src2, s2, 0, NULL, NULL )))
-        { log_error( "Error: coult not unmap src2\n" );  ++s_test_fail; goto exit; }
-        if( (err = clEnqueueUnmapMemObject( queue, cmp, s3, 0, NULL, NULL )))
-        { log_error( "Error: coult not unmap cmp\n" );  ++s_test_fail; goto exit; }
-
-        // Create the reference result
-        err = clEnqueueReadBuffer(queue, src1, CL_TRUE, 0, BUFFER_SIZE,
-                                  src1_host, 0, NULL, NULL);
-        if (err)
-        {
-            log_error("Error: Reading buffer from src1 to src1_host failed\n");
-            ++s_test_fail;
-            goto exit;
-        }
-        err = clEnqueueReadBuffer(queue, src2, CL_TRUE, 0, BUFFER_SIZE,
-                                  src2_host, 0, NULL, NULL);
-        if (err)
-        {
-            log_error("Error: Reading buffer from src2 to src2_host failed\n");
-            ++s_test_fail;
-            goto exit;
-        }
-        err = clEnqueueReadBuffer(queue, cmp, CL_TRUE, 0, BUFFER_SIZE, cmp_host,
-                                  0, NULL, NULL);
-        if (err)
-        {
-            log_error("Error: Reading buffer from cmp to cmp_host failed\n");
-            ++s_test_fail;
-            goto exit;
-        }
+        initSrcBuffer(src1_host.data(), stype, d);
+        initSrcBuffer(src2_host.data(), stype, d);
+        initCmpBuffer(cmp_host.data(), cmptype, i * cmp_stride, block_elements);
+
+        err = clEnqueueWriteBuffer(queue, src1, CL_FALSE, 0, BUFFER_SIZE,
+                                   src1_host.data(), 0, NULL, NULL);
+        test_error_count(err, "Error: Could not write src1");
+
+        err = clEnqueueWriteBuffer(queue, src2, CL_FALSE, 0, BUFFER_SIZE,
+                                   src2_host.data(), 0, NULL, NULL);
+        test_error_count(err, "Error: Could not write src2");
+
+        err = clEnqueueWriteBuffer(queue, cmp, CL_FALSE, 0, BUFFER_SIZE,
+                                   cmp_host.data(), 0, NULL, NULL);
+        test_error_count(err, "Error: Could not write cmp");
 
         Select sfunc = (cmptype == ctype[stype][0]) ? vrefSelects[stype][0]
                                                     : vrefSelects[stype][1];
-        (*sfunc)(ref, src1_host, src2_host, cmp_host, block_elements);
+        (*sfunc)(ref.data(), src1_host.data(), src2_host.data(),
+                 cmp_host.data(), block_elements);
 
         sfunc = (cmptype == ctype[stype][0]) ? refSelects[stype][0]
                                              : refSelects[stype][1];
-        (*sfunc)(sref, src1_host, src2_host, cmp_host, block_elements);
+        (*sfunc)(sref.data(), src1_host.data(), src2_host.data(),
+                 cmp_host.data(), block_elements);
 
-        for (vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
+        for (int vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
         {
             size_t vector_size = element_count[vecsize] * type_size[stype];
             size_t vector_count =  (BUFFER_SIZE + vector_size - 1) / vector_size;
 
-            if((err = clSetKernelArg(kernels[vecsize], 0,  sizeof dest, &dest) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-            if((err = clSetKernelArg(kernels[vecsize], 1,  sizeof src1, &src1) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-            if((err = clSetKernelArg(kernels[vecsize], 2,  sizeof src2, &src2) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-            if((err = clSetKernelArg(kernels[vecsize], 3,  sizeof cmp, &cmp) ))
-            { log_error( "Error: Cannot set kernel arg dest! %d\n", err ); ++s_test_fail; goto exit; }
-
-            // Wipe destination
-            void *d = clEnqueueMapBuffer( queue, dest, CL_TRUE, CL_MAP_WRITE, 0, BUFFER_SIZE, 0, NULL, NULL, &err );
-            if( err ){ log_error( "Error: Could not map dest" );  ++s_test_fail; goto exit; }
-            memset( d, -1, BUFFER_SIZE );
-            if( (err = clEnqueueUnmapMemObject( queue, dest, d, 0, NULL, NULL ) ) ){ log_error( "Error: Could not unmap dest" ); ++s_test_fail; goto exit; }
+            const cl_int pattern = -1;
+            err = clEnqueueFillBuffer(queue, dest, &pattern, sizeof(cl_int), 0,
+                                      BUFFER_SIZE, 0, nullptr, nullptr);
+            test_error_count(err, "clEnqueueFillBuffer failed");
+
 
             err = clEnqueueNDRangeKernel(queue, kernels[vecsize], 1, NULL, &vector_count, NULL, 0, NULL, NULL);
-            if (err != CL_SUCCESS) {
-                log_error("clEnqueueNDRangeKernel failed errcode:%d\n", err);
-                ++s_test_fail;
-                goto exit;
-            }
+            test_error_count(err, "clEnqueueNDRangeKernel failed errcode\n");
 
             err = clEnqueueReadBuffer(queue, dest, CL_TRUE, 0, BUFFER_SIZE,
-                                      dest_host, 0, NULL, NULL);
-            if (err)
-            {
-                log_error(
-                    "Error: Reading buffer from dest to dest_host failed\n");
-                ++s_test_fail;
-                goto exit;
-            }
+                                      dest_host.data(), 0, NULL, NULL);
+            test_error_count(
+                err, "Error: Reading buffer from dest to dest_host failed\n");
 
-            if ((*checkResults[stype])(dest_host, vecsize == 0 ? sref : ref,
+            if ((*checkResults[stype])(dest_host.data(),
+                                       vecsize == 0 ? sref.data() : ref.data(),
                                        block_elements, element_count[vecsize])
                 != 0)
             {
                 log_error("vec_size:%d indx: 0x%16.16llx\n",
                           (int)element_count[vecsize], i);
-                ++s_test_fail;
-                goto exit;
+                return TEST_FAIL;
             }
         } // for vecsize
     } // for i
@@ -510,28 +454,6 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
     else
         log_info(" Wimpy Passed\n\n");
 
-exit:
-    if( src1 )  clReleaseMemObject( src1 );
-    if( src2 )  clReleaseMemObject( src2 );
-    if( cmp )   clReleaseMemObject( cmp );
-    if( dest)   clReleaseMemObject( dest );
-    if( ref )   free(ref );
-    if( sref )  free(sref );
-    if (src1_host) free(src1_host);
-    if (src2_host) free(src2_host);
-    if (cmp_host) free(cmp_host);
-    if (dest_host) free(dest_host);
-
-    for (vecsize = 0; vecsize < VECTOR_SIZE_COUNT; vecsize++) {
-        clReleaseKernel(kernels[vecsize]);
-        clReleaseProgram(programs[vecsize]);
-    }
-    ++s_test_cnt;
-    if (s_test_fail)
-    {
-        err = TEST_FAIL;
-        gFailCount++;
-    }
     return err;
 }
 
@@ -567,6 +489,16 @@ int test_select_short_short(cl_device_id deviceID, cl_context context, cl_comman
 {
     return doTest(queue, context, kshort, kshort, deviceID);
 }
+int test_select_half_ushort(cl_device_id deviceID, cl_context context,
+                            cl_command_queue queue, int num_elements)
+{
+    return doTest(queue, context, khalf, kushort, deviceID);
+}
+int test_select_half_short(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements)
+{
+    return doTest(queue, context, khalf, kshort, deviceID);
+}
 int test_select_uint_uint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return doTest(queue, context, kuint, kuint, deviceID);
@@ -617,26 +549,17 @@ int test_select_double_long(cl_device_id deviceID, cl_context context, cl_comman
 }
 
 test_definition test_list[] = {
-    ADD_TEST( select_uchar_uchar ),
-    ADD_TEST( select_uchar_char ),
-    ADD_TEST( select_char_uchar ),
-    ADD_TEST( select_char_char ),
-    ADD_TEST( select_ushort_ushort ),
-    ADD_TEST( select_ushort_short ),
-    ADD_TEST( select_short_ushort ),
-    ADD_TEST( select_short_short ),
-    ADD_TEST( select_uint_uint ),
-    ADD_TEST( select_uint_int ),
-    ADD_TEST( select_int_uint ),
-    ADD_TEST( select_int_int ),
-    ADD_TEST( select_float_uint ),
-    ADD_TEST( select_float_int ),
-    ADD_TEST( select_ulong_ulong ),
-    ADD_TEST( select_ulong_long ),
-    ADD_TEST( select_long_ulong ),
-    ADD_TEST( select_long_long ),
-    ADD_TEST( select_double_ulong ),
-    ADD_TEST( select_double_long ),
+    ADD_TEST(select_uchar_uchar),   ADD_TEST(select_uchar_char),
+    ADD_TEST(select_char_uchar),    ADD_TEST(select_char_char),
+    ADD_TEST(select_ushort_ushort), ADD_TEST(select_ushort_short),
+    ADD_TEST(select_short_ushort),  ADD_TEST(select_short_short),
+    ADD_TEST(select_half_ushort),   ADD_TEST(select_half_short),
+    ADD_TEST(select_uint_uint),     ADD_TEST(select_uint_int),
+    ADD_TEST(select_int_uint),      ADD_TEST(select_int_int),
+    ADD_TEST(select_float_uint),    ADD_TEST(select_float_int),
+    ADD_TEST(select_ulong_ulong),   ADD_TEST(select_ulong_long),
+    ADD_TEST(select_long_ulong),    ADD_TEST(select_long_long),
+    ADD_TEST(select_double_ulong),  ADD_TEST(select_double_long),
 };
 
 const int test_num = ARRAY_SIZE( test_list );
diff --git a/test_conformance/select/test_select.h b/test_conformance/select/test_select.h
index c51ae13c2..5cd786022 100644
--- a/test_conformance/select/test_select.h
+++ b/test_conformance/select/test_select.h
@@ -28,18 +28,20 @@
 #endif
 
 // Defines the set of types we support (no support for double)
-typedef enum {
+typedef enum
+{
     kuchar = 0,
     kchar = 1,
     kushort = 2,
     kshort = 3,
-    kuint = 4,
-    kint = 5,
-    kfloat = 6,
-    kulong = 7,
-    klong = 8,
-    kdouble = 9,
-    kTypeCount  // always goes last
+    khalf = 4,
+    kuint = 5,
+    kint = 6,
+    kfloat = 7,
+    kulong = 8,
+    klong = 9,
+    kdouble = 10,
+    kTypeCount // always goes last
 } Type;
 
 
@@ -56,7 +58,8 @@ extern const size_t type_size[kTypeCount];
 extern const Type ctype[kTypeCount][2];
 
 // Reference functions for the primitive (non vector) type
-typedef void (*Select)(void *dest, void *src1, void *src2, void *cmp, size_t c);
+typedef void (*Select)(void *const dest, const void *const src1,
+                       const void *const src2, const void *const cmp, size_t c);
 extern Select refSelects[kTypeCount][2];
 
 // Reference functions for the primtive type but uses the vector
@@ -64,7 +67,8 @@ extern Select refSelects[kTypeCount][2];
 extern Select vrefSelects[kTypeCount][2];
 
 // Check functions for each output type
-typedef size_t (*CheckResults)(void *out1, void *out2, size_t count, size_t vectorSize);
+typedef size_t (*CheckResults)(const void *const out1, const void *const out2,
+                               size_t count, size_t vectorSize);
 extern CheckResults checkResults[kTypeCount];
 
 // Helpful macros
diff --git a/test_conformance/select/util_select.cpp b/test_conformance/select/util_select.cpp
index f9641e993..b85f54a76 100644
--- a/test_conformance/select/util_select.cpp
+++ b/test_conformance/select/util_select.cpp
@@ -13,7 +13,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
 #include "harness/errorHelpers.h"
 
 #include <stdio.h>
@@ -25,29 +24,28 @@
 //-----------------------------------------
 
 
-const char *type_name[kTypeCount] = {
-    "uchar", "char",
-    "ushort", "short",
-    "uint",   "int",
-    "float",  "ulong", "long", "double" };
+const char *type_name[kTypeCount] = { "uchar", "char", "ushort", "short",
+                                      "half",  "uint", "int",    "float",
+                                      "ulong", "long", "double" };
 
 const size_t type_size[kTypeCount] = {
-    sizeof(cl_uchar), sizeof(cl_char),
-    sizeof(cl_ushort), sizeof(cl_short),
-    sizeof(cl_uint), sizeof(cl_int),
-    sizeof(cl_float), sizeof(cl_ulong), sizeof(cl_long), sizeof( cl_double ) };
+    sizeof(cl_uchar), sizeof(cl_char), sizeof(cl_ushort), sizeof(cl_short),
+    sizeof(cl_half),  sizeof(cl_uint), sizeof(cl_int),    sizeof(cl_float),
+    sizeof(cl_ulong), sizeof(cl_long), sizeof(cl_double)
+};
 
 const Type ctype[kTypeCount][2] = {
-    { kuchar,  kchar },     // uchar
-    { kuchar,  kchar },     // char
-    { kushort, kshort},     // ushort
-    { kushort, kshort},     // short
-    { kuint,   kint  },     // uint
-    { kuint,   kint  },     // int
-    { kuint,   kint  },     // float
-    { kulong,  klong },     // ulong
-    { kulong,  klong },     // long
-    { kulong,  klong }     // double
+    { kuchar, kchar }, // uchar
+    { kuchar, kchar }, // char
+    { kushort, kshort }, // ushort
+    { kushort, kshort }, // short
+    { kushort, kshort }, // half
+    { kuint, kint }, // uint
+    { kuint, kint }, // int
+    { kuint, kint }, // float
+    { kulong, klong }, // ulong
+    { kulong, klong }, // long
+    { kulong, klong } // double
 };
 
 
@@ -55,510 +53,594 @@ const Type ctype[kTypeCount][2] = {
 // Reference functions
 //-----------------------------------------
 
-void refselect_1i8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i8(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y, *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_char*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i) {
         d[i] = m[i] ? y[i] : x[i];
     }
 }
 
-void refselect_1u8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u8(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y;
-    cl_char *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_char*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i) {
         d[i] = m[i] ? y[i] : x[i];
     }
 }
 
-void refselect_1i16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i16(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y, *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_short*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_short *const m = (cl_short *)cmp;
 
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u16(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y;
-    cl_short *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*) src1;
-    y = (cl_ushort*) src2;
-    m = (cl_short*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_short *const m = (cl_short *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i32(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i32(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y, *m;
-    d = (cl_int*)dest;
-    x = (cl_int*)src1;
-    y = (cl_int*)src2;
-    m = (cl_int*)cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u32(void *dest, void *src1, void *src2, void *cmp, size_t count){
+void refselect_1u32(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_int *m;
-    d = (cl_uint*)dest;
-    x = (cl_uint*)src1;
-    y = (cl_uint*)src2;
-    m = (cl_int*)cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i64(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y, *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_long*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u64(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_long *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_long*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i8u(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y;
-    cl_uchar *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_uchar*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u8u(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y, *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_uchar*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i16u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y;
-    cl_ushort *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_ushort*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u16u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y, *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*) src1;
-    y = (cl_ushort*) src2;
-    m = (cl_ushort*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i32u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y;
-    cl_uint *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_uint*) cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u32u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y, *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1i64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1i64u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_ulong*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_1u64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_1u64u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y, *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_ulong*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ffi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_hhi(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
+    cl_short *const out = static_cast<cl_short *>(dest);
+    const cl_short *const a = static_cast<const cl_short *>(src1);
+    const cl_short *const b = static_cast<const cl_short *>(src2);
+    const cl_short *const mask = static_cast<const cl_short *>(cmp);
+    // Per element: a non-zero mask value picks src2, a zero value picks src1.
+    for (size_t i = 0; i < count; ++i) out[i] = mask[i] ? b[i] : a[i];
+}
+
+void refselect_hhu(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y;
-    cl_int *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_int*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
+    for (i = 0; i < count; ++i) d[i] = m[i] ? y[i] : x[i];
+}
+
+void refselect_ffi(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
+    size_t i;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ffu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_ffu(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_uint *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ddi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_ddi(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_long *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_long*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void refselect_ddu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void refselect_ddu(void *const dest, const void *const src1,
+                   const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_ulong*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = m[i] ? y[i] : x[i];
 }
 
-void vrefselect_1i8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i8(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y, *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_char*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80) ? y[i] : x[i];
 }
 
-void vrefselect_1u8(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u8(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y;
-    cl_char *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_char*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_char *const m = (cl_char *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80) ? y[i] : x[i];
 }
 
-void vrefselect_1i16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i16(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y, *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_short*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_short *const m = (cl_short *)cmp;
 
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000) ? y[i] : x[i];
 }
 
-void vrefselect_1u16(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u16(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y;
-    cl_short *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*)src1;
-    y = (cl_ushort*)src2;
-    m = (cl_short*)cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_short *const m = (cl_short *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000) ? y[i] : x[i];
 }
 
-void vrefselect_1i32(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i32(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y, *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_int*) cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000) ? y[i] : x[i];
 }
 
-void vrefselect_1u32(void *dest, void *src1, void *src2, void *cmp, size_t count){
+void vrefselect_1u32(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_int *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_int*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_int *const m = (cl_int *)cmp;
 
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000) ? y[i] : x[i];
 }
 
-void vrefselect_1i64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i64(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y, *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_long*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000LL) ? y[i] : x[i];
 }
 
-void vrefselect_1u64(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u64(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_long *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_long*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000LL) ? y[i] : x[i];
 }
 
-void vrefselect_1i8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i8u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_char *d, *x, *y;
-    cl_uchar *m;
-    d = (cl_char*) dest;
-    x = (cl_char*) src1;
-    y = (cl_char*) src2;
-    m = (cl_uchar*) cmp;
+    cl_char *const d = (cl_char *)dest;
+    const cl_char *const x = (cl_char *)src1;
+    const cl_char *const y = (cl_char *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80U) ? y[i] : x[i];
 }
 
-void vrefselect_1u8u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u8u(void *const dest, const void *const src1,
+                     const void *const src2, const void *const cmp,
+                     size_t count)
+{
     size_t i;
-    cl_uchar *d, *x, *y, *m;
-    d = (cl_uchar*) dest;
-    x = (cl_uchar*) src1;
-    y = (cl_uchar*) src2;
-    m = (cl_uchar*) cmp;
+    cl_uchar *const d = (cl_uchar *)dest;
+    const cl_uchar *const x = (cl_uchar *)src1;
+    const cl_uchar *const y = (cl_uchar *)src2;
+    const cl_uchar *const m = (cl_uchar *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80U) ? y[i] : x[i];
 }
 
-void vrefselect_1i16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i16u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_short *d, *x, *y;
-    cl_ushort *m;
-    d = (cl_short*) dest;
-    x = (cl_short*) src1;
-    y = (cl_short*) src2;
-    m = (cl_ushort*) cmp;
+    cl_short *const d = (cl_short *)dest;
+    const cl_short *const x = (cl_short *)src1;
+    const cl_short *const y = (cl_short *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000U) ? y[i] : x[i];
 }
 
-void vrefselect_1u16u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u16u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_ushort *d, *x, *y, *m;
-    d = (cl_ushort*) dest;
-    x = (cl_ushort*) src1;
-    y = (cl_ushort*) src2;
-    m = (cl_ushort*) cmp;
+    cl_ushort *const d = (cl_ushort *)dest;
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000U) ? y[i] : x[i];
 }
 
-void vrefselect_1i32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i32u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_int *d, *x, *y;
-    cl_uint *m;
-    d = (cl_int*) dest;
-    x = (cl_int*) src1;
-    y = (cl_int*) src2;
-    m = (cl_uint*) cmp;
+    cl_int *const d = (cl_int *)dest;
+    const cl_int *const x = (cl_int *)src1;
+    const cl_int *const y = (cl_int *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000U) ? y[i] : x[i];
 }
 
-void vrefselect_1u32u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u32u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y, *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000U) ? y[i] : x[i];
 }
 
-void vrefselect_1i64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1i64u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_long *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_long*) dest;
-    x = (cl_long*) src1;
-    y = (cl_long*) src2;
-    m = (cl_ulong*) cmp;
+    cl_long *const d = (cl_long *)dest;
+    const cl_long *const x = (cl_long *)src1;
+    const cl_long *const y = (cl_long *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000ULL) ? y[i] : x[i];
 }
 
-void vrefselect_1u64u(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_1u64u(void *const dest, const void *const src1,
+                      const void *const src2, const void *const cmp,
+                      size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y, *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_ulong*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000ULL) ? y[i] : x[i];
 }
 
-void vrefselect_ffi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_hhi(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{ // Vector-form reference select for half data, signed 16-bit mask.
+    size_t i;
+    cl_ushort *const d = (cl_ushort *)dest; // halfs are moved as raw 16 bits
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_short *const m = (cl_short *)cmp; // only bit 15 is tested
+    for (i = 0; i < count; ++i) d[i] = (m[i] & 0x8000) ? y[i] : x[i];
+}
+
+void vrefselect_hhu(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{ // Vector-form reference select for half data, unsigned 16-bit mask.
+    size_t i;
+    cl_ushort *const d = (cl_ushort *)dest; // halfs are moved as raw 16 bits
+    const cl_ushort *const x = (cl_ushort *)src1;
+    const cl_ushort *const y = (cl_ushort *)src2;
+    const cl_ushort *const m = (cl_ushort *)cmp; // only bit 15 is tested
+    for (i = 0; i < count; ++i) d[i] = (m[i] & 0x8000U) ? y[i] : x[i];
+}
+
+void vrefselect_ffi(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_int *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_int*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_int *const m = (cl_int *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000) ? y[i] : x[i];
 }
 
-void vrefselect_ffu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_ffu(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_uint *d, *x, *y;
-    cl_uint *m;
-    d = (cl_uint*) dest;
-    x = (cl_uint*) src1;
-    y = (cl_uint*) src2;
-    m = (cl_uint*) cmp;
+    cl_uint *const d = (cl_uint *)dest;
+    const cl_uint *const x = (cl_uint *)src1;
+    const cl_uint *const y = (cl_uint *)src2;
+    const cl_uint *const m = (cl_uint *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x80000000U) ? y[i] : x[i];
 }
 
-void vrefselect_ddi(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_ddi(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_long *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_long*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_long *const m = (cl_long *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000LL) ? y[i] : x[i];
 }
 
-void vrefselect_ddu(void *dest, void *src1, void *src2, void *cmp, size_t count) {
+void vrefselect_ddu(void *const dest, const void *const src1,
+                    const void *const src2, const void *const cmp, size_t count)
+{
     size_t i;
-    cl_ulong *d, *x, *y;
-    cl_ulong *m;
-    d = (cl_ulong*) dest;
-    x = (cl_ulong*) src1;
-    y = (cl_ulong*) src2;
-    m = (cl_ulong*) cmp;
+    cl_ulong *const d = (cl_ulong *)dest;
+    const cl_ulong *const x = (cl_ulong *)src1;
+    const cl_ulong *const y = (cl_ulong *)src2;
+    const cl_ulong *const m = (cl_ulong *)cmp;
     for (i=0; i < count; ++i)
         d[i] = (m[i] & 0x8000000000000000ULL) ? y[i] : x[i];
 }
 
 // Define refSelects
-Select refSelects[kTypeCount][2] =  {
-    { refselect_1u8u,  refselect_1u8  }, // cl_uchar
-    { refselect_1i8u,  refselect_1i8  }, // char
+Select refSelects[kTypeCount][2] = {
+    { refselect_1u8u, refselect_1u8 }, // cl_uchar
+    { refselect_1i8u, refselect_1i8 }, // char
     { refselect_1u16u, refselect_1u16 }, // ushort
     { refselect_1i16u, refselect_1i16 }, // short
+    { refselect_hhu, refselect_hhi }, // half
     { refselect_1u32u, refselect_1u32 }, // uint
     { refselect_1i32u, refselect_1i32 }, // int
-    { refselect_ffu,   refselect_ffi  }, // float
+    { refselect_ffu, refselect_ffi }, // float
     { refselect_1u64u, refselect_1u64 }, // ulong
     { refselect_1i64u, refselect_1i64 }, // long
-    { refselect_ddu,   refselect_ddi }   // double
+    { refselect_ddu, refselect_ddi } // double
 };
 
 // Define vrefSelects (vector refSelects)
-Select vrefSelects[kTypeCount][2] =  {
-    { vrefselect_1u8u,  vrefselect_1u8  }, // cl_uchar
-    { vrefselect_1i8u,  vrefselect_1i8  }, // char
+Select vrefSelects[kTypeCount][2] = {
+    { vrefselect_1u8u, vrefselect_1u8 }, // cl_uchar
+    { vrefselect_1i8u, vrefselect_1i8 }, // char
     { vrefselect_1u16u, vrefselect_1u16 }, // ushort
     { vrefselect_1i16u, vrefselect_1i16 }, // short
+    { vrefselect_hhu, vrefselect_hhi }, // half
     { vrefselect_1u32u, vrefselect_1u32 }, // uint
     { vrefselect_1i32u, vrefselect_1i32 }, // int
-    { vrefselect_ffu,   vrefselect_ffi  }, // float
+    { vrefselect_ffu, vrefselect_ffi }, // float
     { vrefselect_1u64u, vrefselect_1u64 }, // ulong
     { vrefselect_1i64u, vrefselect_1i64 }, // long
-    { vrefselect_ddu,   vrefselect_ddi  }     // double
+    { vrefselect_ddu, vrefselect_ddi } // double
 };
 
 
 //-----------------------------------------
 // Check functions
 //-----------------------------------------
-size_t check_uchar(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_uchar *t = (const cl_uchar *) test;
-    const cl_uchar *c = (const cl_uchar *) correct;
+size_t check_uchar(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_uchar *const t = (const cl_uchar *)test;
+    const cl_uchar *const c = (const cl_uchar *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -576,9 +658,11 @@ size_t check_uchar(void *test, void *correct, size_t count, size_t vector_size)
     return 0;
 }
 
-size_t check_char(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_char *t = (const cl_char *) test;
-    const cl_char *c = (const cl_char *) correct;
+size_t check_char(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_char *const t = (const cl_char *)test;
+    const cl_char *const c = (const cl_char *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -597,9 +681,11 @@ size_t check_char(void *test, void *correct, size_t count, size_t vector_size) {
     return 0;
 }
 
-size_t check_ushort(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_ushort *t = (const cl_ushort *) test;
-    const cl_ushort *c = (const cl_ushort *) correct;
+size_t check_ushort(const void *const test, const void *const correct,
+                    size_t count, size_t vector_size)
+{
+    const cl_ushort *const t = (const cl_ushort *)test;
+    const cl_ushort *const c = (const cl_ushort *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -618,9 +704,11 @@ size_t check_ushort(void *test, void *correct, size_t count, size_t vector_size)
     return 0;
 }
 
-size_t check_short(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_short *t = (const cl_short *) test;
-    const cl_short *c = (const cl_short *) correct;
+size_t check_short(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_short *const t = (const cl_short *)test;
+    const cl_short *const c = (const cl_short *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -639,9 +727,11 @@ size_t check_short(void *test, void *correct, size_t count, size_t vector_size)
     return 0;
 }
 
-size_t check_uint(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_uint *t = (const cl_uint *) test;
-    const cl_uint *c = (const cl_uint *) correct;
+size_t check_uint(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_uint *const t = (const cl_uint *)test;
+    const cl_uint *const c = (const cl_uint *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -660,9 +750,11 @@ size_t check_uint(void *test, void *correct, size_t count, size_t vector_size) {
     return 0;
 }
 
-size_t check_int(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_int *t = (const cl_int *) test;
-    const cl_int *c = (const cl_int *) correct;
+size_t check_int(const void *const test, const void *const correct,
+                 size_t count, size_t vector_size)
+{
+    const cl_int *const t = (const cl_int *)test;
+    const cl_int *const c = (const cl_int *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -682,9 +774,11 @@ size_t check_int(void *test, void *correct, size_t count, size_t vector_size) {
     return 0;
 }
 
-size_t check_ulong(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_ulong *t = (const cl_ulong *) test;
-    const cl_ulong *c = (const cl_ulong *) correct;
+size_t check_ulong(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_ulong *const t = (const cl_ulong *)test;
+    const cl_ulong *const c = (const cl_ulong *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -703,9 +797,11 @@ size_t check_ulong(void *test, void *correct, size_t count, size_t vector_size)
     return 0;
 }
 
-size_t check_long(void *test, void *correct, size_t count, size_t vector_size) {
-    const cl_long *t = (const cl_long *) test;
-    const cl_long *c = (const cl_long *) correct;
+size_t check_long(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_long *const t = (const cl_long *)test;
+    const cl_long *const c = (const cl_long *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -724,9 +820,36 @@ size_t check_long(void *test, void *correct, size_t count, size_t vector_size) {
     return 0;
 }
 
-size_t check_float( void *test, void *correct, size_t count, size_t vector_size ) {
-    const cl_uint *t = (const cl_uint *) test;
-    const cl_uint *c = (const cl_uint *) correct;
+// Bitwise half comparison (NaNs match); returns 0 or 1 + first bad index.
+size_t check_half(const void *const test, const void *const correct,
+                  size_t count, size_t vector_size)
+{
+    const cl_ushort *const t = (const cl_ushort *)test;
+    const cl_ushort *const c = (const cl_ushort *)correct;
+
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        // cl_half is an integer type: test fp16 NaN bits instead of isnan().
+        for (size_t i = 0; i < count; i++) /* NaNs may differ in payload */
+            if ((t[i] != c[i])
+                && !(((c[i] & 0x7fff) > 0x7c00)
+                     && ((t[i] & 0x7fff) > 0x7c00)))
+            {
+                log_error("\n(check_half) Error for vector size %zu found at "
+                          "0x%8.8zx (of 0x%8.8zx):  "
+                          "*0x%4.4x vs 0x%4.4x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
+    return 0;
+}
+
+size_t check_float(const void *const test, const void *const correct,
+                   size_t count, size_t vector_size)
+{
+    const cl_uint *const t = (const cl_uint *)test;
+    const cl_uint *const c = (const cl_uint *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -746,9 +869,11 @@ size_t check_float( void *test, void *correct, size_t count, size_t vector_size
     return 0;
 }
 
-size_t check_double( void *test, void *correct, size_t count, size_t vector_size ) {
-    const cl_ulong *t = (const cl_ulong *) test;
-    const cl_ulong *c = (const cl_ulong *) correct;
+size_t check_double(const void *const test, const void *const correct,
+                    size_t count, size_t vector_size)
+{
+    const cl_ulong *const t = (const cl_ulong *)test;
+    const cl_ulong *const c = (const cl_ulong *)correct;
     size_t i;
 
     if (memcmp(t, c, count * sizeof(c[0])) != 0)
@@ -770,5 +895,7 @@ size_t check_double( void *test, void *correct, size_t count, size_t vector_size
 }
 
 CheckResults checkResults[kTypeCount] = {
-    check_uchar, check_char, check_ushort, check_short, check_uint,
-    check_int, check_float, check_ulong, check_long, check_double };
+    check_uchar, check_char, check_ushort, check_short,
+    check_half,  check_uint, check_int,    check_float,
+    check_ulong, check_long, check_double
+};
diff --git a/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32 b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32
new file mode 100644
index 000000000..491271874
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm32
@@ -0,0 +1,35 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 17
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "op_neg_half" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %10 = OpTypeFunction %void %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %10
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %11 = OpLabel
+         %12 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %13 = OpCompositeExtract %uint %12 0
+         %14 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %13
+         %15 = OpLoad %half %14
+         %16 = OpFNegate %half %15
+               OpStore %14 %16
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64 b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64
new file mode 100644
index 000000000..9c7e3d6df
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/op_neg_half.spvasm64
@@ -0,0 +1,39 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 20
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Float16
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "op_neg_half" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %10 = OpTypeFunction %void %_ptr_CrossWorkgroup_half
+   %ulong_32 = OpConstant %ulong 32
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %10
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+         %12 = OpLabel
+         %13 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0
+         %14 = OpCompositeExtract %ulong %13 0
+         %15 = OpShiftLeftLogical %ulong %14 %ulong_32
+         %16 = OpShiftRightArithmetic %ulong %15 %ulong_32
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %16
+         %18 = OpLoad %half %17
+         %19 = OpFNegate %half %18
+               OpStore %17 %19
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm32 b/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm32
new file mode 100644
index 000000000..985b52622
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm32
@@ -0,0 +1,42 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 22
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Vector16
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "vector_half8_extract" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %4 FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+    %v8half = OpTypeVector %half 8
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+%_ptr_CrossWorkgroup_v8half = OpTypePointer CrossWorkgroup %v8half
+         %13 = OpTypeFunction %void %_ptr_CrossWorkgroup_v8half %_ptr_CrossWorkgroup_half %uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %13
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_v8half ; source vectors
+          %4 = OpFunctionParameter %_ptr_CrossWorkgroup_half ; one extracted scalar per work-item
+         %14 = OpFunctionParameter %uint ; component index to extract
+         %15 = OpLabel
+         %16 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %17 = OpCompositeExtract %uint %16 0
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v8half %in %17
+         %19 = OpLoad %v8half %18
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %4 %17
+         %21 = OpVectorExtractDynamic %half %19 %14
+               OpStore %20 %21
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm64 b/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm64
new file mode 100644
index 000000000..dd14f66c9
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_half8_extract.spvasm64
@@ -0,0 +1,47 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 26
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Vector16
+               OpCapability Float16
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "vector_half8_extract" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %4 FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+   %ulong_32 = OpConstant %ulong 32
+       %uint = OpTypeInt 32 0
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+    %v8half = OpTypeVector %half 8
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+%_ptr_CrossWorkgroup_v8half = OpTypePointer CrossWorkgroup %v8half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_v8half %_ptr_CrossWorkgroup_half %uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %15 ; kernel "vector_half8_extract": %4[id] = %in[id][%16]
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_v8half ; source vectors
+          %4 = OpFunctionParameter %_ptr_CrossWorkgroup_half ; destination scalars
+         %16 = OpFunctionParameter %uint ; run-time component index
+         %17 = OpLabel
+         %18 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0
+         %19 = OpCompositeExtract %ulong %18 0 ; component 0 of the global id
+         %20 = OpShiftLeftLogical %ulong %19 %ulong_32 ; shift left by 32 ...
+         %21 = OpShiftRightArithmetic %ulong %20 %ulong_32 ; ... then arithmetic right by 32: sign-extends the low 32 bits
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v8half %in %21 ; address of element %21 from %in
+         %23 = OpLoad %v8half %22
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %4 %21 ; address of element %21 from %4
+         %25 = OpVectorExtractDynamic %half %23 %16 ; component of %23 selected by %16
+               OpStore %24 %25
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm32 b/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm32
new file mode 100644
index 000000000..278129388
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm32
@@ -0,0 +1,43 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 23
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Vector16
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "vector_half8_insert" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %4 FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+    %v8half = OpTypeVector %half 8
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+%_ptr_CrossWorkgroup_v8half = OpTypePointer CrossWorkgroup %v8half
+         %13 = OpTypeFunction %void %_ptr_CrossWorkgroup_half %_ptr_CrossWorkgroup_v8half %uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %13 ; kernel "vector_half8_insert": %4[id][%14] = %in[id]
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half ; source scalars
+          %4 = OpFunctionParameter %_ptr_CrossWorkgroup_v8half ; vectors updated in place
+         %14 = OpFunctionParameter %uint ; run-time component index
+         %15 = OpLabel
+         %16 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %17 = OpCompositeExtract %uint %16 0 ; component 0 of the global id = element index
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %17 ; address of element %17 from %in
+         %19 = OpLoad %half %18
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v8half %4 %17 ; address of element %17 from %4
+         %21 = OpLoad %v8half %20
+         %22 = OpVectorInsertDynamic %v8half %21 %19 %14 ; copy of %21 with component %14 replaced by %19
+               OpStore %20 %22 ; written back to the address it was loaded from
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm64 b/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm64
new file mode 100644
index 000000000..f140fc253
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_half8_insert.spvasm64
@@ -0,0 +1,48 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 27
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Vector16
+               OpCapability Float16
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "vector_half8_insert" %gl_GlobalInvocationID
+               OpName %in "in"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %in FuncParamAttr NoCapture
+               OpDecorate %4 FuncParamAttr NoCapture
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+   %ulong_32 = OpConstant %ulong 32
+       %uint = OpTypeInt 32 0
+       %void = OpTypeVoid
+      %half = OpTypeFloat 16
+    %v8half = OpTypeVector %half 8
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+%_ptr_CrossWorkgroup_v8half = OpTypePointer CrossWorkgroup %v8half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_half %_ptr_CrossWorkgroup_v8half %uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %15 ; kernel "vector_half8_insert": %4[id][%16] = %in[id]
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half ; source scalars
+          %4 = OpFunctionParameter %_ptr_CrossWorkgroup_v8half ; vectors updated in place
+         %16 = OpFunctionParameter %uint ; run-time component index
+         %17 = OpLabel
+         %18 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0
+         %19 = OpCompositeExtract %ulong %18 0 ; component 0 of the global id
+         %20 = OpShiftLeftLogical %ulong %19 %ulong_32 ; shift left by 32 ...
+         %21 = OpShiftRightArithmetic %ulong %20 %ulong_32 ; ... then arithmetic right by 32: sign-extends the low 32 bits
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %21 ; address of element %21 from %in
+         %23 = OpLoad %half %22
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v8half %4 %21 ; address of element %21 from %4
+         %25 = OpLoad %v8half %24
+         %26 = OpVectorInsertDynamic %v8half %25 %23 %16 ; copy of %25 with component %16 replaced by %23
+               OpStore %24 %26 ; written back to the address it was loaded from
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32 b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32
new file mode 100644
index 000000000..6fda7d8f1
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm32
@@ -0,0 +1,46 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 25
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "vector_times_scalar" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpDecorate %5 FuncParamAttr NoCapture
+          %5 = OpDecorationGroup
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpGroupDecorate %5 %res %lhs %rhs
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+     %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+   %v4half = OpTypeVector %half 4
+%_ptr_CrossWorkgroup_v4half = OpTypePointer CrossWorkgroup %v4half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %15 ; kernel "vector_times_scalar": res[id] = lhs[id] * rhs[id]
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_v4half ; destination vectors
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_v4half ; vector operands
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half ; scalar operands
+         %16 = OpLabel
+         %17 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0
+         %18 = OpCompositeExtract %uint %17 0 ; component 0 of the global id = element index
+         %19 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %lhs %18 ; address of element %18 from %lhs
+         %20 = OpLoad %v4half %19 Aligned 8
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %18 ; address of element %18 from %rhs
+         %22 = OpLoad %half %21 Aligned 2
+         %23 = OpVectorTimesScalar %v4half %20 %22 ; each component of %20 multiplied by scalar %22
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %res %18 ; address of element %18 from %res
+               OpStore %24 %23 Aligned 8
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64 b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64
new file mode 100644
index 000000000..fa2d52210
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/vector_times_scalar_half.spvasm64
@@ -0,0 +1,50 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 28
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Float16
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "vector_times_scalar" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpDecorate %5 FuncParamAttr NoCapture
+          %5 = OpDecorationGroup
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpGroupDecorate %5 %res %lhs %rhs
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+   %ulong_32 = OpConstant %ulong 32
+       %void = OpTypeVoid
+     %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+   %v4half = OpTypeVector %half 4
+%_ptr_CrossWorkgroup_v4half = OpTypePointer CrossWorkgroup %v4half
+         %16 = OpTypeFunction %void %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_v4half %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %16 ; kernel "vector_times_scalar": res[id] = lhs[id] * rhs[id]
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_v4half ; destination vectors
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_v4half ; vector operands
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half ; scalar operands
+         %17 = OpLabel
+         %18 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0
+         %19 = OpCompositeExtract %ulong %18 0 ; component 0 of the global id
+         %20 = OpShiftLeftLogical %ulong %19 %ulong_32 ; shift left by 32 ...
+         %21 = OpShiftRightArithmetic %ulong %20 %ulong_32 ; ... then arithmetic right by 32: sign-extends the low 32 bits
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %lhs %21 ; address of element %21 from %lhs
+         %23 = OpLoad %v4half %22 Aligned 8
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %21 ; address of element %21 from %rhs
+         %25 = OpLoad %half %24 Aligned 2
+         %26 = OpVectorTimesScalar %v4half %23 %25 ; each component of %23 multiplied by scalar %25
+         %27 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_v4half %res %21 ; address of element %21 from %res
+               OpStore %27 %26 Aligned 8
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/test_op_negate.cpp b/test_conformance/spirv_new/test_op_negate.cpp
index e3dc1f349..5009be931 100644
--- a/test_conformance/spirv_new/test_op_negate.cpp
+++ b/test_conformance/spirv_new/test_op_negate.cpp
@@ -32,6 +32,15 @@ int test_negation(cl_device_id deviceID,
             return 0;
         }
     }
+    if (std::string(Tname).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info(
+                "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+            return 0;
+        }
+    }
 
     cl_int err = CL_SUCCESS;
     int num = (int)h_in.size();
@@ -73,29 +82,28 @@ int test_negation(cl_device_id deviceID,
     return 0;
 }
 
-#define TEST_NEGATION(TYPE, Tv, OP, FUNC)       \
-    TEST_SPIRV_FUNC(OP##_##TYPE)                \
-    {                                           \
-        int num = 1 << 20;                      \
-        std::vector<Tv> in(num);                \
-        RandomSeed seed(gRandomSeed);           \
-        for (int i = 0; i < num; i++) {         \
-            in[i] = genrand<Tv>(seed);          \
-        }                                       \
-        return test_negation<Tv>(deviceID,      \
-                                 context,       \
-                                 queue,         \
-                                 #TYPE,         \
-                                 #OP,           \
-                                 in, FUNC);     \
-    }                                           \
+#define TEST_NEGATION(TYPE, Tv, OP, FUNC)                                      \
+    TEST_SPIRV_FUNC(OP##_##TYPE)                                               \
+    {                                                                          \
+        int num = 1 << 20;                                                     \
+        std::vector<Tv> in(num);                                               \
+        RandomSeed seed(gRandomSeed);                                          \
+        for (int i = 0; i < num; i++)                                          \
+        {                                                                      \
+            in[i] = genrand<Tv>(seed);                                         \
+        }                                                                      \
+        return test_negation<Tv>(deviceID, context, queue, #TYPE, #OP, in,     \
+                                 FUNC);                                        \
+    }
 
 
+#define TEST_NEG_HALF TEST_NEGATION(half, cl_half, op_neg, negOpHalf)
 #define TEST_NEG(TYPE)        TEST_NEGATION(TYPE, cl_##TYPE, op_neg, negOp<cl_##TYPE>)
 #define TEST_NOT(TYPE)        TEST_NEGATION(TYPE, cl_##TYPE, op_not, notOp<cl_##TYPE>)
 #define TEST_NEG_VEC(TYPE, N) TEST_NEGATION(TYPE##N, cl_##TYPE##N, op_neg, (negOpVec<cl_##TYPE##N, N>))
 #define TEST_NOT_VEC(TYPE, N) TEST_NEGATION(TYPE##N, cl_##TYPE##N, op_not, (notOpVec<cl_##TYPE##N, N>))
 
+TEST_NEG_HALF
 TEST_NEG(float)
 TEST_NEG(double)
 TEST_NEG(int)
diff --git a/test_conformance/spirv_new/test_op_vector_extract.cpp b/test_conformance/spirv_new/test_op_vector_extract.cpp
index fe1f82538..f77aa7a2e 100644
--- a/test_conformance/spirv_new/test_op_vector_extract.cpp
+++ b/test_conformance/spirv_new/test_op_vector_extract.cpp
@@ -25,6 +25,17 @@ int test_extract(cl_device_id deviceID, cl_context context,
             return 0;
         }
     }
+
+    if (std::string(name).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info(
+                "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+            return 0;
+        }
+    }
+
     cl_int err = CL_SUCCESS;
 
     clProgramWrapper prog;
@@ -76,27 +87,30 @@ int test_extract(cl_device_id deviceID, cl_context context,
     return 0;
 }
 
-#define TEST_VECTOR_EXTRACT(TYPE, N)                        \
-    TEST_SPIRV_FUNC(op_vector_##TYPE##N##_extract)          \
-    {                                                       \
-        typedef cl_##TYPE##N Tv;                            \
-        typedef cl_##TYPE Ts;                               \
-        const int num = 1 << 20;                            \
-        std::vector<Tv> in(num);                            \
-        const char *name = "vector_" #TYPE #N "_extract";   \
-                                                            \
-        RandomSeed seed(gRandomSeed);                       \
-                                                            \
-        for (int i = 0; i < num; i++) {                     \
-            in[i] = genrand<Tv>(seed);                      \
-        }                                                   \
-                                                            \
-        return test_extract<Tv, Ts>(deviceID,               \
-                                    context, queue,         \
-                                    name,                   \
-                                    in, N);                 \
+#define TEST_VECTOR_EXTRACT(TYPE, N)                                           \
+    TEST_SPIRV_FUNC(op_vector_##TYPE##N##_extract)                             \
+    {                                                                          \
+        if (sizeof(cl_##TYPE) == 2)                                            \
+        {                                                                      \
+            PASSIVE_REQUIRE_FP16_SUPPORT(deviceID);                            \
+        }                                                                      \
+        typedef cl_##TYPE##N Tv;                                               \
+        typedef cl_##TYPE Ts;                                                  \
+        const int num = 1 << 20;                                               \
+        std::vector<Tv> in(num);                                               \
+        const char *name = "vector_" #TYPE #N "_extract";                      \
+                                                                               \
+        RandomSeed seed(gRandomSeed);                                          \
+                                                                               \
+        for (int i = 0; i < num; i++)                                          \
+        {                                                                      \
+            in[i] = genrand<Tv>(seed);                                         \
+        }                                                                      \
+                                                                               \
+        return test_extract<Tv, Ts>(deviceID, context, queue, name, in, N);    \
     }
 
+TEST_VECTOR_EXTRACT(half, 8)
 TEST_VECTOR_EXTRACT(int, 4)
 TEST_VECTOR_EXTRACT(float, 4)
 TEST_VECTOR_EXTRACT(long, 2)
diff --git a/test_conformance/spirv_new/test_op_vector_insert.cpp b/test_conformance/spirv_new/test_op_vector_insert.cpp
index 0749c14ab..62fc78cb5 100644
--- a/test_conformance/spirv_new/test_op_vector_insert.cpp
+++ b/test_conformance/spirv_new/test_op_vector_insert.cpp
@@ -25,6 +25,17 @@ int test_insert(cl_device_id deviceID, cl_context context,
             return 0;
         }
     }
+
+    if (std::string(name).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info(
+                "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+            return 0;
+        }
+    }
+
     cl_int err = CL_SUCCESS;
     clProgramWrapper prog;
     err = get_program_with_il(prog, deviceID, context, name);
@@ -94,27 +105,30 @@ int test_insert(cl_device_id deviceID, cl_context context,
     return 0;
 }
 
-#define TEST_VECTOR_INSERT(TYPE, N)                         \
-    TEST_SPIRV_FUNC(op_vector_##TYPE##N##_insert)           \
-    {                                                       \
-        typedef cl_##TYPE##N Tv;                            \
-        typedef cl_##TYPE Ts;                               \
-        const int num = 1 << 20;                            \
-        std::vector<Ts> in(num);                            \
-        const char *name = "vector_" #TYPE #N "_insert";    \
-                                                            \
-        RandomSeed seed(gRandomSeed);                       \
-                                                            \
-        for (int i = 0; i < num; i++) {                     \
-            in[i] = genrand<Ts>(seed);                      \
-        }                                                   \
-                                                            \
-        return test_insert<Ts, Tv>(deviceID,                \
-                                   context, queue,          \
-                                   name,                    \
-                                   in, N);                  \
+#define TEST_VECTOR_INSERT(TYPE, N)                                            \
+    TEST_SPIRV_FUNC(op_vector_##TYPE##N##_insert)                              \
+    {                                                                          \
+        if (sizeof(cl_##TYPE) == 2)                                            \
+        {                                                                      \
+            PASSIVE_REQUIRE_FP16_SUPPORT(deviceID);                            \
+        }                                                                      \
+        typedef cl_##TYPE##N Tv;                                               \
+        typedef cl_##TYPE Ts;                                                  \
+        const int num = 1 << 20;                                               \
+        std::vector<Ts> in(num);                                               \
+        const char *name = "vector_" #TYPE #N "_insert";                       \
+                                                                               \
+        RandomSeed seed(gRandomSeed);                                          \
+                                                                               \
+        for (int i = 0; i < num; i++)                                          \
+        {                                                                      \
+            in[i] = genrand<Ts>(seed);                                         \
+        }                                                                      \
+                                                                               \
+        return test_insert<Ts, Tv>(deviceID, context, queue, name, in, N);     \
     }
 
+TEST_VECTOR_INSERT(half, 8)
 TEST_VECTOR_INSERT(int, 4)
 TEST_VECTOR_INSERT(float, 4)
 TEST_VECTOR_INSERT(long, 2)
diff --git a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
index 0859668cb..0be4e8b71 100644
--- a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
+++ b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
@@ -17,6 +17,8 @@ or Khronos Conformance Test Source License Agreement as executed between Khronos
 #include <sstream>
 #include <string>
 
+using half = cl_half;
+
 template<typename Tv, typename Ts>
 int test_vector_times_scalar(cl_device_id deviceID,
                              cl_context context,
@@ -32,6 +34,16 @@ int test_vector_times_scalar(cl_device_id deviceID,
         }
     }
 
+    if (std::string(Tname).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info("Extension cl_khr_fp16 not supported; skipping half "
+                     "tests.\n");
+            return 0;
+        }
+    }
+
     cl_int err = CL_SUCCESS;
     int num = (int)h_lhs.size();
     size_t lhs_bytes = num * sizeof(Tv);
@@ -171,5 +183,7 @@ int test_vector_times_scalar(cl_device_id deviceID,
                                                 lhs, rhs);      \
     }
 
+
 TEST_VECTOR_TIMES_SCALAR(float, 4)
 TEST_VECTOR_TIMES_SCALAR(double, 4)
+TEST_VECTOR_TIMES_SCALAR(half, 4)
diff --git a/test_conformance/spirv_new/types.hpp b/test_conformance/spirv_new/types.hpp
index e7fceba0c..728b24455 100644
--- a/test_conformance/spirv_new/types.hpp
+++ b/test_conformance/spirv_new/types.hpp
@@ -43,6 +43,8 @@ VEC_NOT_EQ_FUNC(cl_float, 2)
 VEC_NOT_EQ_FUNC(cl_float, 4)
 VEC_NOT_EQ_FUNC(cl_double, 2)
 VEC_NOT_EQ_FUNC(cl_double, 4)
+VEC_NOT_EQ_FUNC(cl_half, 2)
+VEC_NOT_EQ_FUNC(cl_half, 4)
 
 template<typename T>
 bool isNotEqual(const T &lhs, const T &rhs)
@@ -109,6 +111,9 @@ GENRAND_REAL_FUNC(cl_float, 2)
 GENRAND_REAL_FUNC(cl_float, 4)
 GENRAND_REAL_FUNC(cl_double, 2)
 GENRAND_REAL_FUNC(cl_double, 4)
+GENRAND_REAL_FUNC(cl_half, 2)
+GENRAND_REAL_FUNC(cl_half, 4)
+GENRAND_REAL_FUNC(cl_half, 8)
 
 template<> inline cl_half genrandReal<cl_half>(RandomSeed &seed)
 {
@@ -157,6 +162,8 @@ Tv negOp(Tv in)
     return -in;
 }
 
+inline cl_half negOpHalf(cl_half v) { return v ^ 0x8000; } // flip sign bit
+
 template<typename Tv>
 Tv notOp(Tv in)
 {
diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index f779ef370..d9dfc3b8c 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -483,29 +483,30 @@ template <typename Ty, ShuffleOp operation> struct SHF
     static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
                            const WorkGroupParams &test_params)
     {
-        int ii, i, j, k, n;
+        int ii, k;
+        size_t n;
         cl_uint l;
-        int nw = test_params.local_workgroup_size;
-        int ns = test_params.subgroup_size;
+        size_t nw = test_params.local_workgroup_size;
+        size_t ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        int nj = (nw + ns - 1) / ns;
+        size_t nj = (nw + ns - 1) / ns;
         Ty tr, rr;
         ng = ng / nw;
 
         for (k = 0; k < ng; ++k)
         { // for each work_group
-            for (j = 0; j < nw; ++j)
+            for (size_t j = 0; j < nw; ++j)
             { // inside the work_group
                 mx[j] = x[j]; // read host inputs for work_group
                 my[j] = y[j]; // read device outputs for work_group
             }
 
-            for (j = 0; j < nj; ++j)
+            for (size_t j = 0; j < nj; ++j)
             { // for each subgroup
                 ii = j * ns;
                 n = ii + ns > nw ? nw - ii : ns;
 
-                for (i = 0; i < n; ++i)
+                for (size_t i = 0; i < n; ++i)
                 { // inside the subgroup
                   // shuffle index storage
                     int midx = 4 * ii + 4 * i + 2;
diff --git a/test_conformance/subgroups/subhelpers.cpp b/test_conformance/subgroups/subhelpers.cpp
index 11268f640..440cde20f 100644
--- a/test_conformance/subgroups/subhelpers.cpp
+++ b/test_conformance/subgroups/subhelpers.cpp
@@ -206,7 +206,7 @@ void set_last_workgroup_params(int non_uniform_size, int &number_of_subgroups,
 }
 
 void fill_and_shuffle_safe_values(std::vector<cl_ulong> &safe_values,
-                                  int sb_size)
+                                  size_t sb_size)
 {
     // max product is 720, cl_half has enough precision for it
     const std::vector<cl_ulong> non_one_values{ 2, 3, 4, 5, 6 };
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index bcb523cf8..ed92e5d3c 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -44,7 +44,7 @@ cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
 // for each subgroup values defined different values
 // for rest of workitems set 1 shuffle values
 void fill_and_shuffle_safe_values(std::vector<cl_ulong> &safe_values,
-                                  int sb_size);
+                                  size_t sb_size);
 
 struct WorkGroupParams
 {
diff --git a/test_conformance/subgroups/test_workitem.cpp b/test_conformance/subgroups/test_workitem.cpp
index b69f31385..5b2a5eb83 100644
--- a/test_conformance/subgroups/test_workitem.cpp
+++ b/test_conformance/subgroups/test_workitem.cpp
@@ -36,7 +36,7 @@ struct get_test_data
 };
 
 static int check_group(const get_test_data *result, int nw, cl_uint ensg,
-                       int maxwgs)
+                       size_t maxwgs)
 {
     int first = -1;
     int last = -1;
@@ -168,7 +168,7 @@ static int check_group(const get_test_data *result, int nw, cl_uint ensg,
 
         j = (result[first].subGroupSize + 31) / 32 * result[i].subGroupId
             + (result[i].subGroupLocalId >> 5);
-        if (j < sizeof(hit) / 4)
+        if (j < static_cast<int>(sizeof(hit) / 4))
         {
             cl_uint b = 1U << (result[i].subGroupLocalId & 0x1fU);
             if ((hit[j] & b) != 0)
@@ -191,7 +191,7 @@ int test_work_item_functions(cl_device_id device, cl_context context,
     static const size_t lsize = 200;
     int error;
     int i, j, k, q, r, nw;
-    int maxwgs;
+    size_t maxwgs;
     cl_uint ensg;
     size_t global;
     size_t local;
@@ -235,7 +235,7 @@ int test_work_item_functions(cl_device_id device, cl_context context,
     error = get_max_allowed_work_group_size(context, kernel, &local, NULL);
     if (error != 0) return error;
 
-    maxwgs = (int)local;
+    maxwgs = local;
 
     // Limit it a bit so we have muliple work groups
     // Ideally this will still be large enough to give us multiple subgroups
diff --git a/test_conformance/vulkan/main.cpp b/test_conformance/vulkan/main.cpp
index 5901420ad..3d7b30e76 100644
--- a/test_conformance/vulkan/main.cpp
+++ b/test_conformance/vulkan/main.cpp
@@ -52,7 +52,8 @@ static void params_reset()
 }
 
 extern int test_buffer_common(cl_device_id device_, cl_context context_,
-                              cl_command_queue queue_, int numElements_);
+                              cl_command_queue queue_, int numElements_,
+                              float use_fence);
 extern int test_image_common(cl_device_id device_, cl_context context_,
                              cl_command_queue queue_, int numElements_);
 
@@ -61,7 +62,7 @@ int test_buffer_single_queue(cl_device_id device_, cl_context context_,
 {
     params_reset();
     log_info("RUNNING TEST WITH ONE QUEUE...... \n\n");
-    return test_buffer_common(device_, context_, queue_, numElements_);
+    return test_buffer_common(device_, context_, queue_, numElements_, false);
 }
 int test_buffer_multiple_queue(cl_device_id device_, cl_context context_,
                                cl_command_queue queue_, int numElements_)
@@ -69,7 +70,7 @@ int test_buffer_multiple_queue(cl_device_id device_, cl_context context_,
     params_reset();
     numCQ = 2;
     log_info("RUNNING TEST WITH TWO QUEUE...... \n\n");
-    return test_buffer_common(device_, context_, queue_, numElements_);
+    return test_buffer_common(device_, context_, queue_, numElements_, false);
 }
 int test_buffer_multiImport_sameCtx(cl_device_id device_, cl_context context_,
                                     cl_command_queue queue_, int numElements_)
@@ -78,7 +79,7 @@ int test_buffer_multiImport_sameCtx(cl_device_id device_, cl_context context_,
     multiImport = true;
     log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT "
              "IN SAME CONTEXT...... \n\n");
-    return test_buffer_common(device_, context_, queue_, numElements_);
+    return test_buffer_common(device_, context_, queue_, numElements_, false);
 }
 int test_buffer_multiImport_diffCtx(cl_device_id device_, cl_context context_,
                                     cl_command_queue queue_, int numElements_)
@@ -88,7 +89,45 @@ int test_buffer_multiImport_diffCtx(cl_device_id device_, cl_context context_,
     multiCtx = true;
     log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT "
              "IN DIFFERENT CONTEXT...... \n\n");
-    return test_buffer_common(device_, context_, queue_, numElements_);
+    return test_buffer_common(device_, context_, queue_, numElements_, false);
+}
+int test_buffer_single_queue_fence(cl_device_id device_, cl_context context_,
+                                   cl_command_queue queue_, int numElements_)
+{
+    params_reset();
+    log_info("RUNNING TEST WITH ONE QUEUE USING FENCE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_, true);
+}
+int test_buffer_multiple_queue_fence(cl_device_id device_, cl_context context_,
+                                     cl_command_queue queue_, int numElements_)
+{
+    params_reset();
+    numCQ = 2;
+    log_info("RUNNING TEST WITH TWO QUEUE USING FENCE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_, true);
+}
+int test_buffer_multiImport_sameCtx_fence(cl_device_id device_,
+                                          cl_context context_,
+                                          cl_command_queue queue_,
+                                          int numElements_)
+{
+    params_reset();
+    multiImport = true;
+    log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT "
+             "IN SAME CONTEXT USING FENCE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_, true);
+}
+int test_buffer_multiImport_diffCtx_fence(cl_device_id device_,
+                                          cl_context context_,
+                                          cl_command_queue queue_,
+                                          int numElements_)
+{
+    params_reset();
+    multiImport = true;
+    multiCtx = true;
+    log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT "
+             "IN DIFFERENT CONTEXT USING FENCE...... \n\n");
+    return test_buffer_common(device_, context_, queue_, numElements_, true);
 }
 int test_image_single_queue(cl_device_id device_, cl_context context_,
                             cl_command_queue queue_, int numElements_)
@@ -110,6 +149,10 @@ test_definition test_list[] = { ADD_TEST(buffer_single_queue),
                                 ADD_TEST(buffer_multiple_queue),
                                 ADD_TEST(buffer_multiImport_sameCtx),
                                 ADD_TEST(buffer_multiImport_diffCtx),
+                                ADD_TEST(buffer_single_queue_fence),
+                                ADD_TEST(buffer_multiple_queue_fence),
+                                ADD_TEST(buffer_multiImport_sameCtx_fence),
+                                ADD_TEST(buffer_multiImport_diffCtx_fence),
                                 ADD_TEST(image_single_queue),
                                 ADD_TEST(image_multiple_queue),
                                 ADD_TEST(consistency_external_buffer),
diff --git a/test_conformance/vulkan/test_vulkan_interop_buffer.cpp b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
index 9b0bc9de7..5390ef690 100644
--- a/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
+++ b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
@@ -21,6 +21,7 @@
 #include <assert.h>
 #include <vector>
 #include <iostream>
+#include <memory>
 #include <string.h>
 #include "harness/errorHelpers.h"
 
@@ -82,7 +83,8 @@ __kernel void checkKernel(__global unsigned char *ptr, int size, int expVal, __g
 int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
                             cl_command_queue &cmd_queue2, cl_kernel *kernel,
                             cl_kernel &verify_kernel, VulkanDevice &vkDevice,
-                            uint32_t numBuffers, uint32_t bufferSize)
+                            uint32_t numBuffers, uint32_t bufferSize,
+                            bool use_fence)
 {
     int err = CL_SUCCESS;
     size_t global_work_size[1];
@@ -117,6 +119,7 @@ int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
         getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
     VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
     VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    std::shared_ptr<VulkanFence> fence = nullptr;
 
     VulkanQueue &vkQueue = vkDevice.getQueue();
 
@@ -136,10 +139,17 @@ int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
     VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
                                         vkDescriptorSetLayout);
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
-        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
-        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    if (use_fence)
+    {
+        fence = std::make_shared<VulkanFence>(vkDevice);
+    }
+    else
+    {
+        clVk2CLExternalSemaphore = new clExternalSemaphore(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore = new clExternalSemaphore(
+            vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    }
 
     const uint32_t maxIter = innerIterations;
     VulkanCommandPool vkCommandPool(vkDevice);
@@ -227,16 +237,27 @@ int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
             for (uint32_t iter = 0; iter < maxIter; iter++)
             {
 
-                if (iter == 0)
+                if (use_fence)
                 {
-                    vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    fence->reset();
+                    vkQueue.submit(vkCommandBuffer, fence);
+                    fence->wait();
                 }
                 else
                 {
-                    vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                   vkVk2CLSemaphore);
+                    if (iter == 0)
+                    {
+                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    }
+                    else
+                    {
+                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                       vkVk2CLSemaphore);
+                    }
+
+                    clVk2CLExternalSemaphore->wait(cmd_queue1);
                 }
-                clVk2CLExternalSemaphore->wait(cmd_queue1);
+
 
                 err = clSetKernelArg(update_buffer_kernel, 0, sizeof(uint32_t),
                                      (void *)&bufferSize);
@@ -286,7 +307,14 @@ int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
                     goto CLEANUP;
                 }
 
-                if (iter != (maxIter - 1))
+                if (use_fence)
+                {
+                    clFlush(cmd_queue1);
+                    clFlush(cmd_queue2);
+                    clFinish(cmd_queue1);
+                    clFinish(cmd_queue2);
+                }
+                else if (iter != (maxIter - 1))
                 {
                     clCl2VkExternalSemaphore->signal(cmd_queue2);
                 }
@@ -387,8 +415,11 @@ int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
     }
     if (program) clReleaseProgram(program);
     if (kernel_cq) clReleaseKernel(kernel_cq);
-    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
-    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    if (!use_fence)
+    {
+        if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+        if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    }
     if (error_2) free(error_2);
     if (error_1) clReleaseMemObject(error_1);
 
@@ -398,7 +429,7 @@ int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1,
 int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
                             cl_kernel *kernel, cl_kernel &verify_kernel,
                             VulkanDevice &vkDevice, uint32_t numBuffers,
-                            uint32_t bufferSize)
+                            uint32_t bufferSize, bool use_fence)
 {
     log_info("RUNNING TEST WITH ONE QUEUE...... \n\n");
     size_t global_work_size[1];
@@ -416,6 +447,7 @@ int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
         getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
     VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
     VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    std::shared_ptr<VulkanFence> fence = nullptr;
 
     VulkanQueue &vkQueue = vkDevice.getQueue();
 
@@ -434,10 +466,18 @@ int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
     VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
                                         vkDescriptorSetLayout);
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
-        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
-        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    if (use_fence)
+    {
+        fence = std::make_shared<VulkanFence>(vkDevice);
+    }
+    else
+    {
+        clVk2CLExternalSemaphore = new clExternalSemaphore(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore = new clExternalSemaphore(
+            vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    }
+
     const uint32_t maxIter = innerIterations;
     VulkanCommandPool vkCommandPool(vkDevice);
     VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool);
@@ -526,16 +566,26 @@ int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
 
             for (uint32_t iter = 0; iter < maxIter; iter++)
             {
-                if (iter == 0)
+                if (use_fence)
                 {
-                    vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    fence->reset();
+                    vkQueue.submit(vkCommandBuffer, fence);
+                    fence->wait();
                 }
                 else
                 {
-                    vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                   vkVk2CLSemaphore);
+                    if (iter == 0)
+                    {
+                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                    }
+                    else
+                    {
+                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                       vkVk2CLSemaphore);
+                    }
+
+                    clVk2CLExternalSemaphore->wait(cmd_queue1);
                 }
-                clVk2CLExternalSemaphore->wait(cmd_queue1);
 
                 err = clSetKernelArg(update_buffer_kernel, 0, sizeof(uint32_t),
                                      (void *)&bufferSize);
@@ -562,7 +612,12 @@ int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
                                 " error\n");
                     goto CLEANUP;
                 }
-                if (iter != (maxIter - 1))
+                if (use_fence)
+                {
+                    clFlush(cmd_queue1);
+                    clFinish(cmd_queue1);
+                }
+                else if (iter != (maxIter - 1))
                 {
                     clCl2VkExternalSemaphore->signal(cmd_queue1);
                 }
@@ -656,8 +711,13 @@ int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
             delete externalMemory[i];
         }
     }
-    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
-    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+
+    if (!use_fence)
+    {
+        if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+        if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    }
+
     if (error_2) free(error_2);
     if (error_1) clReleaseMemObject(error_1);
     return err;
@@ -666,7 +726,7 @@ int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1,
 int run_test_with_multi_import_same_ctx(
     cl_context &context, cl_command_queue &cmd_queue1, cl_kernel *kernel,
     cl_kernel &verify_kernel, VulkanDevice &vkDevice, uint32_t numBuffers,
-    uint32_t bufferSize, uint32_t bufferSizeForOffset)
+    uint32_t bufferSize, uint32_t bufferSizeForOffset, bool use_fence)
 {
     size_t global_work_size[1];
     uint8_t *error_2;
@@ -687,6 +747,7 @@ int run_test_with_multi_import_same_ctx(
         getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
     VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
     VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    std::shared_ptr<VulkanFence> fence = nullptr;
 
     VulkanQueue &vkQueue = vkDevice.getQueue();
 
@@ -706,10 +767,18 @@ int run_test_with_multi_import_same_ctx(
     VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
                                         vkDescriptorSetLayout);
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
-        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
-        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    if (use_fence)
+    {
+        fence = std::make_shared<VulkanFence>(vkDevice);
+    }
+    else
+    {
+        clVk2CLExternalSemaphore = new clExternalSemaphore(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore = new clExternalSemaphore(
+            vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+    }
+
     const uint32_t maxIter = innerIterations;
     VulkanCommandPool vkCommandPool(vkDevice);
     VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool);
@@ -832,16 +901,34 @@ int run_test_with_multi_import_same_ctx(
 
                 for (uint32_t iter = 0; iter < maxIter; iter++)
                 {
-                    if (iter == 0)
+                    if (use_fence)
                     {
-                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        fence->reset();
+                        vkQueue.submit(vkCommandBuffer, fence);
+                        // Waited on once, in the wait step below.
                     }
                     else
                     {
-                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                       vkVk2CLSemaphore);
+                        if (iter == 0)
+                        {
+                            vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        }
+                        else
+                        {
+                            vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                           vkVk2CLSemaphore);
+                        }
                     }
-                    clVk2CLExternalSemaphore->wait(cmd_queue1);
+
+                    if (use_fence)
+                    {
+                        fence->wait();
+                    }
+                    else
+                    {
+                        clVk2CLExternalSemaphore->wait(cmd_queue1);
+                    }
+
                     for (uint8_t launchIter = 0; launchIter < numImports;
                          launchIter++)
                     {
@@ -874,7 +961,11 @@ int run_test_with_multi_import_same_ctx(
                             goto CLEANUP;
                         }
                     }
-                    if (iter != (maxIter - 1))
+                    if (use_fence)
+                    {
+                        clFinish(cmd_queue1);
+                    }
+                    else if (iter != (maxIter - 1))
                     {
                         clCl2VkExternalSemaphore->signal(cmd_queue1);
                     }
@@ -987,8 +1078,13 @@ int run_test_with_multi_import_same_ctx(
             }
         }
     }
-    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
-    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+
+    if (!use_fence)
+    {
+        if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+        if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+    }
+
     if (error_2) free(error_2);
     if (error_1) clReleaseMemObject(error_1);
     return err;
@@ -998,7 +1094,8 @@ int run_test_with_multi_import_diff_ctx(
     cl_context &context, cl_context &context2, cl_command_queue &cmd_queue1,
     cl_command_queue &cmd_queue2, cl_kernel *kernel1, cl_kernel *kernel2,
     cl_kernel &verify_kernel, cl_kernel verify_kernel2, VulkanDevice &vkDevice,
-    uint32_t numBuffers, uint32_t bufferSize, uint32_t bufferSizeForOffset)
+    uint32_t numBuffers, uint32_t bufferSize, uint32_t bufferSizeForOffset,
+    bool use_fence)
 {
     size_t global_work_size[1];
     uint8_t *error_3;
@@ -1023,6 +1120,7 @@ int run_test_with_multi_import_diff_ctx(
         getSupportedVulkanExternalSemaphoreHandleTypeList()[0];
     VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType);
     VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType);
+    std::shared_ptr<VulkanFence> fence = nullptr;
 
     VulkanQueue &vkQueue = vkDevice.getQueue();
 
@@ -1042,15 +1140,24 @@ int run_test_with_multi_import_diff_ctx(
     VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool,
                                         vkDescriptorSetLayout);
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
-        vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
-        vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-
-    clVk2CLExternalSemaphore2 = new clExternalSemaphore(
-        vkVk2CLSemaphore, context2, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore2 = new clExternalSemaphore(
-        vkCl2VkSemaphore, context2, vkExternalSemaphoreHandleType, deviceId);
+    if (use_fence)
+    {
+        fence = std::make_shared<VulkanFence>(vkDevice);
+    }
+    else
+    {
+        clVk2CLExternalSemaphore = new clExternalSemaphore(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore = new clExternalSemaphore(
+            vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
+
+        clVk2CLExternalSemaphore2 =
+            new clExternalSemaphore(vkVk2CLSemaphore, context2,
+                                    vkExternalSemaphoreHandleType, deviceId);
+        clCl2VkExternalSemaphore2 =
+            new clExternalSemaphore(vkCl2VkSemaphore, context2,
+                                    vkExternalSemaphoreHandleType, deviceId);
+    }
 
     const uint32_t maxIter = innerIterations;
     VulkanCommandPool vkCommandPool(vkDevice);
@@ -1192,16 +1299,33 @@ int run_test_with_multi_import_diff_ctx(
 
                 for (uint32_t iter = 0; iter < maxIter; iter++)
                 {
-                    if (iter == 0)
+                    if (use_fence)
                     {
-                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        fence->reset();
+                        vkQueue.submit(vkCommandBuffer, fence);
+                        // Waited on once, in the wait step below.
                     }
                     else
                     {
-                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                       vkVk2CLSemaphore);
+                        if (iter == 0)
+                        {
+                            vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        }
+                        else
+                        {
+                            vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                           vkVk2CLSemaphore);
+                        }
+                    }
+
+                    if (use_fence)
+                    {
+                        fence->wait();
+                    }
+                    else
+                    {
+                        clVk2CLExternalSemaphore->wait(cmd_queue1);
                     }
-                    clVk2CLExternalSemaphore->wait(cmd_queue1);
 
                     for (uint8_t launchIter = 0; launchIter < numImports;
                          launchIter++)
@@ -1235,7 +1359,11 @@ int run_test_with_multi_import_diff_ctx(
                             goto CLEANUP;
                         }
                     }
-                    if (iter != (maxIter - 1))
+                    if (use_fence)
+                    {
+                        clFinish(cmd_queue1);
+                    }
+                    else if (iter != (maxIter - 1))
                     {
                         clCl2VkExternalSemaphore->signal(cmd_queue1);
                     }
@@ -1243,16 +1371,33 @@ int run_test_with_multi_import_diff_ctx(
                 clFinish(cmd_queue1);
                 for (uint32_t iter = 0; iter < maxIter; iter++)
                 {
-                    if (iter == 0)
+                    if (use_fence)
                     {
-                        vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        fence->reset();
+                        vkQueue.submit(vkCommandBuffer, fence);
+                        // Waited on once, in the wait step below.
                     }
                     else
                     {
-                        vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
-                                       vkVk2CLSemaphore);
+                        if (iter == 0)
+                        {
+                            vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore);
+                        }
+                        else
+                        {
+                            vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer,
+                                           vkVk2CLSemaphore);
+                        }
+                    }
+
+                    if (use_fence)
+                    {
+                        fence->wait();
+                    }
+                    else
+                    {
+                        clVk2CLExternalSemaphore2->wait(cmd_queue2);
                     }
-                    clVk2CLExternalSemaphore2->wait(cmd_queue2);
 
                     for (uint8_t launchIter = 0; launchIter < numImports;
                          launchIter++)
@@ -1286,7 +1431,11 @@ int run_test_with_multi_import_diff_ctx(
                             goto CLEANUP;
                         }
                     }
-                    if (iter != (maxIter - 1))
+                    if (use_fence)
+                    {
+                        clFinish(cmd_queue2);
+                    }
+                    else if (iter != (maxIter - 1))
                     {
                         clCl2VkExternalSemaphore2->signal(cmd_queue2);
                     }
@@ -1474,10 +1623,15 @@ int run_test_with_multi_import_diff_ctx(
             }
         }
     }
-    if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
-    if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
-    if (clVk2CLExternalSemaphore2) delete clVk2CLExternalSemaphore2;
-    if (clCl2VkExternalSemaphore2) delete clCl2VkExternalSemaphore2;
+
+    if (!use_fence)
+    {
+        if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore;
+        if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore;
+        if (clVk2CLExternalSemaphore2) delete clVk2CLExternalSemaphore2;
+        if (clCl2VkExternalSemaphore2) delete clCl2VkExternalSemaphore2;
+    }
+
     if (error_3) free(error_3);
     if (error_1) clReleaseMemObject(error_1);
     if (error_2) clReleaseMemObject(error_2);
@@ -1485,7 +1639,8 @@ int run_test_with_multi_import_diff_ctx(
 }
 
 int test_buffer_common(cl_device_id device_, cl_context context_,
-                       cl_command_queue queue_, int numElements_)
+                       cl_command_queue queue_, int numElements_,
+                       float use_fence)
 {
 
     int current_device = 0;
@@ -1738,26 +1893,26 @@ int test_buffer_common(cl_device_id device_, cl_context context_,
             {
                 errNum = run_test_with_multi_import_same_ctx(
                     context, cmd_queue1, kernel, verify_kernel, vkDevice,
-                    numBuffers, bufferSize, bufferSizeForOffset);
+                    numBuffers, bufferSize, bufferSizeForOffset, use_fence);
             }
             else if (multiImport && multiCtx)
             {
                 errNum = run_test_with_multi_import_diff_ctx(
                     context, context2, cmd_queue1, cmd_queue3, kernel, kernel2,
                     verify_kernel, verify_kernel2, vkDevice, numBuffers,
-                    bufferSize, bufferSizeForOffset);
+                    bufferSize, bufferSizeForOffset, use_fence);
             }
             else if (numCQ == 2)
             {
                 errNum = run_test_with_two_queue(
                     context, cmd_queue1, cmd_queue2, kernel, verify_kernel,
-                    vkDevice, numBuffers + 1, bufferSize);
+                    vkDevice, numBuffers + 1, bufferSize, use_fence);
             }
             else
             {
-                errNum = run_test_with_one_queue(context, cmd_queue1, kernel,
-                                                 verify_kernel, vkDevice,
-                                                 numBuffers, bufferSize);
+                errNum = run_test_with_one_queue(
+                    context, cmd_queue1, kernel, verify_kernel, vkDevice,
+                    numBuffers, bufferSize, use_fence);
             }
             if (errNum != CL_SUCCESS)
             {
diff --git a/test_conformance/workgroups/test_wg_all.cpp b/test_conformance/workgroups/test_wg_all.cpp
index 41abd1249..f9b574e45 100644
--- a/test_conformance/workgroups/test_wg_all.cpp
+++ b/test_conformance/workgroups/test_wg_all.cpp
@@ -75,7 +75,6 @@ test_work_group_all(cl_device_id device, cl_context context, cl_command_queue qu
     size_t       wg_size[1];
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -110,7 +109,7 @@ test_work_group_all(cl_device_id device, cl_context context, cl_command_queue qu
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<(num_elements+1); i++)
+    for (size_t i = 0; i < (num_elements + 1); i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
diff --git a/test_conformance/workgroups/test_wg_any.cpp b/test_conformance/workgroups/test_wg_any.cpp
index e0242cfb4..f7ff899a3 100644
--- a/test_conformance/workgroups/test_wg_any.cpp
+++ b/test_conformance/workgroups/test_wg_any.cpp
@@ -75,7 +75,6 @@ test_work_group_any(cl_device_id device, cl_context context, cl_command_queue qu
     size_t       wg_size[1];
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -110,7 +109,7 @@ test_work_group_any(cl_device_id device, cl_context context, cl_command_queue qu
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<(num_elements+1); i++)
+    for (size_t i = 0; i < (num_elements + 1); i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
diff --git a/test_conformance/workgroups/test_wg_broadcast.cpp b/test_conformance/workgroups/test_wg_broadcast.cpp
index e24ac7b98..a4cb0c6fe 100644
--- a/test_conformance/workgroups/test_wg_broadcast.cpp
+++ b/test_conformance/workgroups/test_wg_broadcast.cpp
@@ -70,7 +70,7 @@ verify_wg_broadcast_1D(float *inptr, float *outptr, size_t n, size_t wg_size)
 
     for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
     {
-        int local_size = (n-i) > wg_size ? wg_size : (n-i);
+        size_t local_size = (n - i) > wg_size ? wg_size : (n - i);
         float broadcast_result = inptr[i + (group_id % local_size)];
         for (j=0; j<local_size; j++)
         {
@@ -172,7 +172,6 @@ test_work_group_broadcast_1D(cl_device_id device, cl_context context, cl_command
     size_t       wg_size[1];
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -207,7 +206,7 @@ test_work_group_broadcast_1D(cl_device_id device, cl_context context, cl_command
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
@@ -278,7 +277,6 @@ test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command
     size_t       num_workgroups;
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -333,7 +331,7 @@ test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
@@ -402,7 +400,6 @@ test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command
     size_t       num_workgroups;
     size_t       num_elements;
     int          err;
-    int          i;
     MTdata       d;
 
     err = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -458,7 +455,7 @@ test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command
 
     p = input_ptr[0];
     d = init_genrand( gRandomSeed );
-    for (i=0; i<num_elements; i++)
+    for (size_t i = 0; i < num_elements; i++)
     {
         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
     }
diff --git a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
index 648e68ce1..a31fca63f 100644
--- a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
+++ b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
@@ -219,10 +219,8 @@ int do_test(cl_device_id device, cl_context context, cl_command_queue queue,
 int do_test_work_group_suggested_local_size(
     cl_device_id device, cl_context context, cl_command_queue queue,
     bool (*skip_cond)(size_t), size_t start, size_t end, size_t incr,
-    cl_long max_local_mem_size, size_t global_work_offset[], num_dims dim)
+    cl_ulong max_local_mem_size, size_t global_work_offset[], num_dims dim)
 {
-    clProgramWrapper scan_program;
-    clKernelWrapper scan_kernel;
     int err;
     size_t test_values[] = { 1, 1, 1 };
     std::string kernel_names[6] = {
@@ -244,6 +242,8 @@ int do_test_work_group_suggested_local_size(
     for (int kernel_num = 0; kernel_num < 6; kernel_num++)
     {
         if (max_local_mem_size < local_mem_size[kernel_num]) continue;
+        clProgramWrapper scan_program;
+        clKernelWrapper scan_kernel;
         // Create the kernel
         err = create_single_kernel_helper(
             context, &scan_program, &scan_kernel, 1,
@@ -300,7 +300,7 @@ int test_work_group_suggested_local_size_1D(cl_device_id device,
                  "Skipping the test.\n");
         return TEST_SKIPPED_ITSELF;
     }
-    cl_long max_local_mem_size;
+    cl_ulong max_local_mem_size;
     cl_int err =
         clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
                         sizeof(max_local_mem_size), &max_local_mem_size, NULL);