diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index f0fca7b4f..d53911e43 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -9,8 +9,10 @@ set(${MODULE_NAME}_SOURCES
     binary_i_half.cpp
     binary_operator_double.cpp
     binary_operator_float.cpp
+    binary_operator_half.cpp
     binary_two_results_i_double.cpp
     binary_two_results_i_float.cpp
+    binary_two_results_i_half.cpp
     common.cpp
     common.h
     function_list.cpp
diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
index dffd7d095..4b495c953 100644
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -27,7 +27,6 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -40,7 +39,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 // Thread specific data for a worker thread
 struct ThreadInfo
 {
@@ -58,7 +56,6 @@ struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -76,7 +73,6 @@ struct TestInfoBase
     int isNextafter;
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -92,158 +88,24 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
 // A table of more difficult cases to get right
-static const cl_half specialValuesHalf[] = {
-    0xffff,
-    0x0000,
-    0x0001,
-    0x7c00 /*INFINITY*/,
-    0xfc00 /*-INFINITY*/,
-    0x8000 /*-0*/,
-    0x7bff /*HALF_MAX*/,
-    0x0400 /*HALF_MIN*/
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
 };
 
-////////////////////////////////////////////////////////////////////////////////
-static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-////////////////////////////////////////////////////////////////////////////////
-int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
-                                   bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->half_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
-    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
-    test_info.isNextafter = isNextafter;
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
-    }
+size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
 
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-    return error;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -254,7 +116,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     fptr func = job->f->func;
     int ftz = job->ftz;
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
 
@@ -264,6 +125,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     cl_ushort *t;
     cl_half *r;
     std::vector<float> s(0), s2(0);
+    cl_uint j = 0;
 
     RoundingMode oldRoundMode;
     cl_int copysign_test = 0;
@@ -352,12 +214,13 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_size);
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
         {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             return error;
         }
 
@@ -425,24 +288,24 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     s.resize(buffer_elements);
     s2.resize(buffer_elements);
     for (j = 0; j < buffer_elements; j++)
-        for (j = 0; j < buffer_elements; j++)
-        {
-            s[j] = cl_half_to_float(p[j]);
-            s2[j] = cl_half_to_float(p2[j]);
-            if (isNextafter)
-                r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]),
-                                          CL_HALF_RTE);
-            else
-                r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE);
-        }
+    {
+        s[j] = cl_half_to_float(p[j]);
+        s2[j] = cl_half_to_float(p2[j]);
+        if (isNextafter)
+            r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]),
+                                      CL_HALF_RTE);
+        else
+            r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE);
+    }
 
     if (isFDim && ftz) RestoreFPState(&oldMode);
     // Read the data back -- no need to wait for the first N-1 buffers. This is
     // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ushort *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -452,21 +315,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_ushort *)clEnqueueMapBuffer(
-        tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size,
-        0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
     // Verify data
 
     for (j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ushort *q = out[k];
 
@@ -498,8 +351,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if (IsHalfSubnormal(
-                            cl_half_from_float(correct, CL_HALF_RTE)))
+                    if (IsHalfResultSubnormal(correct, ulps))
                     {
                         if (isNextafter)
                         {
@@ -549,10 +401,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if (IsHalfSubnormal(
-                                cl_half_from_float(correct2, CL_HALF_RTE))
-                            || IsHalfSubnormal(
-                                cl_half_from_float(correct3, CL_HALF_RTE)))
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
                         {
                             if (fail && isNextafter)
                             {
@@ -631,14 +481,10 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                             if (fabsf(err5) < fabsf(err)) err = err5;
 
                             // retry per section 6.5.3.4
-                            if (IsHalfSubnormal(
-                                    cl_half_from_float(correct2, CL_HALF_RTE))
-                                || IsHalfSubnormal(
-                                    cl_half_from_float(correct3, CL_HALF_RTE))
-                                || IsHalfSubnormal(
-                                    cl_half_from_float(correct4, CL_HALF_RTE))
-                                || IsHalfSubnormal(
-                                    cl_half_from_float(correct5, CL_HALF_RTE)))
+                            if (IsHalfResultSubnormal(correct2, ulps)
+                                || IsHalfResultSubnormal(correct3, ulps)
+                                || IsHalfResultSubnormal(correct4, ulps)
+                                || IsHalfResultSubnormal(correct5, ulps))
                             {
                                 fail = fail && (test != 0.0f);
                                 if (!fail) err = 0.0f;
@@ -693,10 +539,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         check_error();
 
                         // retry per section 6.5.3.4
-                        if (IsHalfSubnormal(
-                                cl_half_from_float(correct2, CL_HALF_RTE))
-                            || IsHalfSubnormal(
-                                cl_half_from_float(correct3, CL_HALF_RTE)))
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
                         {
                             if (fail && isNextafter)
                             {
@@ -731,9 +575,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 }
                 if (fail)
                 {
-                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), "
-                               "%a (0x%0.4x)}\nExpected: %a  (half 0x%0.4x) "
-                               "\nActual: %a (half 0x%0.4x) at index: %d\n",
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
+                               "%a (0x%04x)}\nExpected: %a  (half 0x%04x) "
+                               "\nActual: %a (half 0x%04x) at index: %zu\n",
                                name, sizeNames[k], err, s[j], p[j], s2[j],
                                p2[j], cl_half_to_float(r[j]), r[j], test, q[j],
                                j);
@@ -778,13 +622,143 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     return error;
 }
 
-////////////////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
+int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
+                                   bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
+    test_info.isNextafter = isNextafter;
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error(
+                    "Error: Unable to create sub-buffer of gOutBuffer[%d] "
+                    "for region {%zd, %zd}\n",
+                    (int)j, region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return error;
+}
+
 int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode)
 {
     return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode);
diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp
index 571683e5d..dcfd28551 100644
--- a/test_conformance/math_brute_force/binary_i_half.cpp
+++ b/test_conformance/math_brute_force/binary_i_half.cpp
@@ -24,7 +24,6 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -37,7 +36,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -54,7 +52,6 @@ typedef struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 } ThreadInfo;
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -68,7 +65,6 @@ struct TestInfoBase
     int ftz; // non-zero if running in flush to zero mode
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -84,168 +80,29 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 // A table of more difficult cases to get right
-static const cl_half specialValuesHalf[] = {
-    0xffff,
-    0x0000,
-    0x0001,
-    0x7c00 /*INFINITY*/,
-    0xfc00 /*-INFINITY*/,
-    0x8000 /*-0*/,
-    0x7bff /*HALF_MAX*/,
-    0x0400 /*HALF_MIN*/
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
 };
 
-static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
 
-static const int specialValuesInt3[] = { 0,       1,     2,      3,
-                                         1022,    1023,  1024,   INT_MIN,
-                                         INT_MAX, -1,    -2,     -3,
-                                         -1022,   -1023, -11024, -INT_MAX };
-static size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3);
+const int specialValuesInt3[] = { 0,     1,       2,       3,       1022, 1023,
+                                  1024,  INT_MIN, INT_MAX, -1,      -2,   -3,
+                                  -1022, -1023,   -11024,  -INT_MAX };
+size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3);
 
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    cl_int maxErrorVal2 = 0;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->half_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-        cl_buffer_region region2 = { i * test_info.subBufferSize
-                                         * sizeof(cl_int),
-                                     test_info.subBufferSize * sizeof(cl_int) };
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-
-    // Accumulate the arithmetic errors
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        if (test_info.tinfo[i].maxError > maxError)
-        {
-            maxError = test_info.tinfo[i].maxError;
-            maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-        }
-    }
-
-    test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-    return error;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -348,7 +205,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_elements * sizeof(cl_half));
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
@@ -404,7 +261,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     for (j = 0; j < buffer_elements; j++)
     {
         s[j] = cl_half_to_float(p[j]);
-        r[j] = cl_half_from_float(func.f_fi(s[j], s2[j]), CL_HALF_RTE);
+        r[j] = HFF(func.f_fi(s[j], s2[j]));
     }
 
     // Read the data back -- no need to wait for the first N-1 buffers. This is
@@ -450,8 +307,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if (IsHalfSubnormal(
-                            cl_half_from_float(correct, CL_HALF_RTE)))
+                    if (IsHalfResultSubnormal(correct, ulps))
                     {
                         fail = fail && (test != 0.0f);
                         if (!fail) err = 0.0f;
@@ -473,10 +329,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if (IsHalfSubnormal(
-                                cl_half_from_float(correct2, CL_HALF_RTE))
-                            || IsHalfSubnormal(
-                                cl_half_from_float(correct3, CL_HALF_RTE)))
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
                         {
                             fail = fail && (test != 0.0f);
                             if (!fail) err = 0.0f;
@@ -492,9 +346,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 }
                 if (fail)
                 {
-                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), "
-                               "%d}\nExpected: %a (half 0x%0.4x) \nActual: %a "
-                               "(half 0x%0.4x) at index: %d\n",
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
+                               "%d}\nExpected: %a (half 0x%04x) \nActual: %a "
+                               "(half 0x%04x) at index: %d\n",
                                name, sizeNames[k], err, s[j], p[j], s2[j],
                                cl_half_to_float(r[j]), r[j], test, q[j],
                                (cl_uint)j);
@@ -535,3 +389,139 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     }
     return error;
 }
+
+} // anonymous namespace
+
+int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+
+    // Accumulate the arithmetic errors
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        if (test_info.tinfo[i].maxError > maxError)
+        {
+            maxError = test_info.tinfo[i].maxError;
+            maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+        }
+    }
+
+    test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp
new file mode 100644
index 000000000..2d3196474
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_operator_half.cpp
@@ -0,0 +1,663 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    BuildKernelInfo &info = *(BuildKernelInfo *)p;
+    auto generator = [](const std::string &kernel_name, const char *builtin,
+                        cl_uint vector_size_index) {
+        return GetBinaryKernel(kernel_name, builtin, ParameterType::Half,
+                               ParameterType::Half, ParameterType::Half,
+                               vector_size_index);
+    };
+    return BuildKernels(info, job_id, generator);
+}
+
+// Thread specific data for a worker thread
+struct ThreadInfo
+{
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    clMemWrapper inBuf2;
+    Buffers outBuf;
+
+    // max error value. Init to 0.
+    float maxError;
+    // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue;
+    // position of the max error value (param 2).  Init to 0.
+    double maxErrorValue2;
+    MTdataHolder d;
+
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+};
+
+struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+
+    // Programs for various vector sizes.
+    Programs programs;
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    // no special fields
+};
+
+// A table of more difficult cases to get right
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+};
+
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    TestInfo *job = (TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_int error;
+
+    const char *name = job->f->name;
+    cl_half *r = 0;
+    std::vector<float> s(0), s2(0);
+    RoundingMode oldRoundMode;
+
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_half *out[VECTOR_SIZE_COUNT];
+
+    // start the map of the output arrays
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    bool divide = strcmp(name, "divide") == 0;
+
+    // Init input array
+    cl_half *p = (cl_half *)gIn + thread_id * buffer_elements;
+    cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements;
+    cl_uint idx = 0;
+    int totalSpecialValueCount =
+        specialValuesHalfCount * specialValuesHalfCount;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
+    {
+        // Insert special values
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesHalfCount;
+        y = (job_id * buffer_elements) / specialValuesHalfCount;
+
+        for (; idx < buffer_elements; idx++)
+        {
+            p[idx] = specialValuesHalf[x];
+            p2[idx] = specialValuesHalf[y];
+            if (++x >= specialValuesHalfCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesHalfCount) break;
+            }
+
+            if (divide)
+            {
+                cl_half pj = p[idx] & 0x7fff;
+                cl_half p2j = p2[idx] & 0x7fff;
+                // Replace values outside [2^-7, 2^7] with QNaN
+                if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN
+                if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00;
+            }
+        }
+    }
+
+    // Init any remaining values
+    for (; idx < buffer_elements; idx++)
+    {
+        p[idx] = (cl_half)genrand_int32(d);
+        p2[idx] = (cl_half)genrand_int32(d);
+
+        if (divide)
+        {
+            cl_half pj = p[idx] & 0x7fff;
+            cl_half p2j = p2[idx] & 0x7fff;
+            // Replace values outside [2^-7, 2^7] with QNaN
+            if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN
+            if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00;
+        }
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            return error;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            return error;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xACDCACDC;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            return error;
+        }
+
+        // Run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting)
+    {
+        return CL_SUCCESS;
+    }
+
+    // Calculate the correctly rounded reference result
+    FPU_mode_type oldMode;
+    memset(&oldMode, 0, sizeof(oldMode));
+    if (ftz) ForceFTZ(&oldMode);
+
+    // Set the rounding mode to match the device
+    oldRoundMode = kRoundToNearestEven;
+    if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
+
+    // Calculate the correctly rounded reference result
+    r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
+    s.resize(buffer_elements);
+    s2.resize(buffer_elements);
+
+    for (size_t j = 0; j < buffer_elements; j++)
+    {
+        s[j] = HTF(p[j]);
+        s2[j] = HTF(p2[j]);
+        r[j] = HFF(func.f_ff(s[j], s2[j]));
+    }
+
+    if (ftz) RestoreFPState(&oldMode);
+
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Verify data
+
+    for (size_t j = 0; j < buffer_elements; j++)
+    {
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_half *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (r[j] != q[j])
+            {
+                float test = HTF(q[j]);
+                float correct = func.f_ff(s[j], s2[j]);
+
+                // Per section 10 paragraph 6, accept any result if an input or
+                // output is a infinity or NaN or overflow
+                if (!gInfNanSupport)
+                {
+                    // Note: no double rounding here.  Reference functions
+                    // calculate in single precision.
+                    if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                        || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
+                        || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
+                        continue;
+                }
+
+                float err = Ulp_Error_Half(q[j], correct);
+
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsHalfResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+
+                        correct2 = HTF(func.f_ff(0.0, s2[j]));
+                        correct3 = HTF(func.f_ff(-0.0, s2[j]));
+
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2)
+                                || IsFloatInfinity(correct3)
+                                || IsFloatNaN(correct3))
+                                continue;
+                        }
+
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            double correct4, correct5;
+                            float err4, err5;
+
+                            correct2 = HTF(func.f_ff(0.0, 0.0));
+                            correct3 = HTF(func.f_ff(-0.0, 0.0));
+                            correct4 = HTF(func.f_ff(0.0, -0.0));
+                            correct5 = HTF(func.f_ff(-0.0, -0.0));
+
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (!gInfNanSupport)
+                            {
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3)
+                                    || IsFloatInfinity(correct4)
+                                    || IsFloatNaN(correct4)
+                                    || IsFloatInfinity(correct5)
+                                    || IsFloatNaN(correct5))
+                                    continue;
+                            }
+
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            err4 = Ulp_Error_Half(q[j], correct4);
+                            err5 = Ulp_Error_Half(q[j], correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfResultSubnormal(correct2, ulps)
+                                || IsHalfResultSubnormal(correct3, ulps)
+                                || IsHalfResultSubnormal(correct4, ulps)
+                                || IsHalfResultSubnormal(correct5, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+
+                        correct2 = HTF(func.f_ff(s[j], 0.0));
+                        correct3 = HTF(func.f_ff(s[j], -0.0));
+
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                                || IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2))
+                                continue;
+                        }
+
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
+                               "%a (0x%04x)}\nExpected: %a  (half 0x%04x) "
+                               "\nActual: %a (half 0x%04x) at index: %zu\n",
+                               name, sizeNames[k], err, s[j], p[j], s2[j],
+                               p2[j], HTF(r[j]), r[j], test, q[j], j);
+                    return -1;
+                }
+            }
+        }
+    }
+
+    if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return CL_SUCCESS;
+}
+
+} // anonymous namespace
+
+int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info{};
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_two_results_i_half.cpp b/test_conformance/math_brute_force/binary_two_results_i_half.cpp
new file mode 100644
index 000000000..3900e62d5
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp
@@ -0,0 +1,485 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <climits>
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    BuildKernelInfo &info = *(BuildKernelInfo *)p;
+    auto generator = [](const std::string &kernel_name, const char *builtin,
+                        cl_uint vector_size_index) {
+        return GetBinaryKernel(kernel_name, builtin, ParameterType::Half,
+                               ParameterType::Int, ParameterType::Half,
+                               ParameterType::Half, vector_size_index);
+    };
+    return BuildKernels(info, job_id, generator);
+}
+
+struct ComputeReferenceInfoF
+{
+    const cl_half *x;
+    const cl_half *y;
+    cl_half *r;
+    int32_t *i;
+    double (*f_ffpI)(double, double, int *);
+    cl_uint lim;
+    cl_uint count;
+};
+
+cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
+{
+    ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
+    cl_uint lim = cri->lim;
+    cl_uint count = cri->count;
+    cl_uint off = jid * count;
+    const cl_half *x = cri->x + off;
+    const cl_half *y = cri->y + off;
+    cl_half *r = cri->r + off;
+    int32_t *i = cri->i + off;
+    double (*f)(double, double, int *) = cri->f_ffpI;
+
+    if (off + count > lim) count = lim - off;
+
+    for (cl_uint j = 0; j < count; ++j)
+        r[j] = HFF((float)f((double)HTF(x[j]), (double)HTF(y[j]), i + j));
+
+    return CL_SUCCESS;
+}
+
+} // anonymous namespace
+
+int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    int64_t maxError2 = 0;
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    // use larger type of output data to prevent overflowing buffer size
+    constexpr size_t buffer_size = BUFFER_SIZE / sizeof(int32_t);
+
+    cl_uint threadCount = GetThreadCount();
+
+    float half_ulps = f->half_ulps;
+
+    int testingRemquo = !strcmp(f->name, "remquo");
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_half *p = (cl_half *)gIn;
+        cl_half *p2 = (cl_half *)gIn2;
+        for (size_t j = 0; j < buffer_size; j++)
+        {
+            p[j] = (cl_half)genrand_int32(d);
+            p2[j] = (cl_half)genrand_int32(d);
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          buffer_size * sizeof(cl_half), gIn, 0,
+                                          NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          buffer_size * sizeof(cl_half), gIn2,
+                                          0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+
+                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut2[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            // align working group size with the bigger output type
+            size_t vectorSize = sizeValues[j] * sizeof(int32_t);
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error =
+                     clSetKernelArg(kernels[j][thread_id], 1,
+                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoF cri;
+            cri.x = p;
+            cri.y = p2;
+            cri.r = (cl_half *)gOut_Ref;
+            cri.i = (int32_t *)gOut_Ref2;
+            cri.f_ffpI = f->func.f_ffpI;
+            cri.lim = buffer_size;
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceF, threadCount, &cri);
+        }
+        else
+        {
+            cl_half *r = (cl_half *)gOut_Ref;
+            int32_t *r2 = (int32_t *)gOut_Ref2;
+            for (size_t j = 0; j < buffer_size; j++)
+                r[j] =
+                    HFF((float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), r2 + j));
+        }
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            cl_bool blocking =
+                (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0,
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0,
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        cl_half *t = (cl_half *)gOut_Ref;
+        int32_t *t2 = (int32_t *)gOut_Ref2;
+        for (size_t j = 0; j < buffer_size; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_half *q = (cl_half *)(gOut[k]);
+                int32_t *q2 = (int32_t *)gOut2[k];
+
+                // Check for exact match to correctly rounded result
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
+
+                // Check for paired NaNs
+                if (IsHalfNaN(t[j]) && IsHalfNaN(q[j]) && t2[j] == q2[j])
+                    continue;
+
+                cl_half test = ((cl_half *)q)[j];
+                int correct2 = INT_MIN;
+                float correct =
+                    (float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), &correct2);
+                float err = Ulp_Error_Half(test, correct);
+                int64_t iErr;
+
+                // in case of remquo, we only care about the sign and last
+                // seven bits of integer as per the spec.
+                if (testingRemquo)
+                    iErr = (long long)(q2[j] & 0x0000007f)
+                        - (long long)(correct2 & 0x0000007f);
+                else
+                    iErr = (long long)q2[j] - (long long)correct2;
+
+                // For remquo, if y = 0, x is infinite, or either is NaN
+                // then the standard either neglects to say what is returned
+                // in iptr or leaves it undefined or implementation defined.
+                int iptrUndefined = IsHalfInfinity(p[j]) || (HTF(p2[j]) == 0.0f)
+                    || IsHalfNaN(p2[j]) || IsHalfNaN(p[j]);
+                if (iptrUndefined) iErr = 0;
+
+                int fail = !(fabsf(err) <= half_ulps && iErr == 0);
+                if (ftz && fail)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsHalfResultSubnormal(correct, half_ulps))
+                    {
+                        fail = fail && !(test == 0.0f && iErr == 0);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        int correct3i, correct4i;
+                        float correct3 =
+                            (float)f->func.f_ffpI(0.0, HTF(p2[j]), &correct3i);
+                        float correct4 =
+                            (float)f->func.f_ffpI(-0.0, HTF(p2[j]), &correct4i);
+                        float err2 = Ulp_Error_Half(test, correct3);
+                        float err3 = Ulp_Error_Half(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= half_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= half_ulps && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, half_ulps)
+                            || IsHalfResultSubnormal(correct3, half_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            int correct7i, correct8i;
+                            correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
+                            correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
+                            double correct7 =
+                                f->func.f_ffpI(0.0, -0.0, &correct7i);
+                            double correct8 =
+                                f->func.f_ffpI(-0.0, -0.0, &correct8i);
+                            err2 = Ulp_Error_Half(test, correct3);
+                            err3 = Ulp_Error_Half(test, correct4);
+                            float err4 = Ulp_Error_Half(test, correct7);
+                            float err5 = Ulp_Error_Half(test, correct8);
+                            iErr3 = (long long)q2[j] - (long long)correct3i;
+                            iErr4 = (long long)q2[j] - (long long)correct4i;
+                            int64_t iErr7 =
+                                (long long)q2[j] - (long long)correct7i;
+                            int64_t iErr8 =
+                                (long long)q2[j] - (long long)correct8i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= half_ulps && iErr3 == 0))
+                                    && (!(fabsf(err3) <= half_ulps
+                                          && iErr4 == 0))
+                                    && (!(fabsf(err4) <= half_ulps
+                                          && iErr7 == 0))
+                                    && (!(fabsf(err5) <= half_ulps
+                                          && iErr8 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfResultSubnormal(correct3, half_ulps)
+                                || IsHalfResultSubnormal(correct4, half_ulps)
+                                || IsHalfResultSubnormal(correct7, half_ulps)
+                                || IsHalfResultSubnormal(correct8, half_ulps))
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0
+                                             || iErr7 == 0 || iErr8 == 0));
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        int correct3i, correct4i;
+                        double correct3 =
+                            f->func.f_ffpI(HTF(p[j]), 0.0, &correct3i);
+                        double correct4 =
+                            f->func.f_ffpI(HTF(p[j]), -0.0, &correct4i);
+                        float err2 = Ulp_Error_Half(test, correct3);
+                        float err3 = Ulp_Error_Half(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= half_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= half_ulps && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, half_ulps)
+                            || IsHalfResultSubnormal(correct3, half_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+                if (fabsf(err) > maxError)
+                {
+                    maxError = fabsf(err);
+                    maxErrorVal = HTF(p[j]);
+                }
+                if (llabs(iErr) > maxError2)
+                {
+                    maxError2 = llabs(iErr);
+                    maxErrorVal2 = HTF(p[j]);
+                }
+
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: {%f, %" PRId64
+                               "} ulp error at {%a, %a} "
+                               "({0x%04x, 0x%04x}): *{%a, %d} ({0x%04x, "
+                               "0x%8.8x}) vs. {%a, %d} ({0x%04x, 0x%8.8x})\n",
+                               f->name, sizeNames[k], err, iErr, HTF(p[j]),
+                               HTF(p2[j]), p[j], p2[j], HTF(t[j]), t2[j], t[j],
+                               t2[j], HTF(test), q2[j], test, q2[j]);
+                    return -1;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index 67ed0d8ac..b2f3de82e 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -164,7 +164,7 @@ static constexpr vtbl _binary_operator = {
     "binaryOperator",
     TestFunc_Float_Float_Float_Operator,
     TestFunc_Double_Double_Double_Operator,
-    NULL,
+    TestFunc_Half_Half_Half_Operator,
 };
 
 static constexpr vtbl _binary_i = {
@@ -206,7 +206,7 @@ static constexpr vtbl _binary_two_results_i = {
     "binary_two_results_i",
     TestFunc_FloatI_Float_Float,
     TestFunc_DoubleI_Double_Double,
-    NULL,
+    TestFunc_HalfI_Half_Half,
 };
 
 static constexpr vtbl _mad_tbl = {
diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp
index c78c03a49..ada2aa89a 100644
--- a/test_conformance/math_brute_force/i_unary_half.cpp
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -23,7 +23,8 @@
 #include <memory>
 #include <cinttypes>
 
-////////////////////////////////////////////////////////////////////////////////
+namespace {
+
 static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                  void *p)
 {
@@ -36,7 +37,8 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
 int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
@@ -174,7 +176,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 
                     uint32_t err = t[j] - q[j];
                     if (q[j] > t[j]) err = q[j] - t[j];
-                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%0.4x): "
+                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%04x): "
                                "*%d vs. %d\n",
                                f->name, sizeNames[k], err, s[j], p[j], t[j],
                                q[j]);
diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp
index 8af034c43..6157a9ebb 100644
--- a/test_conformance/math_brute_force/macro_binary_half.cpp
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -21,10 +21,8 @@
 
 #include <cstring>
 
-
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -37,7 +35,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 struct ThreadInfo
 {
     clMemWrapper inBuf; // input buffer for the thread
@@ -48,7 +45,6 @@ struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -61,7 +57,6 @@ struct TestInfoBase
     int ftz; // non-zero if running in flush to zero mode
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -77,139 +72,24 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
 // A table of more difficult cases to get right
-static const cl_half specialValuesHalf[] = {
-    0xffff,
-    0x0000,
-    0x0001,
-    0x7c00 /*INFINITY*/,
-    0xfc00 /*-INFINITY*/,
-    0x8000 /*-0*/,
-    0x7bff /*HALF_MAX*/,
-    0x0400 /*HALF_MIN*/
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
 };
 
-////////////////////////////////////////////////////////////////////////////////
-static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-////////////////////////////////////////////////////////////////////////////////
-int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-        test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    vlog("\n");
-
-    return error;
-}
+size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
 
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -310,7 +190,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_size);
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
@@ -370,7 +250,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         r[j] = (short)func.i_ff(s[j], s2[j]);
     }
 
-
     // Read the data back -- no need to wait for the first N-1 buffers. This is
     // an in order queue.
     for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
@@ -437,8 +316,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
             short err = t[j] - q[j];
             if (q[j] > t[j]) err = q[j] - t[j];
             vlog_error(
-                "\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a "
-                "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x (index: %d)\n",
+                "\nERROR: %s: %d ulp error at {%a (0x%04x), %a "
+                "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x (index: %d)\n",
                 name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j);
             error = -1;
             return error;
@@ -484,8 +363,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
                 cl_ushort err = -t[j] - q[j];
                 if (q[j] > -t[j]) err = q[j] + t[j];
-                vlog_error("\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a "
-                           "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x "
+                vlog_error("\nERROR: %s: %d ulp error at {%a (0x%04x), %a "
+                           "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x "
                            "(index: %d)\n",
                            name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j);
                 error = -1;
@@ -526,3 +405,116 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     return error;
 }
+
+} // anonymous namespace
+
+int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp
index 755b772cd..ae359b3e5 100644
--- a/test_conformance/math_brute_force/macro_unary_half.cpp
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -23,7 +23,6 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -35,7 +34,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 // Thread specific data for a worker thread
 struct ThreadInfo
 {
@@ -45,7 +43,6 @@ struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -57,7 +54,6 @@ struct TestInfoBase
     int ftz; // non-zero if running in flush to zero mode
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -73,114 +69,7 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-////////////////////////////////////////////////////////////////////////////////
-int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount =
-            std::max((cl_uint)1,
-                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
-    }
-
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-        test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    vlog("\n");
-
-    return error;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -246,7 +135,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_size);
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
@@ -353,7 +242,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
             short err = t[j] - q[j];
             if (q[j] > t[j]) err = q[j] - t[j];
-            vlog_error("\nERROR: %s: %d ulp error at %a (0x%0.4x)\nExpected: "
+            vlog_error("\nERROR: %s: %d ulp error at %a (0x%04x)\nExpected: "
                        "%d vs. %d\n",
                        name, err, s[j], p[j], t[j], q[j]);
             error = -1;
@@ -381,7 +270,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 short err = -t[j] - q[j];
                 if (q[j] > -t[j]) err = q[j] + t[j];
                 vlog_error("\nERROR: %s%s: %d ulp error at %a "
-                           "(0x%0.4x)\nExpected: %d \nActual: %d\n",
+                           "(0x%04x)\nExpected: %d \nActual: %d\n",
                            name, sizeNames[k], err, s[j], p[j], -t[j], q[j]);
                 error = -1;
                 return error;
@@ -419,3 +308,105 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     }
     return error;
 }
+
+} // anonymous namespace
+
+int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount =
+            std::max((cl_uint)1,
+                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp
index ef6f2b776..5cb73d4b1 100644
--- a/test_conformance/math_brute_force/mad_half.cpp
+++ b/test_conformance/math_brute_force/mad_half.cpp
@@ -21,7 +21,8 @@
 
 #include <cstring>
 
-////////////////////////////////////////////////////////////////////////////////
+namespace {
+
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -34,7 +35,8 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
 int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
@@ -42,7 +44,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
     KernelMatrix kernels;
     const unsigned thread_id = 0; // Test is currently not multithreaded.
     float maxError = 0.0f;
-    //    int ftz = f->ftz || gForceFTZ;
+
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
@@ -96,7 +98,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
         // write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint16_t pattern = 0xdead;
+            uint32_t pattern = 0xACDCACDC;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp
index 3739199ac..93dc612f7 100644
--- a/test_conformance/math_brute_force/ternary_half.cpp
+++ b/test_conformance/math_brute_force/ternary_half.cpp
@@ -41,14 +41,17 @@ cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 // A table of more difficult cases to get right
 static const cl_half specialValuesHalf[] = {
-    0xffff,
-    0x0000,
-    0x0001,
-    0x7c00 /*INFINITY*/,
-    0xfc00 /*-INFINITY*/,
-    0x8000 /*-0*/,
-    0x7bff /*HALF_MAX*/,
-    0x0400 /*HALF_MIN*/
+    0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
 };
 
 constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
@@ -78,8 +81,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
     logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
 
     // Init the kernels
-    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
-                                relaxedMode };
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
     if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
@@ -294,7 +296,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
                         test != correct ? Ulp_Error_Half(test, ref1) : 0.f;
                     fail = !(fabsf(err) <= half_ulps);
 
-                    if (fail && (ftz || relaxedMode))
+                    if (fail && ftz)
                     {
                         // retry per section 6.5.3.2  with flushing on
                         if (0.0f == test
diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h
index 16f57013c..16b361d53 100644
--- a/test_conformance/math_brute_force/test_functions.h
+++ b/test_conformance/math_brute_force/test_functions.h
@@ -87,6 +87,9 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata,
 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata,
                                            bool relaxedMode);
 
+// half op half
+int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, int)
 int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode);
 
@@ -135,6 +138,9 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode);
 // double foo(double, double, int*)
 int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half, half, int*)
+int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode);
+
 // Special handling for mad.
 // float mad(float, float, float)
 int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp
index 5b0eab4c6..f6e914c8a 100644
--- a/test_conformance/math_brute_force/unary_half.cpp
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -23,7 +23,6 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -35,7 +34,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -47,7 +45,6 @@ typedef struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 } ThreadInfo;
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -64,7 +61,6 @@ struct TestInfoBase
     float half_sin_cos_tan_limit;
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -80,147 +76,7 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-////////////////////////////////////////////////////////////////////////////////
-int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount =
-            std::max((cl_uint)1,
-                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->half_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-    }
-
-    // Check for special cases for unary float
-    test_info.isRangeLimited = 0;
-    test_info.half_sin_cos_tan_limit = 0;
-    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
-    {
-        test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit = 1.0f
-            + test_info.ulps
-                * (FLT_EPSILON / 2.0f); // out of range results from finite
-                                        // inputs must be in [-1,1]
-    }
-    else if (0 == strcmp(f->name, "half_tan"))
-    {
-        test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit =
-            INFINITY; // out of range resut from finite inputs must be numeric
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            }
-        }
-
-        test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-    vlog("\n");
-
-    return error;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -288,7 +144,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_size);
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
@@ -333,12 +189,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Calculate the correctly rounded reference result
     cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
-    cl_ushort *t = (cl_ushort *)r;
     s.resize(buffer_elements);
     for (j = 0; j < buffer_elements; j++)
     {
         s[j] = (float)cl_half_to_float(p[j]);
-        r[j] = cl_half_from_float(func.f_f(s[j]), CL_HALF_RTE);
+        r[j] = HFF(func.f_f(s[j]));
     }
 
     // Read the data back -- no need to wait for the first N-1 buffers. This is
@@ -373,7 +228,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
             cl_ushort *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
+            if (r[j] != q[j])
             {
                 float test = cl_half_to_float(q[j]);
                 double correct = func.f_f(s[j]);
@@ -397,8 +252,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                     if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if (IsHalfSubnormal(
-                                cl_half_from_float(correct, CL_HALF_RTE)))
+                        if (IsHalfResultSubnormal(correct, ulps))
                         {
                             fail = fail && (test != 0.0f);
                             if (!fail) err = 0.0f;
@@ -418,10 +272,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                             if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if (IsHalfSubnormal(
-                                    cl_half_from_float(correct2, CL_HALF_RTE))
-                                || IsHalfSubnormal(
-                                    cl_half_from_float(correct3, CL_HALF_RTE)))
+                            if (IsHalfResultSubnormal(correct2, ulps)
+                                || IsHalfResultSubnormal(correct3, ulps))
                             {
                                 fail = fail && (test != 0.0f);
                                 if (!fail) err = 0.0f;
@@ -437,10 +289,10 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 if (fail)
                 {
                     vlog_error("\nERROR: %s%s: %f ulp error at %a "
-                               "(0x%0.4x)\nExpected: %a (half 0x%0.4x) "
-                               "\nActual: %a (half 0x%0.4x)\n",
+                               "(half 0x%04x)\nExpected: %a (half 0x%04x) "
+                               "\nActual: %a (half 0x%04x)\n",
                                job->f->name, sizeNames[k], err, s[j], p[j],
-                               cl_half_to_float(r[j]), t[j], test, q[j]);
+                               cl_half_to_float(r[j]), r[j], test, q[j]);
                     error = -1;
                     return error;
                 }
@@ -480,3 +332,138 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     return error;
 }
+
+} // anonymous namespace
+
+int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount =
+            std::max((cl_uint)1,
+                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+    }
+
+    // Check for special cases for unary float
+    test_info.isRangeLimited = 0;
+    test_info.half_sin_cos_tan_limit = 0;
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit = 1.0f
+            + test_info.ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
+    }
+    else if (0 == strcmp(f->name, "half_tan"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit =
+            INFINITY; // out of range resut from finite inputs must be numeric
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp
index 3f8d71168..18d4dadd0 100644
--- a/test_conformance/math_brute_force/unary_two_results_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -62,8 +62,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
     float half_ulps = f->half_ulps;
 
     // Init the kernels
-    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
-                                relaxedMode };
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
     if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
@@ -77,22 +76,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
             const unsigned m_size = 0x1ff;
             const unsigned e_size = 0xf;
             const unsigned s_size = 0x2;
-            const unsigned sclamp = 0xffff;
 
             for (size_t j = 0; j < half_buffer_size; j++)
             {
                 unsigned ind = j % (s_size * e_size * m_size);
                 unsigned val = (((ind / (e_size * m_size)) << 15)
                                 | (((ind / m_size) % e_size + 1) << 10)
-                                | (ind % m_size + 1))
-                    & sclamp;
+                                | (ind % m_size + 1));
                 pIn[j] = val;
-
-                if (relaxedMode && strcmp(f->name, "sincos") == 0)
-                {
-                    float pj = HTF(pIn[j]);
-                    if (fabs(pj) > M_PI) pIn[j] = 0x7e00; // HALF_NAN
-                }
             }
         }
 
@@ -106,7 +97,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
         // Write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint32_t pattern = 0xffffdead;
+            uint32_t pattern = 0xacdcacdc;
             if (gHostFill)
             {
                 memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -200,7 +191,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
         {
             // Calculate the correctly rounded reference result
             memset(&oldMode, 0, sizeof(oldMode));
-            if (ftz || relaxedMode) ForceFTZ(&oldMode);
+            if (ftz) ForceFTZ(&oldMode);
 
             // Set the rounding mode to match the device
             if (gIsInRTZMode)
@@ -218,11 +209,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                 double dd;
                 feclearexcept(FE_OVERFLOW);
 
-                if (relaxedMode)
-                    ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd));
-                else
-                    ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
-
+                ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
                 ref2[j] = HFF((float)dd);
                 overflow[j] =
                     FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
@@ -233,11 +220,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
             for (size_t j = 0; j < half_buffer_size; j++)
             {
                 double dd;
-                if (relaxedMode)
-                    ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd));
-                else
-                    ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
-
+                ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
                 ref2[j] = HFF((float)dd);
             }
         }
@@ -283,17 +266,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                     double fp_correct1 = 0, fp_correct2 = 0;
                     float err = 0, err2 = 0;
 
-                    if (relaxedMode)
-                        fp_correct1 = f->rfunc.f_fpf(HTF(pIn[j]), &fp_correct2);
-                    else
-                        fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2);
+                    fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2);
 
                     cl_half correct1 = HFF(fp_correct1);
                     cl_half correct2 = HFF(fp_correct2);
 
                     // Per section 10 paragraph 6, accept any result if an input
                     // or output is a infinity or NaN or overflow
-                    if (relaxedMode || skipNanInf)
+                    if (skipNanInf)
                     {
                         if (skipNanInf && overflow[j]) continue;
                         // Note: no double rounding here.  Reference functions
@@ -304,35 +284,18 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                             continue;
                     }
 
-                    // If we are in fast relaxed math, we
-                    // have a different calculation for the
-                    // subnormal threshold.
-                    typedef int (*CheckForSubnormal)(double, float);
-                    CheckForSubnormal isFloatResultSubnormalPtr;
-                    if (relaxedMode)
-                    {
-                        err = Abs_Error(HTF(test1[j]), fp_correct1);
-                        err2 = Abs_Error(HTF(test2[j]), fp_correct2);
-                        isFloatResultSubnormalPtr =
-                            &IsFloatResultSubnormalAbsError;
-                    }
-                    else
-                    {
-                        err = Ulp_Error_Half(test1[j], fp_correct1);
-                        err2 = Ulp_Error_Half(test2[j], fp_correct2);
-                        isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
-                    }
+                    err = Ulp_Error_Half(test1[j], fp_correct1);
+                    err2 = Ulp_Error_Half(test2[j], fp_correct2);
+
                     int fail =
                         !(fabsf(err) <= half_ulps && fabsf(err2) <= half_ulps);
 
-                    if (ftz || relaxedMode)
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if ((*isFloatResultSubnormalPtr)(fp_correct1,
-                                                         half_ulps))
+                        if (IsHalfResultSubnormal(fp_correct1, half_ulps))
                         {
-                            if ((*isFloatResultSubnormalPtr)(fp_correct2,
-                                                             half_ulps))
+                            if (IsHalfResultSubnormal(fp_correct2, half_ulps))
                             {
                                 fail = fail
                                     && !(HTF(test1[j]) == 0.0f
@@ -351,8 +314,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                                 if (!fail) err = 0.0f;
                             }
                         }
-                        else if ((*isFloatResultSubnormalPtr)(fp_correct2,
-                                                              half_ulps))
+                        else if (IsHalfResultSubnormal(fp_correct2, half_ulps))
                         {
                             fail = fail
                                 && !(HTF(test2[j]) == 0.0f
@@ -369,19 +331,8 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                             float errp, err2p, errn, err2n;
 
                             if (skipNanInf) feclearexcept(FE_OVERFLOW);
-                            if (relaxedMode)
-                            {
-                                fp_correctp =
-                                    f->rfunc.f_fpf(0.0, &fp_correct2p);
-                                fp_correctn =
-                                    f->rfunc.f_fpf(-0.0, &fp_correct2n);
-                            }
-                            else
-                            {
-                                fp_correctp = f->func.f_fpf(0.0, &fp_correct2p);
-                                fp_correctn =
-                                    f->func.f_fpf(-0.0, &fp_correct2n);
-                            }
+                            fp_correctp = f->func.f_fpf(0.0, &fp_correct2p);
+                            fp_correctn = f->func.f_fpf(-0.0, &fp_correct2n);
 
                             cl_half correctp = HFF(fp_correctp);
                             cl_half correctn = HFF(fp_correctn);
@@ -408,20 +359,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                                     continue;
                             }
 
-                            if (relaxedMode)
-                            {
-                                errp = Abs_Error(HTF(test1[j]), fp_correctp);
-                                err2p = Abs_Error(HTF(test1[j]), fp_correct2p);
-                                errn = Abs_Error(HTF(test1[j]), fp_correctn);
-                                err2n = Abs_Error(HTF(test1[j]), fp_correct2n);
-                            }
-                            else
-                            {
-                                errp = Ulp_Error_Half(test1[j], fp_correctp);
-                                err2p = Ulp_Error_Half(test1[j], fp_correct2p);
-                                errn = Ulp_Error_Half(test1[j], fp_correctn);
-                                err2n = Ulp_Error_Half(test1[j], fp_correct2n);
-                            }
+                            errp = Ulp_Error_Half(test1[j], fp_correctp);
+                            err2p = Ulp_Error_Half(test1[j], fp_correct2p);
+                            errn = Ulp_Error_Half(test1[j], fp_correctn);
+                            err2n = Ulp_Error_Half(test1[j], fp_correct2n);
 
                             fail = fail
                                 && ((!(fabsf(errp) <= half_ulps))
@@ -434,15 +375,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                             if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
 
                             // retry per section 6.5.3.4
-                            if ((*isFloatResultSubnormalPtr)(fp_correctp,
-                                                             half_ulps)
-                                || (*isFloatResultSubnormalPtr)(fp_correctn,
-                                                                half_ulps))
+                            if (IsHalfResultSubnormal(fp_correctp, half_ulps)
+                                || IsHalfResultSubnormal(fp_correctn,
+                                                         half_ulps))
                             {
-                                if ((*isFloatResultSubnormalPtr)(fp_correct2p,
-                                                                 half_ulps)
-                                    || (*isFloatResultSubnormalPtr)(
-                                        fp_correct2n, half_ulps))
+                                if (IsHalfResultSubnormal(fp_correct2p,
+                                                          half_ulps)
+                                    || IsHalfResultSubnormal(fp_correct2n,
+                                                             half_ulps))
                                 {
                                     fail = fail
                                         && !(HTF(test1[j]) == 0.0f
@@ -457,10 +397,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                                     if (!fail) err = 0.0f;
                                 }
                             }
-                            else if ((*isFloatResultSubnormalPtr)(fp_correct2p,
-                                                                  half_ulps)
-                                     || (*isFloatResultSubnormalPtr)(
-                                         fp_correct2n, half_ulps))
+                            else if (IsHalfResultSubnormal(fp_correct2p,
+                                                           half_ulps)
+                                     || IsHalfResultSubnormal(fp_correct2n,
+                                                              half_ulps))
                             {
                                 fail = fail
                                     && !(HTF(test2[j]) == 0.0f
diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
index 241377dda..9a769447f 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
@@ -72,8 +72,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
     maxiError = half_ulps == INFINITY ? CL_ULONG_MAX : 0;
 
     // Init the kernels
-    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
-                                relaxedMode };
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
     if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
@@ -88,15 +87,13 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
             const unsigned m_size = 0x1ff;
             const unsigned e_size = 0xf;
             const unsigned s_size = 0x2;
-            const unsigned sclamp = 0xffff;
 
             for (size_t j = 0; j < half_buffer_size; j++)
             {
                 unsigned ind = j % (s_size * e_size * m_size);
                 unsigned val = (((ind / (e_size * m_size)) << 15)
                                 | (((ind / m_size) % e_size + 1) << 10)
-                                | (ind % m_size + 1))
-                    & sclamp;
+                                | (ind % m_size + 1));
                 pIn[j] = val;
             }
         }
@@ -111,7 +108,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
         // Write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint32_t pattern = 0xffffdead;
+            uint32_t pattern = 0xacdcacdc;
             if (gHostFill)
             {
                 memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -161,9 +158,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            // sizeof(cl_half) < sizeof (int32_t)
-            // to prevent overflowing gOut_Ref2 it is necessary to use
-            // bigger type as denominator for buffer size calculation
+            // align working group size with the bigger output type
             size_t vectorSize = sizeValues[j] * sizeof(int32_t);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j][thread_id], 0,
@@ -211,15 +206,17 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
         // Read the data back
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
+            cl_bool blocking =
+                (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
             if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0,
                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 return error;
             }
             if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0,
                                          BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
@@ -251,10 +248,10 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
                     cl_long iErr = (int64_t)test2[j] - (int64_t)correct2;
                     int fail = !(fabsf(err) <= half_ulps
                                  && abs_cl_long(iErr) <= maxiError);
-                    if (ftz || relaxedMode)
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if (IsFloatResultSubnormal(fp_correct, half_ulps))
+                        if (IsHalfResultSubnormal(fp_correct, half_ulps))
                         {
                             fail = fail && !(test == 0.0f && iErr == 0);
                             if (!fail) err = 0.0f;
@@ -294,9 +291,9 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
 
                             // retry per section 6.5.3.4
                             if (fail
-                                && (IsFloatResultSubnormal(correct2, half_ulps)
-                                    || IsFloatResultSubnormal(fp_correct3,
-                                                              half_ulps)))
+                                && (IsHalfResultSubnormal(correct2, half_ulps)
+                                    || IsHalfResultSubnormal(fp_correct3,
+                                                             half_ulps)))
                             {
                                 fail = fail
                                     && !(test == 0.0f
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
index 842e85a9b..e2ff93705 100644
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -23,7 +23,8 @@
 #include <cstring>
 #include <cinttypes>
 
-////////////////////////////////////////////////////////////////////////////////
+namespace {
+
 static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                  void *p)
 {
@@ -36,7 +37,8 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
 int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
@@ -90,7 +92,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
         // write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint16_t pattern = 0xdead;
+            uint32_t pattern = 0xACDCACDC;
             memset_pattern4(gOut[j], &pattern, bufferSize);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
@@ -139,7 +141,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
             if (!strcmp(name, "nan"))
                 r[j] = reference_nanh(p[j]);
             else
-                r[j] = cl_half_from_float(f->func.f_u(p[j]), CL_HALF_RTE);
+                r[j] = HFF(f->func.f_u(p[j]));
         }
         // Read the data back
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
@@ -181,8 +183,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
                         if (ftz)
                         {
                             // retry per section 6.5.3.2
-                            if (IsHalfSubnormal(
-                                    cl_half_from_float(correct, CL_HALF_RTE)))
+                            if (IsHalfResultSubnormal(correct, half_ulps))
                             {
                                 fail = fail && (test != 0.0f);
                                 if (!fail) err = 0.0f;
@@ -197,8 +198,8 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
                     if (fail)
                     {
                         vlog_error(
-                            "\n%s%s: %f ulp error at 0x%0.4x \nExpected: %a "
-                            "(0x%0.4x) \nActual: %a (0x%0.4x)\n",
+                            "\n%s%s: %f ulp error at 0x%04x \nExpected: %a "
+                            "(0x%04x) \nActual: %a (0x%04x)\n",
                             f->name, sizeNames[k], err, p[j],
                             cl_half_to_float(r[j]), r[j], test, q[j]);
                         return -1;
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index d11ce6f36..264fc7a43 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -126,6 +126,12 @@ inline int IsFloatResultSubnormal(double x, float ulps)
     return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
 }
 
+inline int IsHalfResultSubnormal(float x, float ulps)
+{
+    x = fabs(x) - MAKE_HEX_FLOAT(0x1.0p-24, 0x1, -24) * ulps;
+    return x < MAKE_HEX_FLOAT(0x1.0p-14, 0x1, -14);
+}
+
 inline int IsFloatResultSubnormalAbsError(double x, float abs_err)
 {
     x = x - abs_err;