diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt index f0fca7b4f..d53911e43 100644 --- a/test_conformance/math_brute_force/CMakeLists.txt +++ b/test_conformance/math_brute_force/CMakeLists.txt @@ -9,8 +9,10 @@ set(${MODULE_NAME}_SOURCES binary_i_half.cpp binary_operator_double.cpp binary_operator_float.cpp + binary_operator_half.cpp binary_two_results_i_double.cpp binary_two_results_i_float.cpp + binary_two_results_i_half.cpp common.cpp common.h function_list.cpp diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp index dffd7d095..4b495c953 100644 --- a/test_conformance/math_brute_force/binary_half.cpp +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -27,7 +27,6 @@ namespace { -//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -40,7 +39,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread struct ThreadInfo { @@ -58,7 +56,6 @@ struct ThreadInfo tQueue; // per thread command queue to improve performance }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -76,7 +73,6 @@ struct TestInfoBase int isNextafter; }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -92,158 +88,24 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// // A table of more difficult cases to get right -static const cl_half specialValuesHalf[] = { - 0xffff, - 0x0000, - 0x0001, - 0x7c00 /*INFINITY*/, - 0xfc00 /*-INFINITY*/, - 0x8000 /*-0*/, - 0x7bff /*HALF_MAX*/, - 0x0400 /*HALF_MIN*/ +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ }; -//////////////////////////////////////////////////////////////////////////////// -static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -//////////////////////////////////////////////////////////////////////////////// -int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, - bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - // Init test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->half_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); - test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; - test_info.isNextafter = isNextafter; - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - return error; - } - test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); - } +size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - - return error; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; @@ -254,7 +116,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) fptr func = job->f->func; int ftz = job->ftz; MTdata d = tinfo->d; - cl_uint j, k; cl_int error; const char *name = job->f->name; @@ -264,6 +125,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_ushort *t; cl_half *r; std::vector s(0), s2(0); + cl_uint j = 0; RoundingMode oldRoundMode; cl_int copysign_test = 0; @@ -352,12 +214,13 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_size); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); return error; } @@ -425,24 +288,24 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) s.resize(buffer_elements); s2.resize(buffer_elements); for (j = 0; j < buffer_elements; j++) - for (j = 0; j < buffer_elements; j++) - { - s[j] = cl_half_to_float(p[j]); - s2[j] = cl_half_to_float(p2[j]); - if (isNextafter) - r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]), - CL_HALF_RTE); - else - r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE); - } + { + s[j] = cl_half_to_float(p[j]); + s2[j] = cl_half_to_float(p2[j]); + if (isNextafter) + r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]), + CL_HALF_RTE); + else + r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE); + } if (isFDim && ftz) RestoreFPState(&oldMode); // Read the data back -- no need to wait for the first N-1 buffers. This is // an in order queue. - for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { + cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; out[j] = (cl_ushort *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); if (error || NULL == out[j]) { @@ -452,21 +315,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } } - // Wait for the last buffer - out[j] = (cl_ushort *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, - 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - return error; - } - // Verify data for (j = 0; j < buffer_elements; j++) { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_ushort *q = out[k]; @@ -498,8 +351,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fail && ftz) { // retry per section 6.5.3.2 - if (IsHalfSubnormal( - cl_half_from_float(correct, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct, ulps)) { if (isNextafter) { @@ -549,10 +401,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct3, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) { if (fail && isNextafter) { @@ -631,14 +481,10 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct3, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct4, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct5, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps) + || IsHalfResultSubnormal(correct4, ulps) + || IsHalfResultSubnormal(correct5, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -693,10 +539,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) check_error(); // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct3, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) { if (fail && isNextafter) { @@ -731,9 +575,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } if (fail) { - vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), " - "%a (0x%0.4x)}\nExpected: %a (half 0x%0.4x) " - "\nActual: %a (half 0x%0.4x) at index: %d\n", + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%a (0x%04x)}\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x) at index: %zu\n", name, sizeNames[k], err, s[j], p[j], s2[j], p2[j], cl_half_to_float(r[j]), r[j], test, q[j], j); @@ -778,13 +622,143 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) return error; } -//////////////////////////////////////////////////////////////////////////////// +} // anonymous namespace + +int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, + bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); + test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; + test_info.isNextafter = isNextafter; + + test_info.tinfo.resize(test_info.threadCount); + + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error( + "Error: Unable to create sub-buffer of gOutBuffer[%d] " + "for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} + int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) { return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode); } -//////////////////////////////////////////////////////////////////////////////// int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode) { return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode); diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp index 571683e5d..dcfd28551 100644 --- a/test_conformance/math_brute_force/binary_i_half.cpp +++ b/test_conformance/math_brute_force/binary_i_half.cpp @@ -24,7 +24,6 @@ namespace { -//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -37,7 +36,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread typedef struct ThreadInfo { @@ -54,7 +52,6 @@ typedef struct ThreadInfo tQueue; // per thread command queue to improve performance } ThreadInfo; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -68,7 +65,6 @@ struct TestInfoBase int ftz; // non-zero if running in flush to zero mode }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -84,168 +80,29 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// - // A table of more difficult cases to get right -static const cl_half specialValuesHalf[] = { - 0xffff, - 0x0000, - 0x0001, - 0x7c00 /*INFINITY*/, - 0xfc00 /*-INFINITY*/, - 0x8000 /*-0*/, - 0x7bff /*HALF_MAX*/, - 0x0400 /*HALF_MIN*/ +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ }; -static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); +size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); -static const int specialValuesInt3[] = { 0, 1, 2, 3, - 1022, 1023, 1024, INT_MIN, - INT_MAX, -1, -2, -3, - -1022, -1023, -11024, -INT_MAX }; -static size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3); +const int specialValuesInt3[] = { 0, 1, 2, 3, 1022, 1023, + 1024, INT_MIN, INT_MAX, -1, -2, -3, + -1022, -1023, -11024, -INT_MAX }; +size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3); -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - cl_int maxErrorVal2 = 0; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - - // Init test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->half_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - cl_buffer_region region2 = { i * test_info.subBufferSize - * sizeof(cl_int), - test_info.subBufferSize * sizeof(cl_int) }; - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - return error; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - - return error; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; @@ -348,7 +205,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_elements * sizeof(cl_half)); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) @@ -404,7 +261,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) for (j = 0; j < buffer_elements; j++) { s[j] = cl_half_to_float(p[j]); - r[j] = cl_half_from_float(func.f_fi(s[j], s2[j]), CL_HALF_RTE); + r[j] = HFF(func.f_fi(s[j], s2[j])); } // Read the data back -- no need to wait for the first N-1 buffers. This is @@ -450,8 +307,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fail && ftz) { // retry per section 6.5.3.2 - if (IsHalfSubnormal( - cl_half_from_float(correct, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -473,10 +329,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct3, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -492,9 +346,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } if (fail) { - vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), " - "%d}\nExpected: %a (half 0x%0.4x) \nActual: %a " - "(half 0x%0.4x) at index: %d\n", + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%d}\nExpected: %a (half 0x%04x) \nActual: %a " + "(half 0x%04x) at index: %d\n", name, sizeNames[k], err, s[j], p[j], s2[j], cl_half_to_float(r[j]), r[j], test, q[j], (cl_uint)j); @@ -535,3 +389,139 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } return error; } + +} // anonymous namespace + +int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp new file mode 100644 index 000000000..2d3196474 --- /dev/null +++ b/test_conformance/math_brute_force/binary_operator_half.cpp @@ -0,0 +1,663 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +struct ThreadInfo +{ + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + + // max error value. Init to 0. + float maxError; + // position of the max error value (param 1). Init to 0. + double maxErrorValue; + // position of the max error value (param 2). Init to 0. + double maxErrorValue2; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; + +struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector tinfo; + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + // no special fields +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_int error; + + const char *name = job->f->name; + cl_half *r = 0; + std::vector s(0), s2(0); + RoundingMode oldRoundMode; + + cl_event e[VECTOR_SIZE_COUNT]; + cl_half *out[VECTOR_SIZE_COUNT]; + + // start the map of the output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + bool divide = strcmp(name, "divide") == 0; + + // Init input array + cl_half *p = (cl_half *)gIn + thread_id * buffer_elements; + cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements; + cl_uint idx = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)lastSpecialJobIndex) + { + // Insert special values + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; idx < buffer_elements; idx++) + { + p[idx] = specialValuesHalf[x]; + p2[idx] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + + if (divide) + { + cl_half pj = p[idx] & 0x7fff; + cl_half p2j = p2[idx] & 0x7fff; + // Replace values outside [2^-7, 2^7] with QNaN + if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN + if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00; + } + } + } + + // Init any remaining values + for (; idx < buffer_elements; idx++) + { + p[idx] = (cl_half)genrand_int32(d); + p2[idx] = (cl_half)genrand_int32(d); + + if (divide) + { + cl_half pj = p[idx] & 0x7fff; + cl_half p2j = p2[idx] & 0x7fff; + // Replace values outside [2^-7, 2^7] with QNaN + if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN + if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00; + } + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xACDCACDC; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); + return error; + } + + // Run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) + { + return CL_SUCCESS; + } + + // Calculate the correctly rounded reference result + FPU_mode_type oldMode; + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); + + // Set the rounding mode to match the device + oldRoundMode = kRoundToNearestEven; + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + s.resize(buffer_elements); + s2.resize(buffer_elements); + + for (size_t j = 0; j < buffer_elements; j++) + { + s[j] = HTF(p[j]); + s2[j] = HTF(p2[j]); + r[j] = HFF(func.f_ff(s[j], s2[j])); + } + + if (ftz) RestoreFPState(&oldMode); + + // Read the data back -- no need to wait for the first N-1 buffers but wait + // for the last buffer. This is an in order queue. + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Verify data + + for (size_t j = 0; j < buffer_elements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *q = out[k]; + + // If we aren't getting the correctly rounded result + if (r[j] != q[j]) + { + float test = HTF(q[j]); + float correct = func.f_ff(s[j], s2[j]); + + // Per section 10 paragraph 6, accept any result if an input or + // output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference functions + // calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) + continue; + } + + float err = Ulp_Error_Half(q[j], correct); + + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + + correct2 = HTF(func.f_ff(0.0, s2[j])); + correct3 = HTF(func.f_ff(-0.0, s2[j])); + + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + double correct4, correct5; + float err4, err5; + + correct2 = HTF(func.f_ff(0.0, 0.0)); + correct3 = HTF(func.f_ff(-0.0, 0.0)); + correct4 = HTF(func.f_ff(0.0, -0.0)); + correct5 = HTF(func.f_ff(-0.0, -0.0)); + + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + err4 = Ulp_Error_Half(q[j], correct4); + err5 = Ulp_Error_Half(q[j], correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps) + || IsHalfResultSubnormal(correct4, ulps) + || IsHalfResultSubnormal(correct5, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + double correct2, correct3; + float err2, err3; + + correct2 = HTF(func.f_ff(s[j], 0.0)); + correct3 = HTF(func.f_ff(s[j], -0.0)); + + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(correct2) + || IsFloatNaN(correct2)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%a (0x%04x)}\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x) at index: %zu\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + p2[j], HTF(r[j]), r[j], test, q[j], j); + return -1; + } + } + } + } + + if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return CL_SUCCESS; +} + +} // anonymous namespace + +int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/binary_two_results_i_half.cpp b/test_conformance/math_brute_force/binary_two_results_i_half.cpp new file mode 100644 index 000000000..3900e62d5 --- /dev/null +++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp @@ -0,0 +1,485 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include +#include + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Int, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +struct ComputeReferenceInfoF +{ + const cl_half *x; + const cl_half *y; + cl_half *r; + int32_t *i; + double (*f_ffpI)(double, double, int *); + cl_uint lim; + cl_uint count; +}; + +cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) +{ + ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; + cl_uint lim = cri->lim; + cl_uint count = cri->count; + cl_uint off = jid * count; + const cl_half *x = cri->x + off; + const cl_half *y = cri->y + off; + cl_half *r = cri->r + off; + int32_t *i = cri->i + off; + double (*f)(double, double, int *) = cri->f_ffpI; + + if (off + count > lim) count = lim - off; + + for (cl_uint j = 0; j < count; ++j) + r[j] = HFF((float)f((double)HTF(x[j]), (double)HTF(y[j]), i + j)); + + return CL_SUCCESS; +} + +} // anonymous namespace + +int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + int64_t maxError2 = 0; + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + // use larger type of output data to prevent overflowing buffer size + constexpr size_t buffer_size = BUFFER_SIZE / sizeof(int32_t); + + cl_uint threadCount = GetThreadCount(); + + float half_ulps = f->half_ulps; + + int testingRemquo = !strcmp(f->name, "remquo"); + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_half *p = (cl_half *)gIn; + cl_half *p2 = (cl_half *)gIn2; + for (size_t j = 0; j < buffer_size; j++) + { + p[j] = (cl_half)genrand_int32(d); + p2[j] = (cl_half)genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + buffer_size * sizeof(cl_half), gIn, 0, + NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + buffer_size * sizeof(cl_half), gIn2, + 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + + memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut2[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + return error; + } + } + else + { + if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], + &pattern, sizeof(pattern), 0, + BUFFER_SIZE, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n", + error); + return error; + } + + if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], + &pattern, sizeof(pattern), 0, + BUFFER_SIZE, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n", + error); + return error; + } + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // align working group size with the bigger output type + size_t vectorSize = sizeValues[j] * sizeof(int32_t); + size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = + clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gOutBuffer2[j]), &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + if (threadCount > 1) + { + ComputeReferenceInfoF cri; + cri.x = p; + cri.y = p2; + cri.r = (cl_half *)gOut_Ref; + cri.i = (int32_t *)gOut_Ref2; + cri.f_ffpI = f->func.f_ffpI; + cri.lim = buffer_size; + cri.count = (cri.lim + threadCount - 1) / threadCount; + ThreadPool_Do(ReferenceF, threadCount, &cri); + } + else + { + cl_half *r = (cl_half *)gOut_Ref; + int32_t *r2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < buffer_size; j++) + r[j] = + HFF((float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), r2 + j)); + } + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = + (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0, + BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0, + BUFFER_SIZE, gOut2[j], 0, NULL, NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + cl_half *t = (cl_half *)gOut_Ref; + int32_t *t2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < buffer_size; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *q = (cl_half *)(gOut[k]); + int32_t *q2 = (int32_t *)gOut2[k]; + + // Check for exact match to correctly rounded result + if (t[j] == q[j] && t2[j] == q2[j]) continue; + + // Check for paired NaNs + if (IsHalfNaN(t[j]) && IsHalfNaN(q[j]) && t2[j] == q2[j]) + continue; + + cl_half test = ((cl_half *)q)[j]; + int correct2 = INT_MIN; + float correct = + (float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), &correct2); + float err = Ulp_Error_Half(test, correct); + int64_t iErr; + + // in case of remquo, we only care about the sign and last + // seven bits of integer as per the spec. + if (testingRemquo) + iErr = (long long)(q2[j] & 0x0000007f) + - (long long)(correct2 & 0x0000007f); + else + iErr = (long long)q2[j] - (long long)correct2; + + // For remquo, if y = 0, x is infinite, or either is NaN + // then the standard either neglects to say what is returned + // in iptr or leaves it undefined or implementation defined. + int iptrUndefined = IsHalfInfinity(p[j]) || (HTF(p2[j]) == 0.0f) + || IsHalfNaN(p2[j]) || IsHalfNaN(p[j]); + if (iptrUndefined) iErr = 0; + + int fail = !(fabsf(err) <= half_ulps && iErr == 0); + if (ftz && fail) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, half_ulps)) + { + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + int correct3i, correct4i; + float correct3 = + (float)f->func.f_ffpI(0.0, HTF(p2[j]), &correct3i); + float correct4 = + (float)f->func.f_ffpI(-0.0, HTF(p2[j]), &correct4i); + float err2 = Ulp_Error_Half(test, correct3); + float err3 = Ulp_Error_Half(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(correct3, half_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + int correct7i, correct8i; + correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i); + correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i); + double correct7 = + f->func.f_ffpI(0.0, -0.0, &correct7i); + double correct8 = + f->func.f_ffpI(-0.0, -0.0, &correct8i); + err2 = Ulp_Error_Half(test, correct3); + err3 = Ulp_Error_Half(test, correct4); + float err4 = Ulp_Error_Half(test, correct7); + float err5 = Ulp_Error_Half(test, correct8); + iErr3 = (long long)q2[j] - (long long)correct3i; + iErr4 = (long long)q2[j] - (long long)correct4i; + int64_t iErr7 = + (long long)q2[j] - (long long)correct7i; + int64_t iErr8 = + (long long)q2[j] - (long long)correct8i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps + && iErr4 == 0)) + && (!(fabsf(err4) <= half_ulps + && iErr7 == 0)) + && (!(fabsf(err5) <= half_ulps + && iErr8 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; + if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct3, half_ulps) + || IsHalfResultSubnormal(correct4, half_ulps) + || IsHalfResultSubnormal(correct7, half_ulps) + || IsHalfResultSubnormal(correct8, half_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0 + || iErr7 == 0 || iErr8 == 0)); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + int correct3i, correct4i; + double correct3 = + f->func.f_ffpI(HTF(p[j]), 0.0, &correct3i); + double correct4 = + f->func.f_ffpI(HTF(p[j]), -0.0, &correct4i); + float err2 = Ulp_Error_Half(test, correct3); + float err3 = Ulp_Error_Half(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(correct3, half_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = HTF(p[j]); + } + if (llabs(iErr) > maxError2) + { + maxError2 = llabs(iErr); + maxErrorVal2 = HTF(p[j]); + } + + if (fail) + { + vlog_error("\nERROR: %s%s: {%f, %" PRId64 + "} ulp error at {%a, %a} " + "({0x%04x, 0x%04x}): *{%a, %d} ({0x%04x, " + "0x%8.8x}) vs. {%a, %d} ({0x%04x, 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, HTF(p[j]), + HTF(p2[j]), p[j], p2[j], HTF(t[j]), t2[j], t[j], + t2[j], HTF(test), q2[j], test, q2[j]); + return -1; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp index 67ed0d8ac..b2f3de82e 100644 --- a/test_conformance/math_brute_force/function_list.cpp +++ b/test_conformance/math_brute_force/function_list.cpp @@ -164,7 +164,7 @@ static constexpr vtbl _binary_operator = { "binaryOperator", TestFunc_Float_Float_Float_Operator, TestFunc_Double_Double_Double_Operator, - NULL, + TestFunc_Half_Half_Half_Operator, }; static constexpr vtbl _binary_i = { @@ -206,7 +206,7 @@ static constexpr vtbl _binary_two_results_i = { "binary_two_results_i", TestFunc_FloatI_Float_Float, TestFunc_DoubleI_Double_Double, - NULL, + TestFunc_HalfI_Half_Half, }; static constexpr vtbl _mad_tbl = { diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp index c78c03a49..ada2aa89a 100644 --- a/test_conformance/math_brute_force/i_unary_half.cpp +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ -23,7 +23,8 @@ #include #include -//////////////////////////////////////////////////////////////////////////////// +namespace { + static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -36,7 +37,8 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// +} // anonymous namespace + int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) { int error; @@ -174,7 +176,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) uint32_t err = t[j] - q[j]; if (q[j] > t[j]) err = q[j] - t[j]; - vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%0.4x): " + vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%04x): " "*%d vs. %d\n", f->name, sizeNames[k], err, s[j], p[j], t[j], q[j]); diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp index 8af034c43..6157a9ebb 100644 --- a/test_conformance/math_brute_force/macro_binary_half.cpp +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -21,10 +21,8 @@ #include - namespace { -//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -37,7 +35,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// struct ThreadInfo { clMemWrapper inBuf; // input buffer for the thread @@ -48,7 +45,6 @@ struct ThreadInfo tQueue; // per thread command queue to improve performance }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -61,7 +57,6 @@ struct TestInfoBase int ftz; // non-zero if running in flush to zero mode }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -77,139 +72,24 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// // A table of more difficult cases to get right -static const cl_half specialValuesHalf[] = { - 0xffff, - 0x0000, - 0x0001, - 0x7c00 /*INFINITY*/, - 0xfc00 /*-INFINITY*/, - 0x8000 /*-0*/, - 0x7bff /*HALF_MAX*/, - 0x0400 /*HALF_MIN*/ +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ }; -//////////////////////////////////////////////////////////////////////////////// -static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -//////////////////////////////////////////////////////////////////////////////// -int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - - // Init test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - return error; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - vlog("\n"); - - return error; -} +size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; @@ -310,7 +190,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_size); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) @@ -370,7 +250,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) r[j] = (short)func.i_ff(s[j], s2[j]); } - // Read the data back -- no need to wait for the first N-1 buffers. This is // an in order queue. for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) @@ -437,8 +316,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) short err = t[j] - q[j]; if (q[j] > t[j]) err = q[j] - t[j]; vlog_error( - "\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a " - "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x (index: %d)\n", + "\nERROR: %s: %d ulp error at {%a (0x%04x), %a " + "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x (index: %d)\n", name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j); error = -1; return error; @@ -484,8 +363,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_ushort err = -t[j] - q[j]; if (q[j] > -t[j]) err = q[j] + t[j]; - vlog_error("\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a " - "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x " + vlog_error("\nERROR: %s: %d ulp error at {%a (0x%04x), %a " + "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x " "(index: %d)\n", name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j); error = -1; @@ -526,3 +405,116 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) return error; } + +} // anonymous namespace + +int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp index 755b772cd..ae359b3e5 100644 --- a/test_conformance/math_brute_force/macro_unary_half.cpp +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -23,7 +23,6 @@ namespace { -//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -35,7 +34,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread struct ThreadInfo { @@ -45,7 +43,6 @@ struct ThreadInfo tQueue; // per thread command queue to improve performance }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -57,7 +54,6 @@ struct TestInfoBase int ftz; // non-zero if running in flush to zero mode }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -73,114 +69,7 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -//////////////////////////////////////////////////////////////////////////////// -int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - // Init test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = - std::max((cl_uint)1, - (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); - } - - test_info.f = f; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - return error; - } - } - - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - vlog("\n"); - - return error; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; @@ -246,7 +135,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_size); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) @@ -353,7 +242,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) short err = t[j] - q[j]; if (q[j] > t[j]) err = q[j] - t[j]; - vlog_error("\nERROR: %s: %d ulp error at %a (0x%0.4x)\nExpected: " + vlog_error("\nERROR: %s: %d ulp error at %a (0x%04x)\nExpected: " "%d vs. %d\n", name, err, s[j], p[j], t[j], q[j]); error = -1; @@ -381,7 +270,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) short err = -t[j] - q[j]; if (q[j] > -t[j]) err = q[j] + t[j]; vlog_error("\nERROR: %s%s: %d ulp error at %a " - "(0x%0.4x)\nExpected: %d \nActual: %d\n", + "(0x%04x)\nExpected: %d \nActual: %d\n", name, sizeNames[k], err, s[j], p[j], -t[j], q[j]); error = -1; return error; @@ -419,3 +308,105 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } return error; } + +} // anonymous namespace + +int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp index ef6f2b776..5cb73d4b1 100644 --- a/test_conformance/math_brute_force/mad_half.cpp +++ b/test_conformance/math_brute_force/mad_half.cpp @@ -21,7 +21,8 @@ #include -//////////////////////////////////////////////////////////////////////////////// +namespace { + cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -34,7 +35,8 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// +} // anonymous namespace + int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) { int error; @@ -42,7 +44,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) KernelMatrix kernels; const unsigned thread_id = 0; // Test is currently not multithreaded. float maxError = 0.0f; - // int ftz = f->ftz || gForceFTZ; + float maxErrorVal = 0.0f; float maxErrorVal2 = 0.0f; float maxErrorVal3 = 0.0f; @@ -96,7 +98,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) // write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp index 3739199ac..93dc612f7 100644 --- a/test_conformance/math_brute_force/ternary_half.cpp +++ b/test_conformance/math_brute_force/ternary_half.cpp @@ -41,14 +41,17 @@ cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) // A table of more difficult cases to get right static const cl_half specialValuesHalf[] = { - 0xffff, - 0x0000, - 0x0001, - 0x7c00 /*INFINITY*/, - 0xfc00 /*-INFINITY*/, - 0x8000 /*-0*/, - 0x7bff /*HALF_MAX*/, - 0x0400 /*HALF_MIN*/ + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ }; constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); @@ -78,8 +81,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); // Init the kernels - BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode, - relaxedMode }; + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -294,7 +296,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) test != correct ? Ulp_Error_Half(test, ref1) : 0.f; fail = !(fabsf(err) <= half_ulps); - if (fail && (ftz || relaxedMode)) + if (fail && ftz) { // retry per section 6.5.3.2 with flushing on if (0.0f == test diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h index 16f57013c..16b361d53 100644 --- a/test_conformance/math_brute_force/test_functions.h +++ b/test_conformance/math_brute_force/test_functions.h @@ -87,6 +87,9 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata, int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata, bool relaxedMode); +// half op half +int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata, bool relaxedMode); + // float foo(float, int) int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode); @@ -135,6 +138,9 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, double, int*) int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half, int*) +int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode); + // Special handling for mad. // float mad(float, float, float) int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode); diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp index 5b0eab4c6..f6e914c8a 100644 --- a/test_conformance/math_brute_force/unary_half.cpp +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -23,7 +23,6 @@ namespace { -//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -35,7 +34,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread typedef struct ThreadInfo { @@ -47,7 +45,6 @@ typedef struct ThreadInfo tQueue; // per thread command queue to improve performance } ThreadInfo; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -64,7 +61,6 @@ struct TestInfoBase float half_sin_cos_tan_limit; }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -80,147 +76,7 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -//////////////////////////////////////////////////////////////////////////////// -int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - - // Init test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = - std::max((cl_uint)1, - (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); - } - - test_info.f = f; - test_info.ulps = f->half_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - return error; - } - } - - // Check for special cases for unary float - test_info.isRangeLimited = 0; - test_info.half_sin_cos_tan_limit = 0; - if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) - { - test_info.isRangeLimited = 1; - test_info.half_sin_cos_tan_limit = 1.0f - + test_info.ulps - * (FLT_EPSILON / 2.0f); // out of range results from finite - // inputs must be in [-1,1] - } - else if (0 == strcmp(f->name, "half_tan")) - { - test_info.isRangeLimited = 1; - test_info.half_sin_cos_tan_limit = - INFINITY; // out of range resut from finite inputs must be numeric - } - - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - } - } - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); - vlog("\n"); - - return error; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; @@ -288,7 +144,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_size); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) @@ -333,12 +189,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Calculate the correctly rounded reference result cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements; - cl_ushort *t = (cl_ushort *)r; s.resize(buffer_elements); for (j = 0; j < buffer_elements; j++) { s[j] = (float)cl_half_to_float(p[j]); - r[j] = cl_half_from_float(func.f_f(s[j]), CL_HALF_RTE); + r[j] = HFF(func.f_f(s[j])); } // Read the data back -- no need to wait for the first N-1 buffers. This is @@ -373,7 +228,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_ushort *q = out[k]; // If we aren't getting the correctly rounded result - if (t[j] != q[j]) + if (r[j] != q[j]) { float test = cl_half_to_float(q[j]); double correct = func.f_f(s[j]); @@ -397,8 +252,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (ftz) { // retry per section 6.5.3.2 - if (IsHalfSubnormal( - cl_half_from_float(correct, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -418,10 +272,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct3, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -437,10 +289,10 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fail) { vlog_error("\nERROR: %s%s: %f ulp error at %a " - "(0x%0.4x)\nExpected: %a (half 0x%0.4x) " - "\nActual: %a (half 0x%0.4x)\n", + "(half 0x%04x)\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x)\n", job->f->name, sizeNames[k], err, s[j], p[j], - cl_half_to_float(r[j]), t[j], test, q[j]); + cl_half_to_float(r[j]), r[j], test, q[j]); error = -1; return error; } @@ -480,3 +332,138 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) return error; } + +} // anonymous namespace + +int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + } + + // Check for special cases for unary float + test_info.isRangeLimited = 0; + test_info.half_sin_cos_tan_limit = 0; + if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = 1.0f + + test_info.ulps + * (FLT_EPSILON / 2.0f); // out of range results from finite + // inputs must be in [-1,1] + } + else if (0 == strcmp(f->name, "half_tan")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = + INFINITY; // out of range resut from finite inputs must be numeric + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp index 3f8d71168..18d4dadd0 100644 --- a/test_conformance/math_brute_force/unary_two_results_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -62,8 +62,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) float half_ulps = f->half_ulps; // Init the kernels - BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode, - relaxedMode }; + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -77,22 +76,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) const unsigned m_size = 0x1ff; const unsigned e_size = 0xf; const unsigned s_size = 0x2; - const unsigned sclamp = 0xffff; for (size_t j = 0; j < half_buffer_size; j++) { unsigned ind = j % (s_size * e_size * m_size); unsigned val = (((ind / (e_size * m_size)) << 15) | (((ind / m_size) % e_size + 1) << 10) - | (ind % m_size + 1)) - & sclamp; + | (ind % m_size + 1)); pIn[j] = val; - - if (relaxedMode && strcmp(f->name, "sincos") == 0) - { - float pj = HTF(pIn[j]); - if (fabs(pj) > M_PI) pIn[j] = 0x7e00; // HALF_NAN - } } } @@ -106,7 +97,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) // Write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint32_t pattern = 0xffffdead; + uint32_t pattern = 0xacdcacdc; if (gHostFill) { memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); @@ -200,7 +191,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) { // Calculate the correctly rounded reference result memset(&oldMode, 0, sizeof(oldMode)); - if (ftz || relaxedMode) ForceFTZ(&oldMode); + if (ftz) ForceFTZ(&oldMode); // Set the rounding mode to match the device if (gIsInRTZMode) @@ -218,11 +209,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) double dd; feclearexcept(FE_OVERFLOW); - if (relaxedMode) - ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd)); - else - ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); - + ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); ref2[j] = HFF((float)dd); overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); @@ -233,11 +220,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) for (size_t j = 0; j < half_buffer_size; j++) { double dd; - if (relaxedMode) - ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd)); - else - ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); - + ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); ref2[j] = HFF((float)dd); } } @@ -283,17 +266,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) double fp_correct1 = 0, fp_correct2 = 0; float err = 0, err2 = 0; - if (relaxedMode) - fp_correct1 = f->rfunc.f_fpf(HTF(pIn[j]), &fp_correct2); - else - fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2); + fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2); cl_half correct1 = HFF(fp_correct1); cl_half correct2 = HFF(fp_correct2); // Per section 10 paragraph 6, accept any result if an input // or output is a infinity or NaN or overflow - if (relaxedMode || skipNanInf) + if (skipNanInf) { if (skipNanInf && overflow[j]) continue; // Note: no double rounding here. Reference functions @@ -304,35 +284,18 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) continue; } - // If we are in fast relaxed math, we - // have a different calculation for the - // subnormal threshold. - typedef int (*CheckForSubnormal)(double, float); - CheckForSubnormal isFloatResultSubnormalPtr; - if (relaxedMode) - { - err = Abs_Error(HTF(test1[j]), fp_correct1); - err2 = Abs_Error(HTF(test2[j]), fp_correct2); - isFloatResultSubnormalPtr = - &IsFloatResultSubnormalAbsError; - } - else - { - err = Ulp_Error_Half(test1[j], fp_correct1); - err2 = Ulp_Error_Half(test2[j], fp_correct2); - isFloatResultSubnormalPtr = &IsFloatResultSubnormal; - } + err = Ulp_Error_Half(test1[j], fp_correct1); + err2 = Ulp_Error_Half(test2[j], fp_correct2); + int fail = !(fabsf(err) <= half_ulps && fabsf(err2) <= half_ulps); - if (ftz || relaxedMode) + if (ftz) { // retry per section 6.5.3.2 - if ((*isFloatResultSubnormalPtr)(fp_correct1, - half_ulps)) + if (IsHalfResultSubnormal(fp_correct1, half_ulps)) { - if ((*isFloatResultSubnormalPtr)(fp_correct2, - half_ulps)) + if (IsHalfResultSubnormal(fp_correct2, half_ulps)) { fail = fail && !(HTF(test1[j]) == 0.0f @@ -351,8 +314,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) if (!fail) err = 0.0f; } } - else if ((*isFloatResultSubnormalPtr)(fp_correct2, - half_ulps)) + else if (IsHalfResultSubnormal(fp_correct2, half_ulps)) { fail = fail && !(HTF(test2[j]) == 0.0f @@ -369,19 +331,8 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) float errp, err2p, errn, err2n; if (skipNanInf) feclearexcept(FE_OVERFLOW); - if (relaxedMode) - { - fp_correctp = - f->rfunc.f_fpf(0.0, &fp_correct2p); - fp_correctn = - f->rfunc.f_fpf(-0.0, &fp_correct2n); - } - else - { - fp_correctp = f->func.f_fpf(0.0, &fp_correct2p); - fp_correctn = - f->func.f_fpf(-0.0, &fp_correct2n); - } + fp_correctp = f->func.f_fpf(0.0, &fp_correct2p); + fp_correctn = f->func.f_fpf(-0.0, &fp_correct2n); cl_half correctp = HFF(fp_correctp); cl_half correctn = HFF(fp_correctn); @@ -408,20 +359,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) continue; } - if (relaxedMode) - { - errp = Abs_Error(HTF(test1[j]), fp_correctp); - err2p = Abs_Error(HTF(test1[j]), fp_correct2p); - errn = Abs_Error(HTF(test1[j]), fp_correctn); - err2n = Abs_Error(HTF(test1[j]), fp_correct2n); - } - else - { - errp = Ulp_Error_Half(test1[j], fp_correctp); - err2p = Ulp_Error_Half(test1[j], fp_correct2p); - errn = Ulp_Error_Half(test1[j], fp_correctn); - err2n = Ulp_Error_Half(test1[j], fp_correct2n); - } + errp = Ulp_Error_Half(test1[j], fp_correctp); + err2p = Ulp_Error_Half(test1[j], fp_correct2p); + errn = Ulp_Error_Half(test1[j], fp_correctn); + err2n = Ulp_Error_Half(test1[j], fp_correct2n); fail = fail && ((!(fabsf(errp) <= half_ulps)) @@ -434,15 +375,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) if (fabsf(err2n) < fabsf(err2)) err2 = err2n; // retry per section 6.5.3.4 - if ((*isFloatResultSubnormalPtr)(fp_correctp, - half_ulps) - || (*isFloatResultSubnormalPtr)(fp_correctn, - half_ulps)) + if (IsHalfResultSubnormal(fp_correctp, half_ulps) + || IsHalfResultSubnormal(fp_correctn, + half_ulps)) { - if ((*isFloatResultSubnormalPtr)(fp_correct2p, - half_ulps) - || (*isFloatResultSubnormalPtr)( - fp_correct2n, half_ulps)) + if (IsHalfResultSubnormal(fp_correct2p, + half_ulps) + || IsHalfResultSubnormal(fp_correct2n, + half_ulps)) { fail = fail && !(HTF(test1[j]) == 0.0f @@ -457,10 +397,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) if (!fail) err = 0.0f; } } - else if ((*isFloatResultSubnormalPtr)(fp_correct2p, - half_ulps) - || (*isFloatResultSubnormalPtr)( - fp_correct2n, half_ulps)) + else if (IsHalfResultSubnormal(fp_correct2p, + half_ulps) + || IsHalfResultSubnormal(fp_correct2n, + half_ulps)) { fail = fail && !(HTF(test2[j]) == 0.0f diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp index 241377dda..9a769447f 100644 --- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp @@ -72,8 +72,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) maxiError = half_ulps == INFINITY ? CL_ULONG_MAX : 0; // Init the kernels - BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode, - relaxedMode }; + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -88,15 +87,13 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) const unsigned m_size = 0x1ff; const unsigned e_size = 0xf; const unsigned s_size = 0x2; - const unsigned sclamp = 0xffff; for (size_t j = 0; j < half_buffer_size; j++) { unsigned ind = j % (s_size * e_size * m_size); unsigned val = (((ind / (e_size * m_size)) << 15) | (((ind / m_size) % e_size + 1) << 10) - | (ind % m_size + 1)) - & sclamp; + | (ind % m_size + 1)); pIn[j] = val; } } @@ -111,7 +108,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // Write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint32_t pattern = 0xffffdead; + uint32_t pattern = 0xacdcacdc; if (gHostFill) { memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); @@ -161,9 +158,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // Run the kernels for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - // sizeof(cl_half) < sizeof (int32_t) - // to prevent overflowing gOut_Ref2 it is necessary to use - // bigger type as denominator for buffer size calculation + // align working group size with the bigger output type size_t vectorSize = sizeValues[j] * sizeof(int32_t); size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; if ((error = clSetKernelArg(kernels[j][thread_id], 0, @@ -211,15 +206,17 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // Read the data back for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { + cl_bool blocking = + (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0, BUFFER_SIZE, gOut[j], 0, NULL, NULL))) { vlog_error("ReadArray failed %d\n", error); return error; } if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0, BUFFER_SIZE, gOut2[j], 0, NULL, NULL))) { vlog_error("ReadArray2 failed %d\n", error); @@ -251,10 +248,10 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) cl_long iErr = (int64_t)test2[j] - (int64_t)correct2; int fail = !(fabsf(err) <= half_ulps && abs_cl_long(iErr) <= maxiError); - if (ftz || relaxedMode) + if (ftz) { // retry per section 6.5.3.2 - if (IsFloatResultSubnormal(fp_correct, half_ulps)) + if (IsHalfResultSubnormal(fp_correct, half_ulps)) { fail = fail && !(test == 0.0f && iErr == 0); if (!fail) err = 0.0f; @@ -294,9 +291,9 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // retry per section 6.5.3.4 if (fail - && (IsFloatResultSubnormal(correct2, half_ulps) - || IsFloatResultSubnormal(fp_correct3, - half_ulps))) + && (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(fp_correct3, + half_ulps))) { fail = fail && !(test == 0.0f diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp index 842e85a9b..e2ff93705 100644 --- a/test_conformance/math_brute_force/unary_u_half.cpp +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -23,7 +23,8 @@ #include #include -//////////////////////////////////////////////////////////////////////////////// +namespace { + static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -36,7 +37,8 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// +} // anonymous namespace + int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) { int error; @@ -90,7 +92,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) // write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(gOut[j], &pattern, bufferSize); if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, @@ -139,7 +141,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) if (!strcmp(name, "nan")) r[j] = reference_nanh(p[j]); else - r[j] = cl_half_from_float(f->func.f_u(p[j]), CL_HALF_RTE); + r[j] = HFF(f->func.f_u(p[j])); } // Read the data back for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) @@ -181,8 +183,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) if (ftz) { // retry per section 6.5.3.2 - if (IsHalfSubnormal( - cl_half_from_float(correct, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct, half_ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -197,8 +198,8 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) if (fail) { vlog_error( - "\n%s%s: %f ulp error at 0x%0.4x \nExpected: %a " - "(0x%0.4x) \nActual: %a (0x%0.4x)\n", + "\n%s%s: %f ulp error at 0x%04x \nExpected: %a " + "(0x%04x) \nActual: %a (0x%04x)\n", f->name, sizeNames[k], err, p[j], cl_half_to_float(r[j]), r[j], test, q[j]); return -1; diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h index d11ce6f36..264fc7a43 100644 --- a/test_conformance/math_brute_force/utility.h +++ b/test_conformance/math_brute_force/utility.h @@ -126,6 +126,12 @@ inline int IsFloatResultSubnormal(double x, float ulps) return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126); } +inline int IsHalfResultSubnormal(float x, float ulps) +{ + x = fabs(x) - MAKE_HEX_FLOAT(0x1.0p-24, 0x1, -24) * ulps; + return x < MAKE_HEX_FLOAT(0x1.0p-14, 0x1, -14); +} + inline int IsFloatResultSubnormalAbsError(double x, float abs_err) { x = x - abs_err;