Skip to content

Commit

Permalink
Fix problems with integer division and integer conversion from float
Browse files Browse the repository at this point in the history
- The rounding mode of vcvtq_s32_f32 is round whereas vcvttq_s32_f32 (notice the extra t) is truncate. This causes issues dividing but also casting from float
- In NEON, casting cannot be done by e.g. (float32x4_t) but instead needs to be explicit vcvtq_f32_s32. The division function is still technically wrong for large values but at least it's consistent now with SSE
- Added new unit tests for float -> int conversions
  • Loading branch information
redorav committed Jul 22, 2022
1 parent 264b0c9 commit f8ebd87
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 24 deletions.
18 changes: 9 additions & 9 deletions include/hlsl++_dependent.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ namespace hlslpp
float3::float3(const uint3& i) hlslpp_noexcept : vec(_hlslpp_cvtepu32_ps(i.vec)) {}
float4::float4(const uint4& i) hlslpp_noexcept : vec(_hlslpp_cvtepu32_ps(i.vec)) {}

int1::int1(const float1& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epi32(f.vec)) {}
int2::int2(const float2& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epi32(f.vec)) {}
int3::int3(const float3& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epi32(f.vec)) {}
int4::int4(const float4& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epi32(f.vec)) {}

uint1::uint1(const float1& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epu32(f.vec)) {}
uint2::uint2(const float2& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epu32(f.vec)) {}
uint3::uint3(const float3& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epu32(f.vec)) {}
uint4::uint4(const float4& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epu32(f.vec)) {}
int1::int1(const float1& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epi32(f.vec)) {}
int2::int2(const float2& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epi32(f.vec)) {}
int3::int3(const float3& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epi32(f.vec)) {}
int4::int4(const float4& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epi32(f.vec)) {}

uint1::uint1(const float1& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epu32(f.vec)) {}
uint2::uint2(const float2& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epu32(f.vec)) {}
uint3::uint3(const float3& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epu32(f.vec)) {}
uint4::uint4(const float4& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epu32(f.vec)) {}

hlslpp_inline float1x1::float1x1(const float2x2& m) hlslpp_noexcept
{
Expand Down
2 changes: 1 addition & 1 deletion include/hlsl++_vector_float.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ namespace hlslpp
x = _hlslpp_max_ps(x, exp2_minus127);

// ipart = int(x - 0.5)
ipart = _hlslpp_cvtps_epi32(_hlslpp_sub_ps(x, f4_05));
ipart = _hlslpp_cvttps_epi32(_hlslpp_sub_ps(x, f4_05));

// fpart = x - ipart
fpart = _hlslpp_sub_ps(x, _hlslpp_cvtepi32_ps(ipart));
Expand Down
4 changes: 2 additions & 2 deletions include/platforms/hlsl++_360.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ hlslpp_inline void _hlslpp_load4x4_ps(float* p, n128& x0, n128& x1, n128& x2, n1
#define _hlslpp_castps_si128(x) ((x))
#define _hlslpp_castsi128_ps(x) ((x))

#define _hlslpp_cvtps_epi32(x) __vcfpsxws((x), 0)
#define _hlslpp_cvttps_epi32(x) __vcfpsxws((x), 0)
#define _hlslpp_cvtepi32_ps(x) __vcsxwfp((x), 0)

#define _hlslpp_slli_epi32(x, y) __vslw((x), __vset1(y))
Expand Down Expand Up @@ -379,7 +379,7 @@ hlslpp_inline void _hlslpp_load4_epi32(int32_t* p, n128i& x)
#define _hlslpp_clamp_epu32(x, minx, maxx) __vmaxuw(__vminuw((x), (maxx)), (minx))
#define _hlslpp_sat_epu32(x) __vmaxuw(__vminuw((x), i4_1), i4_0)

#define _hlslpp_cvtps_epu32(x) __vcfpuxws((x), 0)
#define _hlslpp_cvttps_epu32(x) __vcfpuxws((x), 0)
#define _hlslpp_cvtepu32_ps(x) __vcuxwfp((x), 0)

#define _hlslpp_slli_epu32(x, y) _hlslpp_slli_epi32((x), (y))
Expand Down
10 changes: 5 additions & 5 deletions include/platforms/hlsl++_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,14 +281,14 @@ hlslpp_inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)

#endif

hlslpp_inline int32x4_t vdivq_s32(int32x4_t x, int32x4_t y)
inline int32x4_t vdivq_s32(int32x4_t x, int32x4_t y)
{
return (int32x4_t)vdivq_f32((float32x4_t)x, (float32x4_t)y);
return vcvtq_s32_f32(vdivq_f32(vcvtq_f32_s32(x), vcvtq_f32_s32(y)));
}

hlslpp_inline uint32x4_t vdivq_u32(uint32x4_t x, uint32x4_t y)
{
return (uint32x4_t)vdivq_f32((float32x4_t)x, (float32x4_t)y);
return vcvtq_u32_f32(vdivq_f32(vcvtq_f32_u32(x), vcvtq_f32_u32(y)));
}

hlslpp_inline float32x4_t vrcpq_f32(float32x4_t x)
Expand Down Expand Up @@ -531,7 +531,7 @@ hlslpp_inline void _hlslpp_load4x4_ps(float* p, n128& x0, n128& x1, n128& x2, n1
#define _hlslpp_castps_si128(x) vreinterpretq_s32_f32((x))
#define _hlslpp_castsi128_ps(x) vreinterpretq_f32_s32((x))

#define _hlslpp_cvtps_epi32(x) vcvtq_s32_f32((x))
#define _hlslpp_cvttps_epi32(x) vcvtq_s32_f32((x))
#define _hlslpp_cvtepi32_ps(x) vcvtq_f32_s32((x))

#define _hlslpp_slli_epi32(x, y) vshlq_n_s32((x), (y))
Expand Down Expand Up @@ -685,7 +685,7 @@ hlslpp_inline void _hlslpp_load4_epi32(int32_t* p, n128i& x)
#define _hlslpp_clamp_epu32(x, minx, maxx) vmaxq_u32(vminq_u32((x), (maxx)), (minx))
#define _hlslpp_sat_epu32(x) vmaxq_u32(vminq_u32((x), i4_1), i4_0)

#define _hlslpp_cvtps_epu32(x) vcvtq_u32_f32((x))
#define _hlslpp_cvttps_epu32(x) vcvtq_u32_f32((x))
#define _hlslpp_cvtepu32_ps(x) vcvtq_f32_u32((x))

#define _hlslpp_slli_epu32(x, y) vshlq_n_u32((x), (y))
Expand Down
4 changes: 2 additions & 2 deletions include/platforms/hlsl++_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ namespace hlslpp
return vector_float4((float)(v.x), (float)(v.y), (float)(v.z), (float)(v.w));
}

hlslpp_inline vector_int4 _hlslpp_cvtps_epi32(const vector_float4& v)
hlslpp_inline vector_int4 _hlslpp_cvttps_epi32(const vector_float4& v)
{
return vector_int4((int32_t)(v.x), (int32_t)(v.y), (int32_t)(v.z), (int32_t)(v.w));
}
Expand Down Expand Up @@ -844,7 +844,7 @@ namespace hlslpp
return vector_float4((float)(v.x), (float)(v.y), (float)(v.z), (float)(v.w));
}

hlslpp_inline vector_uint4 _hlslpp_cvtps_epu32(const vector_float4& v)
hlslpp_inline vector_uint4 _hlslpp_cvttps_epu32(const vector_float4& v)
{
return vector_uint4((uint32_t)(v.x), (uint32_t)(v.y), (uint32_t)(v.z), (uint32_t)(v.w));
}
Expand Down
8 changes: 4 additions & 4 deletions include/platforms/hlsl++_sse.h
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ hlslpp_inline n128i _hlslpp_mul_epi32(n128i x, n128i y)

#endif

#define _hlslpp_div_epi32(x, y) _mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(x), _mm_cvtepi32_ps(y)))
#define _hlslpp_div_epi32(x, y) _mm_cvttps_epi32(_mm_div_ps(_mm_cvtepi32_ps(x), _mm_cvtepi32_ps(y)))

#if defined(__SSSE3__)
#define _hlslpp_neg_epi32(x) _mm_sign_epi32((x), _mm_set1_epi32(-1))
Expand Down Expand Up @@ -670,7 +670,7 @@ hlslpp_inline n128i _hlslpp_blend_epi32(n128i x, n128i y, int mask)
#define _hlslpp_castsi128_ps(x) _mm_castsi128_ps((x))

#define _hlslpp_cvtepi32_ps(x) _mm_cvtepi32_ps((x))
#define _hlslpp_cvtps_epi32(x) _mm_cvtps_epi32((x))
#define _hlslpp_cvttps_epi32(x) _mm_cvttps_epi32((x))

#define _hlslpp_slli_epi32(x, y) _mm_slli_epi32((x), (y))
#define _hlslpp_srli_epi32(x, y) _mm_srli_epi32((x), (y))
Expand Down Expand Up @@ -931,7 +931,7 @@ hlslpp_inline n256i _hlslpp256_or_si128(n256i x, n256i y)
#define _hlslpp256_castsi256_ps(x) _mm256_castsi256_ps((x))

#define _hlslpp256_cvtepi32_ps(x) _mm256_cvtepi32_ps((x))
#define _hlslpp256_cvtps_epi32(x) _mm256_cvtps_epi32((x))
#define _hlslpp256_cvtps_epi32(x) _mm256_cvttps_epi32((x))

#if defined(__AVX2__)

Expand Down Expand Up @@ -1019,7 +1019,7 @@ hlslpp_inline n128i _hlslpp_min_epu32(n128u x, n128u y)
#define _hlslpp_clamp_epu32(x, minx, maxx) _hlslpp_max_epu32(_hlslpp_min_epu32((x), (maxx)), (minx))
#define _hlslpp_sat_epu32(x) _hlslpp_max_epu32(_hlslpp_min_epu32((x), i4_1), i4_0)

#define _hlslpp_cvtps_epu32(x) _hlslpp_cvtps_epi32((x))
#define _hlslpp_cvttps_epu32(x) _hlslpp_cvttps_epi32((x))
#define _hlslpp_cvtepu32_ps(x) _hlslpp_cvtepi32_ps((x))

#define _hlslpp_slli_epu32(x, y) _hlslpp_slli_epi32((x), (y))
Expand Down
2 changes: 1 addition & 1 deletion premake-xbox360/xbox360.lua
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ if not p.modules.xbox360 then
p.modules.xbox360 = {}

if _ACTION < "vs2015" then
configuration { "xbox360" }
filter { "configurations:xbox360" }
system "xbox360"
end

Expand Down
12 changes: 12 additions & 0 deletions src/hlsl++_unit_tests_vector_int.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -534,4 +534,16 @@ void RunUnitTestsVectorInt()
vfoo2 = float2(ivfoo2); eq(vfoo2, (float)ivfoo2.x, (float)ivfoo2.y);
vfoo3 = float3(ivfoo3); eq(vfoo3, (float)ivfoo3.x, (float)ivfoo3.y, (float)ivfoo3.z);
vfoo4 = float4(ivfoo4); eq(vfoo4, (float)ivfoo4.x, (float)ivfoo4.y, (float)ivfoo4.z, (float)ivfoo4.w);

// Conversion

vfoo1 = float1(f1); eq(vfoo1, f1);
vfoo2 = float2(f2, f3); eq(vfoo2, f2, f3);
vfoo3 = float3(f4, f5, f6); eq(vfoo3, f4, f5, f6);
vfoo4 = float4(f7, f8, f9, f10); eq(vfoo4, f7, f8, f9, f10);

int1 icfoo1 = int1( vfoo1); eq(icfoo1, (int32_t)vfoo1.x);
int2 icfoo2 = int2(-vfoo2); eq(icfoo2, (int32_t)-vfoo2.x, (int32_t)-vfoo2.y);
int3 icfoo3 = int3( vfoo3); eq(icfoo3, (int32_t)vfoo3.x, (int32_t)vfoo3.y, (int32_t)vfoo3.z);
int4 icfoo4 = int4(-vfoo4); eq(icfoo4, (int32_t)-vfoo4.x, (int32_t)-vfoo4.y, (int32_t)-vfoo4.z, (int32_t)-vfoo4.w);
}

0 comments on commit f8ebd87

Please sign in to comment.