Fix problems with integer division and integer conversion from float

- The rounding mode of vcvtq_s32_f32 is round whereas vcvttq_s32_f32 (notice the extra t) is truncate. This causes issues dividing but also casting from float - In NEON, casting cannot be done by e.g. (float32x4_t) but instead needs to be explicit vcvtq_f32_s32. The division function is still technically wrong for large values but at least it's consistent now with SSE - Added new unit tests for float -> int conversions
redorav · Jul 22, 2022 · f8ebd87 · f8ebd87
1 parent 264b0c9
commit f8ebd87
Show file tree

Hide file tree

Showing 8 changed files with 36 additions and 24 deletions.
diff --git a/include/hlsl++_dependent.h b/include/hlsl++_dependent.h
@@ -18,15 +18,15 @@ namespace hlslpp
 	float3::float3(const uint3& i) hlslpp_noexcept : vec(_hlslpp_cvtepu32_ps(i.vec)) {}
 	float4::float4(const uint4& i) hlslpp_noexcept : vec(_hlslpp_cvtepu32_ps(i.vec)) {}
 
-	int1::int1(const float1& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epi32(f.vec)) {}
-	int2::int2(const float2& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epi32(f.vec)) {}
-	int3::int3(const float3& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epi32(f.vec)) {}
-	int4::int4(const float4& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epi32(f.vec)) {}
-
-	uint1::uint1(const float1& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epu32(f.vec)) {}
-	uint2::uint2(const float2& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epu32(f.vec)) {}
-	uint3::uint3(const float3& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epu32(f.vec)) {}
-	uint4::uint4(const float4& f) hlslpp_noexcept : vec(_hlslpp_cvtps_epu32(f.vec)) {}
+	int1::int1(const float1& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epi32(f.vec)) {}
+	int2::int2(const float2& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epi32(f.vec)) {}
+	int3::int3(const float3& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epi32(f.vec)) {}
+	int4::int4(const float4& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epi32(f.vec)) {}
+
+	uint1::uint1(const float1& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epu32(f.vec)) {}
+	uint2::uint2(const float2& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epu32(f.vec)) {}
+	uint3::uint3(const float3& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epu32(f.vec)) {}
+	uint4::uint4(const float4& f) hlslpp_noexcept : vec(_hlslpp_cvttps_epu32(f.vec)) {}
 
 	hlslpp_inline float1x1::float1x1(const float2x2& m) hlslpp_noexcept
 	{

diff --git a/include/hlsl++_vector_float.h b/include/hlsl++_vector_float.h
@@ -163,7 +163,7 @@ namespace hlslpp
 		x = _hlslpp_max_ps(x, exp2_minus127);
 
 		// ipart = int(x - 0.5)
-		ipart = _hlslpp_cvtps_epi32(_hlslpp_sub_ps(x, f4_05));
+		ipart = _hlslpp_cvttps_epi32(_hlslpp_sub_ps(x, f4_05));
 
 		// fpart = x - ipart
 		fpart = _hlslpp_sub_ps(x, _hlslpp_cvtepi32_ps(ipart));

diff --git a/include/platforms/hlsl++_360.h b/include/platforms/hlsl++_360.h
@@ -281,7 +281,7 @@ hlslpp_inline void _hlslpp_load4x4_ps(float* p, n128& x0, n128& x1, n128& x2, n1
 #define _hlslpp_castps_si128(x)					((x))
 #define _hlslpp_castsi128_ps(x)					((x))
 
-#define _hlslpp_cvtps_epi32(x)					__vcfpsxws((x), 0)
+#define _hlslpp_cvttps_epi32(x)					__vcfpsxws((x), 0)
 #define _hlslpp_cvtepi32_ps(x)					__vcsxwfp((x), 0)
 
 #define _hlslpp_slli_epi32(x, y)				__vslw((x), __vset1(y))
@@ -379,7 +379,7 @@ hlslpp_inline void _hlslpp_load4_epi32(int32_t* p, n128i& x)
 #define _hlslpp_clamp_epu32(x, minx, maxx)		__vmaxuw(__vminuw((x), (maxx)), (minx))
 #define _hlslpp_sat_epu32(x)					__vmaxuw(__vminuw((x), i4_1), i4_0)
 
-#define _hlslpp_cvtps_epu32(x)					__vcfpuxws((x), 0)
+#define _hlslpp_cvttps_epu32(x)					__vcfpuxws((x), 0)
 #define _hlslpp_cvtepu32_ps(x)					__vcuxwfp((x), 0)
 
 #define _hlslpp_slli_epu32(x, y)				_hlslpp_slli_epi32((x), (y))

diff --git a/include/platforms/hlsl++_neon.h b/include/platforms/hlsl++_neon.h
@@ -281,14 +281,14 @@ hlslpp_inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
 
 #endif
 
-hlslpp_inline int32x4_t vdivq_s32(int32x4_t x, int32x4_t y)
+inline int32x4_t vdivq_s32(int32x4_t x, int32x4_t y)
 {
-	return (int32x4_t)vdivq_f32((float32x4_t)x, (float32x4_t)y);
+	return vcvtq_s32_f32(vdivq_f32(vcvtq_f32_s32(x), vcvtq_f32_s32(y)));
 }
 
 hlslpp_inline uint32x4_t vdivq_u32(uint32x4_t x, uint32x4_t y)
 {
-	return (uint32x4_t)vdivq_f32((float32x4_t)x, (float32x4_t)y);
+	return vcvtq_u32_f32(vdivq_f32(vcvtq_f32_u32(x), vcvtq_f32_u32(y)));
 }
 
 hlslpp_inline float32x4_t vrcpq_f32(float32x4_t x)
@@ -531,7 +531,7 @@ hlslpp_inline void _hlslpp_load4x4_ps(float* p, n128& x0, n128& x1, n128& x2, n1
 #define _hlslpp_castps_si128(x)					vreinterpretq_s32_f32((x))
 #define _hlslpp_castsi128_ps(x)					vreinterpretq_f32_s32((x))
 
-#define _hlslpp_cvtps_epi32(x)					vcvtq_s32_f32((x))
+#define _hlslpp_cvttps_epi32(x)					vcvtq_s32_f32((x))
 #define _hlslpp_cvtepi32_ps(x)					vcvtq_f32_s32((x))
 
 #define _hlslpp_slli_epi32(x, y)				vshlq_n_s32((x), (y))
@@ -685,7 +685,7 @@ hlslpp_inline void _hlslpp_load4_epi32(int32_t* p, n128i& x)
 #define _hlslpp_clamp_epu32(x, minx, maxx)		vmaxq_u32(vminq_u32((x), (maxx)), (minx))
 #define _hlslpp_sat_epu32(x)					vmaxq_u32(vminq_u32((x), i4_1), i4_0)
 
-#define _hlslpp_cvtps_epu32(x)					vcvtq_u32_f32((x))
+#define _hlslpp_cvttps_epu32(x)					vcvtq_u32_f32((x))
 #define _hlslpp_cvtepu32_ps(x)					vcvtq_f32_u32((x))
 
 #define _hlslpp_slli_epu32(x, y)				vshlq_n_u32((x), (y))

diff --git a/include/platforms/hlsl++_scalar.h b/include/platforms/hlsl++_scalar.h
@@ -625,7 +625,7 @@ namespace hlslpp
 		return vector_float4((float)(v.x), (float)(v.y), (float)(v.z), (float)(v.w));
 	}
 
-	hlslpp_inline vector_int4 _hlslpp_cvtps_epi32(const vector_float4& v)
+	hlslpp_inline vector_int4 _hlslpp_cvttps_epi32(const vector_float4& v)
 	{
 		return vector_int4((int32_t)(v.x), (int32_t)(v.y), (int32_t)(v.z), (int32_t)(v.w));
 	}
@@ -844,7 +844,7 @@ namespace hlslpp
 		return vector_float4((float)(v.x), (float)(v.y), (float)(v.z), (float)(v.w));
 	}
 
-	hlslpp_inline vector_uint4 _hlslpp_cvtps_epu32(const vector_float4& v)
+	hlslpp_inline vector_uint4 _hlslpp_cvttps_epu32(const vector_float4& v)
 	{
 		return vector_uint4((uint32_t)(v.x), (uint32_t)(v.y), (uint32_t)(v.z), (uint32_t)(v.w));
 	}

diff --git a/include/platforms/hlsl++_sse.h b/include/platforms/hlsl++_sse.h
@@ -564,7 +564,7 @@ hlslpp_inline n128i _hlslpp_mul_epi32(n128i x, n128i y)
 
 #endif
 
-#define _hlslpp_div_epi32(x, y)					_mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(x), _mm_cvtepi32_ps(y)))
+#define _hlslpp_div_epi32(x, y)					_mm_cvttps_epi32(_mm_div_ps(_mm_cvtepi32_ps(x), _mm_cvtepi32_ps(y)))
 
 #if defined(__SSSE3__)
 #define _hlslpp_neg_epi32(x)					_mm_sign_epi32((x), _mm_set1_epi32(-1))
@@ -670,7 +670,7 @@ hlslpp_inline n128i _hlslpp_blend_epi32(n128i x, n128i y, int mask)
 #define _hlslpp_castsi128_ps(x)					_mm_castsi128_ps((x))
 
 #define _hlslpp_cvtepi32_ps(x)					_mm_cvtepi32_ps((x))
-#define _hlslpp_cvtps_epi32(x)					_mm_cvtps_epi32((x))
+#define _hlslpp_cvttps_epi32(x)					_mm_cvttps_epi32((x))
 
 #define _hlslpp_slli_epi32(x, y)				_mm_slli_epi32((x), (y))
 #define _hlslpp_srli_epi32(x, y)				_mm_srli_epi32((x), (y))
@@ -931,7 +931,7 @@ hlslpp_inline n256i _hlslpp256_or_si128(n256i x, n256i y)
 #define _hlslpp256_castsi256_ps(x)						_mm256_castsi256_ps((x))
 
 #define _hlslpp256_cvtepi32_ps(x)						_mm256_cvtepi32_ps((x))
-#define _hlslpp256_cvtps_epi32(x)						_mm256_cvtps_epi32((x))
+#define _hlslpp256_cvtps_epi32(x)						_mm256_cvttps_epi32((x))
 
 #if defined(__AVX2__)
 
@@ -1019,7 +1019,7 @@ hlslpp_inline n128i _hlslpp_min_epu32(n128u x, n128u y)
 #define _hlslpp_clamp_epu32(x, minx, maxx)		_hlslpp_max_epu32(_hlslpp_min_epu32((x), (maxx)), (minx))
 #define _hlslpp_sat_epu32(x)					_hlslpp_max_epu32(_hlslpp_min_epu32((x), i4_1), i4_0)
 
-#define _hlslpp_cvtps_epu32(x)					_hlslpp_cvtps_epi32((x))
+#define _hlslpp_cvttps_epu32(x)					_hlslpp_cvttps_epi32((x))
 #define _hlslpp_cvtepu32_ps(x)					_hlslpp_cvtepi32_ps((x))
 
 #define _hlslpp_slli_epu32(x, y)				_hlslpp_slli_epi32((x), (y))

diff --git a/premake-xbox360/xbox360.lua b/premake-xbox360/xbox360.lua
@@ -14,7 +14,7 @@ if not p.modules.xbox360 then
 	p.modules.xbox360 = {}
 
 	if _ACTION < "vs2015" then
-		configuration { "xbox360" }
+		filter { "configurations:xbox360" }
 			system "xbox360"
 	end
 

diff --git a/src/hlsl++_unit_tests_vector_int.cpp b/src/hlsl++_unit_tests_vector_int.cpp
@@ -534,4 +534,16 @@ void RunUnitTestsVectorInt()
 	vfoo2 = float2(ivfoo2);						eq(vfoo2, (float)ivfoo2.x, (float)ivfoo2.y);
 	vfoo3 = float3(ivfoo3);						eq(vfoo3, (float)ivfoo3.x, (float)ivfoo3.y, (float)ivfoo3.z);
 	vfoo4 = float4(ivfoo4);						eq(vfoo4, (float)ivfoo4.x, (float)ivfoo4.y, (float)ivfoo4.z, (float)ivfoo4.w);
+
+	// Conversion
+
+	vfoo1 = float1(f1);							eq(vfoo1, f1);
+	vfoo2 = float2(f2, f3);						eq(vfoo2, f2, f3);
+	vfoo3 = float3(f4, f5, f6);					eq(vfoo3, f4, f5, f6);
+	vfoo4 = float4(f7, f8, f9, f10);			eq(vfoo4, f7, f8, f9, f10);
+
+	int1 icfoo1 = int1( vfoo1);                 eq(icfoo1, (int32_t)vfoo1.x);
+	int2 icfoo2 = int2(-vfoo2);                 eq(icfoo2, (int32_t)-vfoo2.x, (int32_t)-vfoo2.y);
+	int3 icfoo3 = int3( vfoo3);                 eq(icfoo3, (int32_t)vfoo3.x, (int32_t)vfoo3.y, (int32_t)vfoo3.z);
+	int4 icfoo4 = int4(-vfoo4);                 eq(icfoo4, (int32_t)-vfoo4.x, (int32_t)-vfoo4.y, (int32_t)-vfoo4.z, (int32_t)-vfoo4.w);
 }