SIMD sqrt optimizations including lowp

2016-05-29 02:57:53 +02:00
parent 449c7ccedf
commit 20cf68679c
6 changed files with 68 additions and 55 deletions
--- a/glm/detail/func_exponential.inl
+++ b/glm/detail/func_exponential.inl
@@ -29,6 +29,15 @@ namespace detail
 		}
 	};

+	/// Generic square-root dispatcher: hands std::sqrt and the input vector to
+	/// detail::functor1<T, T, P, vecType>::call and returns whatever it produces
+	/// (presumably std::sqrt applied to each component -- confirm against functor1).
+	/// Written as a struct with a static call() so it can be specialized per
+	/// vector type / precision.
+	template <template <class, precision> class vecType, typename T, precision P>
+	struct compute_sqrt
+	{
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x)
+		{
+			typedef detail::functor1<T, T, P, vecType> functor_type;
+			return functor_type::call(std::sqrt, x);
+		}
+	};
+
 	template <template <class, precision> class vecType, typename T, precision P>
 	struct compute_inversesqrt
 	{
@@ -113,7 +122,7 @@ namespace detail
 	GLM_FUNC_QUALIFIER vecType<T, P> sqrt(vecType<T, P> const & x)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<T>::is_iec559, "'sqrt' only accept floating-point inputs");
-		return detail::functor1<T, T, P, vecType>::call(sqrt, x);
+		return detail::compute_sqrt<vecType, T, P>::call(x);
 	}

 	// inversesqrt
--- a/glm/detail/func_exponential_simd.inl
+++ b/glm/detail/func_exponential_simd.inl
@@ -1,9 +1,35 @@
 /// @ref core
 /// @file glm/detail/func_exponential_simd.inl

+#include "../simd/exponential.h"
+
+#if GLM_ARCH & GLM_ARCH_SSE2_BIT
+
 namespace glm{
 namespace detail
 {
+	/// Specialization of compute_sqrt for tvec4<float, P>: the square root is
+	/// computed with one packed _mm_sqrt_ps on the vector's `data` member.
+	template <precision P>
+	struct compute_sqrt<tvec4, float, P>
+	{
+		GLM_FUNC_QUALIFIER static tvec4<float, P> call(tvec4<float, P> const & v)
+		{
+			__m128 const Sqrt0 = _mm_sqrt_ps(v.data);
+
+			// Built with the `uninitialize` tag (presumably skips component
+			// initialization); `data` is fully overwritten on the next line.
+			tvec4<float, P> Result(uninitialize);
+			Result.data = Sqrt0;
+			return Result;
+		}
+	};

+	/// Full specialization of compute_sqrt for tvec4<float, lowp>: delegates to
+	/// glm_f32v4_sqrt_lowp instead of _mm_sqrt_ps.
+	template <>
+	struct compute_sqrt<tvec4, float, lowp>
+	{
+		GLM_FUNC_QUALIFIER static tvec4<float, lowp> call(tvec4<float, lowp> const & v)
+		{
+			__m128 const Sqrt0 = glm_f32v4_sqrt_lowp(v.data);
+
+			// Built with the `uninitialize` tag (presumably skips component
+			// initialization); `data` is fully overwritten on the next line.
+			tvec4<float, lowp> Result(uninitialize);
+			Result.data = Sqrt0;
+			return Result;
+		}
+	};
 }//namespace detail
 }//namespace glm
+
+#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
--- a/glm/detail/func_geometric_simd.inl
+++ b/glm/detail/func_geometric_simd.inl
@@ -1,3 +1,6 @@
+/// @ref core
+/// @file glm/detail/func_geometric_simd.inl
+
 #include "../simd/geometric.h"

 #if GLM_ARCH & GLM_ARCH_SSE2_BIT
--- a/glm/simd/common.h
+++ b/glm/simd/common.h
@@ -178,30 +178,4 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_inf(__m128 x)
 	return _mm_castsi128_ps(_mm_cmpeq_epi32(t2, _mm_set1_epi32(0xFF000000)));		// exponent is all 1s, fraction is 0
 }

-// SSE scalar reciprocal sqrt using rsqrt op, plus one Newton-Rhaphson iteration
-// By Elan Ruskin, http://assemblyrequired.crashworks.org/
-GLM_FUNC_QUALIFIER __m128 glm_f32v1_sqrt_wip(__m128 x)
-{
-	__m128 const Rcp0 = _mm_rsqrt_ss(x);  // "estimate" opcode
-	__m128 const Mul0 = _mm_mul_ss(_mm_set1_ps(0.5f), Rcp0);
-	__m128 const Mul1 = _mm_mul_ss(Rcp0, Rcp0);
-	__m128 const Mul2 = _mm_mul_ss(x, Mul1);
-	__m128 const Sub0 = _mm_sub_ss(_mm_set1_ps(3.0f), Mul2);
-	__m128 const Mul3 = _mm_mul_ss(Mul0, Sub0);
-	return Mul3;
-}
-
-// SSE scalar reciprocal sqrt using rsqrt op, plus one Newton-Rhaphson iteration
-// By Elan Ruskin, http://assemblyrequired.crashworks.org/
-GLM_FUNC_QUALIFIER __m128 glm_f32v4_sqrt_wip(__m128 x)
-{
-	__m128 const Rcp0 = _mm_rsqrt_ps(x);  // "estimate" opcode
-	__m128 const Mul0 = _mm_mul_ps(_mm_set1_ps(0.5f), Rcp0);
-	__m128 const Mul1 = _mm_mul_ps(Mul0, Mul0);
-	__m128 const Mul2 = _mm_mul_ps(x, Mul1);
-	__m128 const Sub0 = _mm_sub_ps(_mm_set1_ps(3.0f), Mul2);
-	__m128 const Mul3 = _mm_mul_ps(Mul0, Sub0);
-	return Mul3;
-}
-
 #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
--- a/glm/simd/exponential.h
+++ b/glm/simd/exponential.h
@@ -3,3 +3,20 @@

 #pragma once

+#if GLM_ARCH & GLM_ARCH_SSE2_BIT
+
+// Low-precision scalar square root estimate: multiplies the SSE reciprocal
+// square root estimate (rsqrt) by x, i.e. sqrt(x) ~= x * (1 / sqrt(x)).
+// No Newton-Raphson refinement step is applied.
+// Only lane 0 is computed; the three upper lanes of the result are the upper
+// lanes of x.
+// An input of +/-0 in lane 0 returns +0: the unmasked estimate would be
+// inf * 0 = NaN, so lane 0 is cleared when x compares equal to zero.
+// Other special inputs are not treated and follow the unmasked estimate.
+// Based on the rsqrt approach by Elan Ruskin, http://assemblyrequired.crashworks.org/
+GLM_FUNC_QUALIFIER __m128 glm_f32v1_sqrt_lowp(__m128 x)
+{
+	__m128 const Est0 = _mm_mul_ss(_mm_rsqrt_ss(x), x);
+	// Lane 0: all ones when x != 0, all zeros otherwise.
+	// Upper lanes: copied from x by the _ss compare; Est0's upper lanes are
+	// also x's, so the AND below leaves them bit-identical.
+	__m128 const Msk0 = _mm_cmpneq_ss(x, _mm_setzero_ps());
+	return _mm_and_ps(Msk0, Est0);
+}
+
+// Low-precision packed square root estimate: for each of the four lanes,
+// multiplies the SSE reciprocal square root estimate (rsqrt) by x,
+// i.e. sqrt(x) ~= x * (1 / sqrt(x)).
+// No Newton-Raphson refinement step is applied.
+// A lane holding +/-0 returns +0: the unmasked estimate would be
+// inf * 0 = NaN, so lanes that compare equal to zero are cleared.
+// Other special inputs are not treated and follow the unmasked estimate.
+// Based on the rsqrt approach by Elan Ruskin, http://assemblyrequired.crashworks.org/
+GLM_FUNC_QUALIFIER __m128 glm_f32v4_sqrt_lowp(__m128 x)
+{
+	__m128 const Est0 = _mm_mul_ps(_mm_rsqrt_ps(x), x);
+	// Per lane: all ones when x != 0, all zeros otherwise.
+	__m128 const Msk0 = _mm_cmpneq_ps(x, _mm_setzero_ps());
+	return _mm_and_ps(Msk0, Est0);
+}
+
+#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT