SIMD matrix functions optimizations

2016-05-30 18:01:06 +02:00
parent 84caa1092f
commit 688756b3e2
5 changed files with 90 additions and 48 deletions
--- a/glm/detail/func_matrix.inl
+++ b/glm/detail/func_matrix.inl
@@ -7,6 +7,18 @@
 namespace glm{
 namespace detail
 {
+	template <template <typename, precision> class matType, typename T, precision P>
+	struct compute_matrixCompMult // generic implementation that matrixCompMult() forwards to
+	{
+		GLM_FUNC_QUALIFIER static matType<T, P> call(matType<T, P> const& x, matType<T, P> const& y)
+		{
+			matType<T, P> result(uninitialize); // built with the `uninitialize` tag (presumably skips value-init); every index below length() is assigned next
+			for(length_t i = 0; i < result.length(); ++i)
+				result[i] = x[i] * y[i]; // product of the i-th entries of x and y
+			return result;
+		}
+	};
+
 	template <template <class, precision> class matType, typename T, precision P>
 	struct compute_transpose{};

@@ -347,11 +359,7 @@ namespace detail
 	GLM_FUNC_QUALIFIER matType<T, P> matrixCompMult(matType<T, P> const & x, matType<T, P> const & y)
 	{
 		GLM_STATIC_ASSERT(std::numeric_limits<T>::is_iec559, "'matrixCompMult' only accept floating-point inputs");
-
-		matType<T, P> result(uninitialize);
-		for(length_t i = 0; i < result.length(); ++i)
-			result[i] = x[i] * y[i];
-		return result;
+		return detail::compute_matrixCompMult<matType, T, P>::call(x, y); // delegate to the helper struct so a per-type specialization can replace the generic loop
 	}

 	template<typename T, precision P, template <typename, precision> class vecTypeA, template <typename, precision> class vecTypeB>
--- a/glm/detail/func_matrix_simd.inl
+++ b/glm/detail/func_matrix_simd.inl
@@ -1,6 +1,8 @@
 /// @ref core
 /// @file glm/detail/func_matrix_simd.inl

+#if GLM_ARCH & GLM_ARCH_SSE2_BIT
+
 #include "type_mat4x4.hpp"
 #include "func_geometric.hpp"
 #include "../simd/matrix.h"
@@ -8,17 +10,77 @@
 namespace glm{
 namespace detail
 {
-#	if GLM_ARCH & GLM_ARCH_SSE2_BIT
-		template <precision P>
-		struct compute_inverse<tmat4x4, float, P>
+	template <precision P>
+	struct compute_matrixCompMult<tmat4x4, float, P> // float 4x4 specialization, compiled only under the SSE2 guard of this file
+	{
+		GLM_FUNC_QUALIFIER static tmat4x4<float, P> call(tmat4x4<float, P> const & x, tmat4x4<float, P> const & y)
 		{
-			GLM_FUNC_QUALIFIER static tmat4x4<float, P> call(tmat4x4<float, P> const& m)
-			{
-				tmat4x4<float, P> Result(uninitialize);
-				glm_mat4_inverse(*reinterpret_cast<__m128 const(*)[4]>(&m[0].data), *reinterpret_cast<__m128(*)[4]>(&Result[0].data));
-				return Result;
-			}
-		};
-#	endif
+			tmat4x4<float, P> result(uninitialize); // presumably filled entirely through the out-parameter below; confirm in glm_mat4_matrixCompMult
+			glm_mat4_matrixCompMult(
+				*reinterpret_cast<glm_vec4 const (*)[4]>(&x[0].data), // named cast (as compute_inverse uses) instead of a C-style cast
+				*reinterpret_cast<glm_vec4 const (*)[4]>(&y[0].data), // each matrix is viewed as an array of four glm_vec4; assumes contiguous entries
+				*reinterpret_cast<glm_vec4(*)[4]>(&result[0].data));
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_transpose<tmat4x4, float, P> // float 4x4 specialization, compiled only under the SSE2 guard of this file
+	{
+		GLM_FUNC_QUALIFIER static tmat4x4<float, P> call(tmat4x4<float, P> const & m)
+		{
+			tmat4x4<float, P> result(uninitialize); // presumably filled entirely through the out-parameter below; confirm in glm_mat4_transpose
+			glm_mat4_transpose(
+				*reinterpret_cast<glm_vec4 const (*)[4]>(&m[0].data), // named cast (as compute_inverse uses) instead of a C-style cast
+				*reinterpret_cast<glm_vec4(*)[4]>(&result[0].data)); // both matrices are viewed as arrays of four glm_vec4
+			return result;
+		}
+	};
+
+	template <precision P>
+	struct compute_determinant<tmat4x4, float, P> // float 4x4 specialization, compiled only under the SSE2 guard of this file
+	{
+		GLM_FUNC_QUALIFIER static float call(tmat4x4<float, P> const& m)
+		{
+			return _mm_cvtss_f32(glm_mat4_determinant(*reinterpret_cast<__m128 const(*)[4]>(&m[0].data))); // m is viewed as four __m128; the lowest lane of the helper's result is returned
+		}
+	};
+
+	template <precision P>
+	struct compute_inverse<tmat4x4, float, P> // float 4x4 specialization, compiled only under the SSE2 guard of this file
+	{
+		GLM_FUNC_QUALIFIER static tmat4x4<float, P> call(tmat4x4<float, P> const& m)
+		{
+			tmat4x4<float, P> Result(uninitialize); // presumably filled entirely by glm_mat4_inverse below; confirm in the helper
+			glm_mat4_inverse(*reinterpret_cast<__m128 const(*)[4]>(&m[0].data), *reinterpret_cast<__m128(*)[4]>(&Result[0].data)); // input first, output second; both viewed as arrays of four __m128
+			return Result;
+		}
+	};
 }//namespace detail
+
+	template<>
+	GLM_FUNC_QUALIFIER tmat4x4<float, lowp> outerProduct<float, lowp, tvec4, tvec4>(tvec4<float, lowp> const & c, tvec4<float, lowp> const & r) // explicit specialization for lowp float vec4 operands
+	{
+		tmat4x4<float, lowp> m(uninitialize);
+		glm_mat4_outerProduct(c.data, r.data, *reinterpret_cast<__m128(*)[4]>(&m[0].data)); // helper writes its result into m, viewed as an array of four __m128
+		return m;
+	}
+
+	template<>
+	GLM_FUNC_QUALIFIER tmat4x4<float, mediump> outerProduct<float, mediump, tvec4, tvec4>(tvec4<float, mediump> const & c, tvec4<float, mediump> const & r) // explicit specialization for mediump float vec4 operands
+	{
+		tmat4x4<float, mediump> m(uninitialize);
+		glm_mat4_outerProduct(c.data, r.data, *reinterpret_cast<__m128(*)[4]>(&m[0].data)); // helper writes its result into m, viewed as an array of four __m128
+		return m;
+	}
+
+	template<>
+	GLM_FUNC_QUALIFIER tmat4x4<float, highp> outerProduct<float, highp, tvec4, tvec4>(tvec4<float, highp> const & c, tvec4<float, highp> const & r) // explicit specialization for highp float vec4 operands
+	{
+		tmat4x4<float, highp> m(uninitialize);
+		glm_mat4_outerProduct(c.data, r.data, *reinterpret_cast<__m128(*)[4]>(&m[0].data)); // helper writes its result into m, viewed as an array of four __m128
+		return m;
+	}
 }//namespace glm
+
+#endif
--- a/glm/gtx/simd_mat4.inl
+++ b/glm/gtx/simd_mat4.inl
@@ -563,14 +563,14 @@ GLM_FUNC_QUALIFIER detail::fmat4x4SIMD outerProduct
 GLM_FUNC_QUALIFIER detail::fmat4x4SIMD transpose(detail::fmat4x4SIMD const & m)
 {
 	detail::fmat4x4SIMD result;
-	detail::sse_transpose_ps(&m[0].Data, &result[0].Data);
+	glm_mat4_transpose(&m[0].Data, &result[0].Data); // input pointer first, output pointer second; result is filled by the helper
 	return result;
 }

 GLM_FUNC_QUALIFIER float determinant(detail::fmat4x4SIMD const & m)
 {
 	float Result;
-	_mm_store_ss(&Result, detail::sse_det_ps(&m[0].Data));
+	_mm_store_ss(&Result, glm_mat4_determinant(&m[0].Data)); // store the lowest lane of the helper's __m128 result into Result
 	return Result;
 }

--- a/glm/simd/matrix.h
+++ b/glm/simd/matrix.h
@@ -947,7 +947,7 @@ GLM_FUNC_QUALIFIER void glm_mat4_inverse_lowp(glm_vec4 const in[4], glm_vec4 out
 	out[3] = _mm_mul_ps(Inv3, Rcp0);
 }
 /*
-GLM_FUNC_QUALIFIER void glm_f32m4_rotate(__m128 const in[4], float Angle, float const v[3], __m128 out[4])
+GLM_FUNC_QUALIFIER void glm_mat4_rotate(__m128 const in[4], float Angle, float const v[3], __m128 out[4])
 {
 	float a = glm::radians(Angle);
 	float c = cos(a);
@@ -1017,7 +1017,7 @@ GLM_FUNC_QUALIFIER void glm_f32m4_rotate(__m128 const in[4], float Angle, float
 	sse_mul_ps(in, Result, out);
 }
 */
-GLM_FUNC_QUALIFIER void glm_f32m4_outer(__m128 const & c, __m128 const & r, __m128 out[4])
+GLM_FUNC_QUALIFIER void glm_mat4_outerProduct(__m128 const & c, __m128 const & r, __m128 out[4])
 {
 	out[0] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0)));
 	out[1] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1)));