From 304d15acfee677d4464b33abb7e59854bfb081c1 Mon Sep 17 00:00:00 2001 From: Joseph Davies Date: Sun, 22 Apr 2012 14:51:39 -0700 Subject: [PATCH] Update hsMatrix44 to use hsCpuID dispatcher. --- Sources/Plasma/CoreLib/hsMatrix44.cpp | 85 ++++++++++++++++----------- Sources/Plasma/CoreLib/hsMatrix44.h | 11 +++- 2 files changed, 61 insertions(+), 35 deletions(-) diff --git a/Sources/Plasma/CoreLib/hsMatrix44.cpp b/Sources/Plasma/CoreLib/hsMatrix44.cpp index 9c4053f3..75bb0883 100644 --- a/Sources/Plasma/CoreLib/hsMatrix44.cpp +++ b/Sources/Plasma/CoreLib/hsMatrix44.cpp @@ -47,13 +47,16 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com #include "hsStream.h" #include -#ifdef HAVE_SSE -# include +#ifdef HS_SIMD_INCLUDE +# include HS_SIMD_INCLUDE #endif static hsMatrix44 myIdent = hsMatrix44().Reset(); const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; } +// CPU-optimized functions requiring dispatch +hsFunctionDispatcher hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3); + /* For the rotation: ¦ 2 2 ¦ @@ -96,9 +99,47 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const rotate.QuatFromMatrix44(*this); } -#ifdef HAVE_SSE +hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b) +{ + hsMatrix44 c; + + if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent ) + { + c.Reset(); + return c; + } + + if( a.fFlags & hsMatrix44::kIsIdent ) + return b; + if( b.fFlags & hsMatrix44::kIsIdent ) + return a; + + c.fMap[0][0] = (a.fMap[0][0] * b.fMap[0][0]) + (a.fMap[0][1] * b.fMap[1][0]) + (a.fMap[0][2] * b.fMap[2][0]) + (a.fMap[0][3] * b.fMap[3][0]); + c.fMap[0][1] = (a.fMap[0][0] * b.fMap[0][1]) + (a.fMap[0][1] * b.fMap[1][1]) + (a.fMap[0][2] * b.fMap[2][1]) + (a.fMap[0][3] * b.fMap[3][1]); + c.fMap[0][2] = (a.fMap[0][0] * b.fMap[0][2]) + (a.fMap[0][1] * b.fMap[1][2]) + (a.fMap[0][2] * b.fMap[2][2]) + (a.fMap[0][3] * b.fMap[3][2]); + c.fMap[0][3] = (a.fMap[0][0] * b.fMap[0][3]) + (a.fMap[0][1] * b.fMap[1][3]) + (a.fMap[0][2] * b.fMap[2][3]) + (a.fMap[0][3] * b.fMap[3][3]); + + c.fMap[1][0] = (a.fMap[1][0] * b.fMap[0][0]) + (a.fMap[1][1] * b.fMap[1][0]) + (a.fMap[1][2] * b.fMap[2][0]) + (a.fMap[1][3] * b.fMap[3][0]); + c.fMap[1][1] = (a.fMap[1][0] * b.fMap[0][1]) + (a.fMap[1][1] * b.fMap[1][1]) + (a.fMap[1][2] * b.fMap[2][1]) + (a.fMap[1][3] * b.fMap[3][1]); + c.fMap[1][2] = (a.fMap[1][0] * b.fMap[0][2]) + (a.fMap[1][1] * b.fMap[1][2]) + (a.fMap[1][2] * b.fMap[2][2]) + (a.fMap[1][3] * b.fMap[3][2]); + c.fMap[1][3] = (a.fMap[1][0] * b.fMap[0][3]) + (a.fMap[1][1] * b.fMap[1][3]) + (a.fMap[1][2] * b.fMap[2][3]) + (a.fMap[1][3] * b.fMap[3][3]); + + c.fMap[2][0] = (a.fMap[2][0] * b.fMap[0][0]) + (a.fMap[2][1] * b.fMap[1][0]) + (a.fMap[2][2] * b.fMap[2][0]) + (a.fMap[2][3] * b.fMap[3][0]); + c.fMap[2][1] = (a.fMap[2][0] * b.fMap[0][1]) + (a.fMap[2][1] * b.fMap[1][1]) + (a.fMap[2][2] * b.fMap[2][1]) + (a.fMap[2][3] * b.fMap[3][1]); + c.fMap[2][2] = (a.fMap[2][0] * b.fMap[0][2]) + (a.fMap[2][1] * b.fMap[1][2]) + (a.fMap[2][2] * b.fMap[2][2]) + (a.fMap[2][3] * b.fMap[3][2]); + c.fMap[2][3] = (a.fMap[2][0] * b.fMap[0][3]) + (a.fMap[2][1] * b.fMap[1][3]) + (a.fMap[2][2] * b.fMap[2][3]) + (a.fMap[2][3] * b.fMap[3][3]); + + c.fMap[3][0] = (a.fMap[3][0] * b.fMap[0][0]) + (a.fMap[3][1] * b.fMap[1][0]) + (a.fMap[3][2] * b.fMap[2][0]) + (a.fMap[3][3] * b.fMap[3][0]); + c.fMap[3][1] = (a.fMap[3][0] * b.fMap[0][1]) + (a.fMap[3][1] * b.fMap[1][1]) + (a.fMap[3][2] * b.fMap[2][1]) + (a.fMap[3][3] * b.fMap[3][1]); + c.fMap[3][2] = (a.fMap[3][0] * b.fMap[0][2]) + (a.fMap[3][1] * b.fMap[1][2]) + (a.fMap[3][2] * b.fMap[2][2]) + (a.fMap[3][3] * b.fMap[3][2]); + c.fMap[3][3] = (a.fMap[3][0] * b.fMap[0][3]) + (a.fMap[3][1] * b.fMap[1][3]) + (a.fMap[3][2] * b.fMap[2][3]) + (a.fMap[3][3] * b.fMap[3][3]); + + return c; +} + +#ifdef HS_SSE3 # define MULTBEGIN(i) \ - xmm[0] = _mm_loadu_ps(fMap[i]); + xmm[0] = _mm_loadu_ps(a.fMap[i]); # define MULTCELL(i, j) \ xmm[1] = _mm_set_ps(b.fMap[3][j], b.fMap[2][j], b.fMap[1][j], b.fMap[0][j]); \ xmm[j+2] = _mm_mul_ps(xmm[0], xmm[1]); @@ -107,24 +148,23 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const xmm[7] = _mm_hadd_ps(xmm[4], xmm[5]); \ xmm[1] = _mm_hadd_ps(xmm[6], xmm[7]); \ _mm_storeu_ps(c.fMap[i], xmm[1]); -#endif +#endif // HS_SSE3 -hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const +hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b) { hsMatrix44 c; - - if( fFlags & b.fFlags & hsMatrix44::kIsIdent ) +#ifdef HS_SSE3 + if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent ) { c.Reset(); return c; } - if( fFlags & hsMatrix44::kIsIdent ) + if( a.fFlags & hsMatrix44::kIsIdent ) return b; if( b.fFlags & hsMatrix44::kIsIdent ) - return *this; + return a; -#ifdef HAVE_SSE __m128 xmm[8]; MULTBEGIN(0); @@ -154,28 +194,7 @@ hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const MULTCELL(3, 2); MULTCELL(3, 3); MULTFINISH(3); -#else - c.fMap[0][0] = (fMap[0][0] * b.fMap[0][0]) + (fMap[0][1] * b.fMap[1][0]) + (fMap[0][2] * b.fMap[2][0]) + (fMap[0][3] * b.fMap[3][0]); - c.fMap[0][1] = (fMap[0][0] * b.fMap[0][1]) + (fMap[0][1] * b.fMap[1][1]) + (fMap[0][2] * b.fMap[2][1]) + (fMap[0][3] * b.fMap[3][1]); - c.fMap[0][2] = (fMap[0][0] * b.fMap[0][2]) + (fMap[0][1] * b.fMap[1][2]) + (fMap[0][2] * b.fMap[2][2]) + (fMap[0][3] * b.fMap[3][2]); - c.fMap[0][3] = (fMap[0][0] * b.fMap[0][3]) + (fMap[0][1] * b.fMap[1][3]) + (fMap[0][2] * b.fMap[2][3]) + (fMap[0][3] * b.fMap[3][3]); - - c.fMap[1][0] = (fMap[1][0] * b.fMap[0][0]) + (fMap[1][1] * b.fMap[1][0]) + (fMap[1][2] * b.fMap[2][0]) + (fMap[1][3] * b.fMap[3][0]); - c.fMap[1][1] = (fMap[1][0] * b.fMap[0][1]) + (fMap[1][1] * b.fMap[1][1]) + (fMap[1][2] * b.fMap[2][1]) + (fMap[1][3] * b.fMap[3][1]); - c.fMap[1][2] = (fMap[1][0] * b.fMap[0][2]) + (fMap[1][1] * b.fMap[1][2]) + (fMap[1][2] * b.fMap[2][2]) + (fMap[1][3] * b.fMap[3][2]); - c.fMap[1][3] = (fMap[1][0] * b.fMap[0][3]) + (fMap[1][1] * b.fMap[1][3]) + (fMap[1][2] * b.fMap[2][3]) + (fMap[1][3] * b.fMap[3][3]); - - c.fMap[2][0] = (fMap[2][0] * b.fMap[0][0]) + (fMap[2][1] * b.fMap[1][0]) + (fMap[2][2] * b.fMap[2][0]) + (fMap[2][3] * b.fMap[3][0]); - c.fMap[2][1] = (fMap[2][0] * b.fMap[0][1]) + (fMap[2][1] * b.fMap[1][1]) + (fMap[2][2] * b.fMap[2][1]) + (fMap[2][3] * b.fMap[3][1]); - c.fMap[2][2] = (fMap[2][0] * b.fMap[0][2]) + (fMap[2][1] * b.fMap[1][2]) + (fMap[2][2] * b.fMap[2][2]) + (fMap[2][3] * b.fMap[3][2]); - c.fMap[2][3] = (fMap[2][0] * b.fMap[0][3]) + (fMap[2][1] * b.fMap[1][3]) + (fMap[2][2] * b.fMap[2][3]) + (fMap[2][3] * b.fMap[3][3]); - - c.fMap[3][0] = (fMap[3][0] * b.fMap[0][0]) + (fMap[3][1] * b.fMap[1][0]) + (fMap[3][2] * b.fMap[2][0]) + (fMap[3][3] * b.fMap[3][0]); - c.fMap[3][1] = (fMap[3][0] * b.fMap[0][1]) + (fMap[3][1] * b.fMap[1][1]) + (fMap[3][2] * b.fMap[2][1]) + (fMap[3][3] * b.fMap[3][1]); - c.fMap[3][2] = (fMap[3][0] * b.fMap[0][2]) + (fMap[3][1] * b.fMap[1][2]) + (fMap[3][2] * b.fMap[2][2]) + (fMap[3][3] * b.fMap[3][2]); - c.fMap[3][3] = (fMap[3][0] * b.fMap[0][3]) + (fMap[3][1] * b.fMap[1][3]) + (fMap[3][2] * b.fMap[2][3]) + (fMap[3][3] * b.fMap[3][3]); -#endif - +#endif // HS_SSE3 return c; } diff --git a/Sources/Plasma/CoreLib/hsMatrix44.h b/Sources/Plasma/CoreLib/hsMatrix44.h index 7804631d..c0a5ed21 100644 --- a/Sources/Plasma/CoreLib/hsMatrix44.h +++ b/Sources/Plasma/CoreLib/hsMatrix44.h @@ -44,6 +44,7 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com #include "HeadSpin.h" #include "hsGeometry3.h" +#include "hsCpuID.h" class hsQuat; @@ -104,7 +105,7 @@ struct hsMatrix44 { const hsVector3* up); hsBool GetParity() const; - float GetDeterminant() const; + float GetDeterminant() const; hsMatrix44* GetInverse(hsMatrix44* inverse) const; hsMatrix44* GetTranspose(hsMatrix44* inverse) const; hsMatrix44* GetAdjoint(hsMatrix44* adjoint) const; @@ -140,7 +141,7 @@ struct hsMatrix44 { return rVal; } hsVector3 operator*(const hsVector3& p) const; - hsMatrix44 operator*(const hsMatrix44& b) const; + hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); } hsPoint3* MapPoints(long count, hsPoint3 points[]) const; @@ -152,6 +153,12 @@ struct hsMatrix44 { void Read(hsStream *stream); void Write(hsStream *stream); + + // CPU-optimized functions + typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&); + static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&); + static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&); + static hsFunctionDispatcher mat_mult; }; ////////////////////////////////////////////////////////////////////////////