From bb47f83cf3c193fc9cd37c87b3a7795d80a49e8f Mon Sep 17 00:00:00 2001
From: Adam Johnson
Date: Wed, 11 Apr 2012 21:25:47 -0400
Subject: [PATCH 1/2] Simple SSE3 skinning algorithm

Future direction: do lots of work such that hsMatrix44 and hsScalarTriple are
16-byte aligned so that we can use faster aligned loads
---
 CMakeLists.txt                                |   6 +
 Sources/Plasma/Apps/plClient/winmain.cpp      |  28 +++++
 Sources/Plasma/CoreLib/hsUtils.h              |   6 +
 .../PubUtilLib/plPipeline/plDXPipeline.cpp    | 108 ++++++++++++------
 4 files changed, 115 insertions(+), 33 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 11abb4c3..c042e895 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,12 @@ if(MSVC)
     add_definitions(-D_SCL_SECURE_NO_WARNINGS)
 endif(MSVC)
 
+# TODO: Maybe some kind of automated test here?
+option(PLASMA_USE_SSE "Enable SSE optimizations?" ON)
+if(PLASMA_USE_SSE)
+    add_definitions(-DHAVE_SSE)
+endif(PLASMA_USE_SSE)
+
 #TODO: Make the OpenSSL includes less promiscuous so this isn't needed
 include_directories(${OPENSSL_INCLUDE_DIR})
 
diff --git a/Sources/Plasma/Apps/plClient/winmain.cpp b/Sources/Plasma/Apps/plClient/winmain.cpp
index b762daf2..cf4af644 100644
--- a/Sources/Plasma/Apps/plClient/winmain.cpp
+++ b/Sources/Plasma/Apps/plClient/winmain.cpp
@@ -49,6 +49,10 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #include // Windows Load EXE into memory suff
 #endif
 
+#ifdef HAVE_SSE
+# include <intrin.h>
+#endif
+
 #include
 #include "HeadSpin.h"
 
@@ -1441,11 +1445,35 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except
     return EXCEPTION_EXECUTE_HANDLER;
 }
 
+bool CheckCPU() // returns false when a CPU feature this build requires is missing
+{
+    const unsigned int sse3_flag = 0x00000001; // mask for bit 0 of cpu_info[2]
+    // (any other CPU features...)
+
+    int cpu_info[4]; // filled by __cpuid below
+    __cpuid(cpu_info, 1); // query CPUID function 1
+#ifdef HAVE_SSE
+    if ((cpu_info[2] & sse3_flag) == 0) // parentheses required: == binds tighter than &
+        return false;
+#endif
+    // Insert additional feature checks here
+
+    return true;
+}
+
 #include "pfConsoleCore/pfConsoleEngine.h"
 PF_CONSOLE_LINK_ALL()
 
 int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow)
 {
+    // Check to make sure we have a good CPU before getting started
+    if (!CheckCPU())
+    {
+        plString msg = plString::Format("Your processor does not support all of the features required to play %S", ProductLongName());
+        hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError);
+        return PARABLE_NORMAL_EXIT;
+    }
+
     PF_CONSOLE_INIT_ALL()
 
     // Set global handle
diff --git a/Sources/Plasma/CoreLib/hsUtils.h b/Sources/Plasma/CoreLib/hsUtils.h
index 54e952f5..9b12e6fe 100644
--- a/Sources/Plasma/CoreLib/hsUtils.h
+++ b/Sources/Plasma/CoreLib/hsUtils.h
@@ -177,6 +177,12 @@ inline float hsRadiansToDegrees(float rad) { return float(rad * (180 / M_PI)); }
 #include
 #define NEWZERO(t) new(calloc(sizeof(t), 1)) t
 
+#ifdef _MSC_VER
+# define ALIGN(n) __declspec(align(n))
+#else
+# define ALIGN(n) __attribute__((aligned(n)))
+#endif
+
 /////////////////////////////
 // Physical memory functions
 /////////////////////////////
diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
index d911f1a3..6c93dbbd 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@@ -163,6 +163,10 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 
 #include
 
+#ifdef HAVE_SSE
+# include <pmmintrin.h>
+#endif
+
 //#define MF_TOSSER
 
 int mfCurrentTest = 100;
@@ -10523,39 +10527,77 @@ void plDXPipeline::LoadResources()
 
 // Sorry about this, but it really did speed up the skinning.
 // Just some macros for the inner loop of IBlendVertsIntoBuffer.
-#define MATRIXMULTBEGIN(xfm, wgt) \
-    register float m00 = xfm.fMap[0][0]; \
-    register float m01 = xfm.fMap[0][1]; \
-    register float m02 = xfm.fMap[0][2]; \
-    register float m03 = xfm.fMap[0][3]; \
-    register float m10 = xfm.fMap[1][0]; \
-    register float m11 = xfm.fMap[1][1]; \
-    register float m12 = xfm.fMap[1][2]; \
-    register float m13 = xfm.fMap[1][3]; \
-    register float m20 = xfm.fMap[2][0]; \
-    register float m21 = xfm.fMap[2][1]; \
-    register float m22 = xfm.fMap[2][2]; \
-    register float m23 = xfm.fMap[2][3]; \
-    register float m_wgt = wgt; \
-    register float srcX, srcY, srcZ;
-
-#define MATRIXMULTPOINTADD(dst, src) \
-    srcX = src.fX; \
-    srcY = src.fY; \
-    srcZ = src.fZ; \
-    \
-    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
-    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
-    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
-
-#define MATRIXMULTVECTORADD(dst, src) \
-    srcX = src.fX; \
-    srcY = src.fY; \
-    srcZ = src.fZ; \
-    \
-    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
-    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
-    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
+#ifdef HAVE_SSE
+// SSE3 path. MATRIXMULTBEGIN loads rows 0-2 of xfm (unaligned) and splats wgt;
+// the *ADD macros accumulate wgt * (row . {src.fX, src.fY, src.fZ, w}) into dst,
+// with w = 1 for points (column 3 of xfm is added) and w = 0 for vectors.
+// All three macros share the locals declared by MATRIXMULTBEGIN, so it must be
+// expanded first, in the same scope, before either *ADD macro is used.
+# define MATRIXMULTBEGIN(xfm, wgt) \
+    __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \
+    ALIGN(16) float hack[4]; \
+    mc0 = _mm_loadu_ps((xfm).fMap[0]); \
+    mc1 = _mm_loadu_ps((xfm).fMap[1]); \
+    mc2 = _mm_loadu_ps((xfm).fMap[2]); \
+    mwt = _mm_set_ps1(wgt);
+# define MATRIXMULTPOINTADD(dst, src) \
+    msr = _mm_set_ps(1.f, (src).fZ, (src).fY, (src).fX); \
+    _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
+    _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
+    _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
+    \
+    /* hadd(_x,_y) = {x0+x1, x2+x3, y0+y1, y2+y3}; hadd(_z,_z) = {z0+z1, z2+z3, ...} */ \
+    /* so one outer hadd yields {sum(x), sum(y), sum(z), sum(z)}: a single store.    */ \
+    hbuf = _mm_hadd_ps(_mm_hadd_ps(_x, _y), _mm_hadd_ps(_z, _z)); \
+    _mm_store_ps(hack, hbuf); \
+    (dst).fX += hack[0]; \
+    (dst).fY += hack[1]; \
+    (dst).fZ += hack[2];
+# define MATRIXMULTVECTORADD(dst, src) \
+    msr = _mm_set_ps(0.f, (src).fZ, (src).fY, (src).fX); \
+    _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
+    _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
+    _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
+    \
+    /* Same hadd tree as the point variant: lanes 0..2 hold sum(x), sum(y), sum(z). */ \
+    hbuf = _mm_hadd_ps(_mm_hadd_ps(_x, _y), _mm_hadd_ps(_z, _z)); \
+    _mm_store_ps(hack, hbuf); \
+    (dst).fX += hack[0]; \
+    (dst).fY += hack[1]; \
+    (dst).fZ += hack[2];
+#else // scalar fallback when HAVE_SSE is not defined
+# define MATRIXMULTBEGIN(xfm, wgt) \
+    float m00 = (xfm).fMap[0][0]; \
+    float m01 = (xfm).fMap[0][1]; \
+    float m02 = (xfm).fMap[0][2]; \
+    float m03 = (xfm).fMap[0][3]; \
+    float m10 = (xfm).fMap[1][0]; \
+    float m11 = (xfm).fMap[1][1]; \
+    float m12 = (xfm).fMap[1][2]; \
+    float m13 = (xfm).fMap[1][3]; \
+    float m20 = (xfm).fMap[2][0]; \
+    float m21 = (xfm).fMap[2][1]; \
+    float m22 = (xfm).fMap[2][2]; \
+    float m23 = (xfm).fMap[2][3]; \
+    float m_wgt = (wgt); \
+    float srcX, srcY, srcZ;
+# define MATRIXMULTPOINTADD(dst, src) \
+    srcX = (src).fX; \
+    srcY = (src).fY; \
+    srcZ = (src).fZ; \
+    \
+    (dst).fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
+    (dst).fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
+    (dst).fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
+# define MATRIXMULTVECTORADD(dst, src) \
+    srcX = (src).fX; \
+    srcY = (src).fY; \
+    srcZ = (src).fZ; \
+    \
+    (dst).fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
+    (dst).fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
+    (dst).fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
+#endif // HAVE_SSE
 
 // inlTESTPOINT /////////////////////////////////////////
 // Update mins and maxs if destP is outside.
From 062cb15b4472ccc42e7c157be225da967e6ce2b5 Mon Sep 17 00:00:00 2001
From: Adam Johnson
Date: Wed, 11 Apr 2012 21:27:44 -0400
Subject: [PATCH 2/2] SSE3 hsMatrix44 multiplication

---
 Sources/Plasma/CoreLib/hsMatrix44.cpp         | 48 +++++++++++++++++++
 .../pnSceneObject/plCoordinateInterface.cpp   |  4 +-
 .../PubUtilLib/plDrawable/plDrawableSpans.cpp |  7 +--
 3 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/Sources/Plasma/CoreLib/hsMatrix44.cpp b/Sources/Plasma/CoreLib/hsMatrix44.cpp
index 81eba463..9c4053f3 100644
--- a/Sources/Plasma/CoreLib/hsMatrix44.cpp
+++ b/Sources/Plasma/CoreLib/hsMatrix44.cpp
@@ -47,6 +47,10 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #include "hsStream.h"
 #include
 
+#ifdef HAVE_SSE
+# include <pmmintrin.h>
+#endif
+
 static hsMatrix44 myIdent = hsMatrix44().Reset();
 const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; }
 
@@ -92,6 +96,18 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
     rotate.QuatFromMatrix44(*this);
 }
 
+#ifdef HAVE_SSE // helpers for operator*: expect locals `__m128 xmm[8]`, `b` and `c`
+# define MULTBEGIN(i) /* xmm[0] = row i of *this */ \
+    xmm[0] = _mm_loadu_ps(fMap[(i)]);
+# define MULTCELL(i, j) /* xmm[j+2] = row i of *this times column j of b, lane by lane */ \
+    xmm[1] = _mm_set_ps(b.fMap[3][(j)], b.fMap[2][(j)], b.fMap[1][(j)], b.fMap[0][(j)]); \
+    xmm[(j)+2] = _mm_mul_ps(xmm[0], xmm[1]);
+# define MULTFINISH(i) /* sum each of xmm[2..5] horizontally and store as row i of c */ \
+    xmm[6] = _mm_hadd_ps(xmm[2], xmm[3]); \
+    xmm[7] = _mm_hadd_ps(xmm[4], xmm[5]); \
+    xmm[1] = _mm_hadd_ps(xmm[6], xmm[7]); \
+    _mm_storeu_ps(c.fMap[(i)], xmm[1]);
+#endif // HAVE_SSE
 
 hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
 {
@@ -108,6 +124,37 @@ hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
     if( b.fFlags & hsMatrix44::kIsIdent )
         return *this;
 
+#ifdef HAVE_SSE
+    __m128 xmm[8]; // scratch registers shared by the MULT* macros
+
+    MULTBEGIN(0);
+    MULTCELL(0, 0);
+    MULTCELL(0, 1);
+    MULTCELL(0, 2);
+    MULTCELL(0, 3);
+    MULTFINISH(0);
+
+    MULTBEGIN(1);
+    MULTCELL(1, 0);
+    MULTCELL(1, 1);
+    MULTCELL(1, 2);
+    MULTCELL(1, 3);
+    MULTFINISH(1);
+
+    MULTBEGIN(2);
+    MULTCELL(2, 0);
+    MULTCELL(2, 1);
+    MULTCELL(2, 2);
+    MULTCELL(2, 3);
+    MULTFINISH(2);
+
+    MULTBEGIN(3);
+    MULTCELL(3, 0);
+    MULTCELL(3, 1);
+    MULTCELL(3, 2);
+    MULTCELL(3, 3);
+    MULTFINISH(3);
+#else
     c.fMap[0][0] = (fMap[0][0] * b.fMap[0][0]) + (fMap[0][1] * b.fMap[1][0]) + (fMap[0][2] * b.fMap[2][0]) + (fMap[0][3] * b.fMap[3][0]);
     c.fMap[0][1] = (fMap[0][0] * b.fMap[0][1]) + (fMap[0][1] * b.fMap[1][1]) + (fMap[0][2] * b.fMap[2][1]) + (fMap[0][3] * b.fMap[3][1]);
     c.fMap[0][2] = (fMap[0][0] * b.fMap[0][2]) + (fMap[0][1] * b.fMap[1][2]) + (fMap[0][2] * b.fMap[2][2]) + (fMap[0][3] * b.fMap[3][2]);
@@ -127,6 +174,7 @@ hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
     c.fMap[3][1] = (fMap[3][0] * b.fMap[0][1]) + (fMap[3][1] * b.fMap[1][1]) + (fMap[3][2] * b.fMap[2][1]) + (fMap[3][3] * b.fMap[3][1]);
     c.fMap[3][2] = (fMap[3][0] * b.fMap[0][2]) + (fMap[3][1] * b.fMap[1][2]) + (fMap[3][2] * b.fMap[2][2]) + (fMap[3][3] * b.fMap[3][2]);
     c.fMap[3][3] = (fMap[3][0] * b.fMap[0][3]) + (fMap[3][1] * b.fMap[1][3]) + (fMap[3][2] * b.fMap[2][3]) + (fMap[3][3] * b.fMap[3][3]);
+#endif // HAVE_SSE
 
     return c;
 }
diff --git a/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp b/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
index 643834be..1c64314d 100644
--- a/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
+++ b/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
@@ -380,6 +380,7 @@ plProfile_CreateTimer(" CIRecalcT", "Object", CIRecalcT);
 plProfile_CreateTimer(" CIDirtyT", "Object", CIDirtyT);
 plProfile_CreateTimer(" CISetT", "Object", CISetT);
 
+#ifndef HAVE_SSE // IMatrixMul34 is only called from the non-SSE branch below
 static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
 {
     hsMatrix44 ret;
@@ -440,6 +441,7 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
 
     return ret;
 }
+#endif // HAVE_SSE
 
 void plCoordinateInterface::IRecalcTransforms()
 {
@@ -447,7 +449,7 @@ void plCoordinateInterface::IRecalcTransforms()
     plProfile_BeginTiming(CIRecalcT);
     if( fParent )
     {
-#if 0
+#ifdef HAVE_SSE
         fLocalToWorld = fParent->GetLocalToWorld() * fLocalToParent;
         fWorldToLocal = fParentToLocal * fParent->GetWorldToLocal();
 #else
diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
index a3199627..3fc19b2a 100644
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
@@ -416,7 +416,7 @@ hsBool plDrawableSpans::IBoundsInvalid(const hsBounds3Ext& bnd) const
 }
 
 //// SetTransform ////////////////////////////////////////////////////////////
-
+#ifndef HAVE_SSE // IMatrixMul34 is only called from the non-SSE branch of SetTransform
 static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
 {
     hsMatrix44 ret;
@@ -477,6 +477,7 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
 
     return ret;
 }
+#endif // HAVE_SSE
 
 #ifdef MF_TEST_UPDATE
 plProfile_CreateCounter("DSSetTrans", "Update", DSSetTrans);
@@ -520,13 +521,13 @@ plDrawable& plDrawableSpans::SetTransform( uint32_t index, const hsMatrix44& l2w
 #endif // MF_TEST_UPDATE
     for( i = 0; i < spans->GetCount(); i++ )
     {
-#if 0
+#ifdef HAVE_SSE
         fLocalToWorlds[ (*spans)[ i ] ] = l2w * fLocalToBones[ (*spans)[ i ] ];
         fWorldToLocals[ (*spans)[ i ] ] = fBoneToLocals[ (*spans)[ i ] ] * w2l;
 #else
         fLocalToWorlds[ (*spans)[ i ] ] = IMatrixMul34(l2w, fLocalToBones[ (*spans)[ i ] ]);
         fWorldToLocals[ (*spans)[ i ] ] = IMatrixMul34(fBoneToLocals[ (*spans)[ i ] ], w2l);
-#endif
+#endif // HAVE_SSE
     }
 #ifdef MF_TEST_UPDATE
     plProfile_EndTiming(DSMatTransT);