From bb47f83cf3c193fc9cd37c87b3a7795d80a49e8f Mon Sep 17 00:00:00 2001
From: Adam Johnson
Date: Wed, 11 Apr 2012 21:25:47 -0400
Subject: [PATCH] Simple SSE3 skinning algorithm

Future direction: do lots of work such that hsMatrix44 and hsScalarTriple are
16-byte aligned so that we can use faster aligned loads
---
 CMakeLists.txt                                |   6 +
 Sources/Plasma/Apps/plClient/winmain.cpp      |  28 +++++
 Sources/Plasma/CoreLib/hsUtils.h              |   6 +
 .../PubUtilLib/plPipeline/plDXPipeline.cpp    | 108 ++++++++++++------
 4 files changed, 115 insertions(+), 33 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 11abb4c3..c042e895 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,12 @@ if(MSVC)
     add_definitions(-D_SCL_SECURE_NO_WARNINGS)
 endif(MSVC)
 
+# TODO: Maybe some kind of automated test here?
+option(PLASMA_USE_SSE "Enable SSE optimizations?" ON)
+if(PLASMA_USE_SSE)
+    add_definitions(-DHAVE_SSE)
+endif(PLASMA_USE_SSE)
+
 #TODO: Make the OpenSSL includes less promiscuous so this isn't needed
 include_directories(${OPENSSL_INCLUDE_DIR})
 
diff --git a/Sources/Plasma/Apps/plClient/winmain.cpp b/Sources/Plasma/Apps/plClient/winmain.cpp
index b762daf2..cf4af644 100644
--- a/Sources/Plasma/Apps/plClient/winmain.cpp
+++ b/Sources/Plasma/Apps/plClient/winmain.cpp
@@ -49,6 +49,10 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #include // Windows Load EXE into memory suff
 #endif
 
+#ifdef HAVE_SSE
+# include <intrin.h>
+#endif
+
 #include
 #include "HeadSpin.h"
 
@@ -1441,11 +1445,35 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except
     return EXCEPTION_EXECUTE_HANDLER;
 }
 
+bool CheckCPU() // Returns false when the CPU lacks a feature this build requires (SSE3 if HAVE_SSE).
+{
+    const unsigned int sse3_flag = 0x00000001; // CPUID leaf 1, ECX bit 0 = SSE3
+    // (any other CPU features...)
+
+    int cpu_info[4]; // receives { EAX, EBX, ECX, EDX }
+    __cpuid(cpu_info, 1); // leaf 1: processor feature bits
+#ifdef HAVE_SSE
+    if ((cpu_info[2] & sse3_flag) == 0) // parentheses required: == binds tighter than &
+        return false;
+#endif
+    // Insert additional feature checks here
+
+    return true;
+}
+
 #include "pfConsoleCore/pfConsoleEngine.h"
 PF_CONSOLE_LINK_ALL()
 
 int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow)
 {
+    // Check to make sure we have a good CPU before getting started
+    if (!CheckCPU())
+    {
+        plString msg = plString::Format("Your processor does not support all of the features required to play %S", ProductLongName());
+        hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError);
+        return PARABLE_NORMAL_EXIT;
+    }
+
     PF_CONSOLE_INIT_ALL()
 
     // Set global handle
diff --git a/Sources/Plasma/CoreLib/hsUtils.h b/Sources/Plasma/CoreLib/hsUtils.h
index 54e952f5..9b12e6fe 100644
--- a/Sources/Plasma/CoreLib/hsUtils.h
+++ b/Sources/Plasma/CoreLib/hsUtils.h
@@ -177,6 +177,12 @@ inline float hsRadiansToDegrees(float rad) { return float(rad * (180 / M_PI)); }
 #include
 #define NEWZERO(t) new(calloc(sizeof(t), 1)) t
 
+#ifdef _MSC_VER // ALIGN(n): prefix a declaration to request n-byte alignment
+# define ALIGN(n) __declspec(align(n))
+#else // GCC-style attribute syntax: note the double parentheses
+# define ALIGN(n) __attribute__((aligned(n)))
+#endif
+
 /////////////////////////////
 // Physical memory functions
 /////////////////////////////
diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
index d911f1a3..6c93dbbd 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@@ -163,6 +163,10 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 
 #include
 
+#ifdef HAVE_SSE
+# include <pmmintrin.h>
+#endif
+
 //#define MF_TOSSER
 
 int mfCurrentTest = 100;
@@ -10523,39 +10527,77 @@ void plDXPipeline::LoadResources()
 
 // Sorry about this, but it really did speed up the skinning.
 // Just some macros for the inner loop of IBlendVertsIntoBuffer.
-#define MATRIXMULTBEGIN(xfm, wgt) \
-    register float m00 = xfm.fMap[0][0]; \
-    register float m01 = xfm.fMap[0][1]; \
-    register float m02 = xfm.fMap[0][2]; \
-    register float m03 = xfm.fMap[0][3]; \
-    register float m10 = xfm.fMap[1][0]; \
-    register float m11 = xfm.fMap[1][1]; \
-    register float m12 = xfm.fMap[1][2]; \
-    register float m13 = xfm.fMap[1][3]; \
-    register float m20 = xfm.fMap[2][0]; \
-    register float m21 = xfm.fMap[2][1]; \
-    register float m22 = xfm.fMap[2][2]; \
-    register float m23 = xfm.fMap[2][3]; \
-    register float m_wgt = wgt; \
-    register float srcX, srcY, srcZ;
-
-#define MATRIXMULTPOINTADD(dst, src) \
-    srcX = src.fX; \
-    srcY = src.fY; \
-    srcZ = src.fZ; \
-    \
-    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
-    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
-    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
-
-#define MATRIXMULTVECTORADD(dst, src) \
-    srcX = src.fX; \
-    srcY = src.fY; \
-    srcZ = src.fZ; \
-    \
-    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
-    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
-    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
+#ifdef HAVE_SSE /* SSE path: one matrix row per __m128, reduced with horizontal adds */
+# define MATRIXMULTBEGIN(xfm, wgt) /* declares the working variables, loads rows 0-2 of xfm, splats wgt */ \
+    __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \
+    ALIGN(16) float hack[4]; /* 16-byte aligned scratch; _mm_store_ps requires an aligned address */ \
+    mc0 = _mm_loadu_ps(xfm.fMap[0]); /* unaligned loads of the three rows */ \
+    mc1 = _mm_loadu_ps(xfm.fMap[1]); \
+    mc2 = _mm_loadu_ps(xfm.fMap[2]); \
+    mwt = _mm_set_ps1(wgt); /* weight copied into all four lanes */
+# define MATRIXMULTPOINTADD(dst, src) /* dst += (rows of xfm . (src, 1)) * wgt; w = 1 keeps the fMap[r][3] term */ \
+    msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); /* lanes low to high: fX, fY, fZ, 1 */ \
+    _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); /* per-lane products for row 0, already weighted */ \
+    _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
+    _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
+    \
+    hbuf = _mm_hadd_ps(_x, _y); /* (x0+x1, x2+x3, y0+y1, y2+y3) */ \
+    hbuf = _mm_hadd_ps(hbuf, hbuf); /* (sum(_x), sum(_y), sum(_x), sum(_y)) */ \
+    _mm_store_ps(hack, hbuf); \
+    dst.fX += hack[0]; \
+    dst.fY += hack[1]; \
+    hbuf = _mm_hadd_ps(_z, _z); /* same two-step reduction for row 2 */ \
+    hbuf = _mm_hadd_ps(hbuf, hbuf); \
+    _mm_store_ps(hack, hbuf); \
+    dst.fZ += hack[0];
+# define MATRIXMULTVECTORADD(dst, src) /* as MATRIXMULTPOINTADD but with w = 0, so the fMap[r][3] term drops out */ \
+    msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); /* lanes low to high: fX, fY, fZ, 0 */ \
+    _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
+    _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
+    _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
+    \
+    hbuf = _mm_hadd_ps(_x, _y); \
+    hbuf = _mm_hadd_ps(hbuf, hbuf); \
+    _mm_store_ps(hack, hbuf); \
+    dst.fX += hack[0]; \
+    dst.fY += hack[1]; \
+    hbuf = _mm_hadd_ps(_z, _z); \
+    hbuf = _mm_hadd_ps(hbuf, hbuf); \
+    _mm_store_ps(hack, hbuf); \
+    dst.fZ += hack[0];
+#else /* scalar fallback: the pre-patch macros with the 'register' keyword removed */
+# define MATRIXMULTBEGIN(xfm, wgt) /* copies rows 0-2 of xfm and the weight into locals */ \
+    float m00 = xfm.fMap[0][0]; \
+    float m01 = xfm.fMap[0][1]; \
+    float m02 = xfm.fMap[0][2]; \
+    float m03 = xfm.fMap[0][3]; \
+    float m10 = xfm.fMap[1][0]; \
+    float m11 = xfm.fMap[1][1]; \
+    float m12 = xfm.fMap[1][2]; \
+    float m13 = xfm.fMap[1][3]; \
+    float m20 = xfm.fMap[2][0]; \
+    float m21 = xfm.fMap[2][1]; \
+    float m22 = xfm.fMap[2][2]; \
+    float m23 = xfm.fMap[2][3]; \
+    float m_wgt = wgt; \
+    float srcX, srcY, srcZ;
+# define MATRIXMULTPOINTADD(dst, src) /* dst += (row . src + column-3 term) * weight */ \
+    srcX = src.fX; \
+    srcY = src.fY; \
+    srcZ = src.fZ; \
+    \
+    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
+    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
+    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
+# define MATRIXMULTVECTORADD(dst, src) /* dst += (row . src) * weight; column 3 is not applied */ \
+    srcX = src.fX; \
+    srcY = src.fY; \
+    srcZ = src.fZ; \
+    \
+    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
+    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
+    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
+#endif // HAVE_SSE
 
 // inlTESTPOINT /////////////////////////////////////////
 // Update mins and maxs if destP is outside.