Simple SSE3 skinning algorithm

Future direction: do lots of work such that hsMatrix44 and hsScalarTriple are 16-byte aligned so that we can use faster aligned loads
13 years ago · bb47f83cf3
4 changed files with 115 additions and 33 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -84,6 +84,12 @@ if(MSVC)
    add_definitions(-D_SCL_SECURE_NO_WARNINGS)
 endif(MSVC)
 # TODO: Maybe some kind of automated test here?
 option(PLASMA_USE_SSE "Enable SSE optimizations?" ON)
 if(PLASMA_USE_SSE)
    add_definitions(-DHAVE_SSE)
 endif(PLASMA_USE_SSE)
 #TODO: Make the OpenSSL includes less promiscuous so this isn't needed
 include_directories(${OPENSSL_INCLUDE_DIR})
--- a/Sources/Plasma/Apps/plClient/winmain.cpp
+++ b/Sources/Plasma/Apps/plClient/winmain.cpp
@ -49,6 +49,10 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
    #include <dmdfm.h>      // Windows Load EXE into memory stuff
 #endif
 #ifdef HAVE_SSE
 #   include <intrin.h>
 #endif
 #include <curl/curl.h>
 #include "HeadSpin.h"
@ -1441,11 +1445,35 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except
    return EXCEPTION_EXECUTE_HANDLER;
 }
 // Verifies that the host CPU provides every instruction-set feature this
 // build depends on. Queries CPUID leaf 1 and, when HAVE_SSE is defined,
 // requires the SSE3 feature bit (bit 0 of the third returned register).
 // Returns true when all required features are present, false otherwise.
 bool CheckCPU()
 {
    const unsigned int sse3_flag = 0x00000001;
    // (any other CPU features...)
    int cpu_info[4];
    __cpuid(cpu_info, 1);
 #ifdef HAVE_SSE
    // The parentheses are required: `==` binds tighter than `&`, so the
    // unparenthesized form tested `cpu_info[2] & (sse3_flag == 0)`, which is
    // always zero and therefore never reported a missing feature.
    if ((cpu_info[2] & sse3_flag) == 0)
        return false;
 #endif
    // Insert additional feature checks here
    return true;
 }
 #include "pfConsoleCore/pfConsoleEngine.h"
 PF_CONSOLE_LINK_ALL()
 int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow)
 {
    // Check to make sure we have a good CPU before getting started
    if (!CheckCPU())
    {
        plString msg = plString::Format("Your processor does not support all of the features required to play %S", ProductLongName());
        hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError);
        return PARABLE_NORMAL_EXIT;
    }
    PF_CONSOLE_INIT_ALL()
    // Set global handle
--- a/Sources/Plasma/CoreLib/hsUtils.h
+++ b/Sources/Plasma/CoreLib/hsUtils.h
@ -177,6 +177,12 @@ inline float hsRadiansToDegrees(float rad) { return float(rad * (180 / M_PI)); }
 #include <new>
 #define NEWZERO(t)              new(calloc(sizeof(t), 1)) t
 // ALIGN(n): requests n-byte alignment for the declaration it prefixes,
 // e.g. `ALIGN(16) float buf[4];`. Expands to the MSVC __declspec form under
 // _MSC_VER and to the GNU-style attribute (which needs double parentheses)
 // on other compilers.
 #ifdef _MSC_VER
 #   define ALIGN(n) __declspec(align(n))
 #else
 #   define ALIGN(n) __attribute__((aligned(n)))
 #endif
 /////////////////////////////
 // Physical memory functions
 /////////////////////////////
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@ -163,6 +163,10 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #include <algorithm>
 #ifdef HAVE_SSE
 #   include <smmintrin.h>
 #endif
 //#define MF_TOSSER
 int mfCurrentTest = 100;
@ -10523,22 +10527,60 @@ void plDXPipeline::LoadResources()
 // Sorry about this, but it really did speed up the skinning.
 // Just some macros for the inner loop of IBlendVertsIntoBuffer.
 #ifdef HAVE_SSE
 #   define MATRIXMULTBEGIN(xfm, wgt) \
-    register float m00 = xfm.fMap[0][0]; \
+        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \
-    register float m01 = xfm.fMap[0][1]; \
+        ALIGN(16) float hack[4]; \
-    register float m02 = xfm.fMap[0][2]; \
+        mc0 = _mm_loadu_ps(xfm.fMap[0]); \
-    register float m03 = xfm.fMap[0][3]; \
+        mc1 = _mm_loadu_ps(xfm.fMap[1]); \
-    register float m10 = xfm.fMap[1][0]; \
+        mc2 = _mm_loadu_ps(xfm.fMap[2]); \
-    register float m11 = xfm.fMap[1][1]; \
+        mwt = _mm_set_ps1(wgt);
-    register float m12 = xfm.fMap[1][2]; \
+#   define MATRIXMULTPOINTADD(dst, src) \
-    register float m13 = xfm.fMap[1][3]; \
+        msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
-    register float m20 = xfm.fMap[2][0]; \
+        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
-    register float m21 = xfm.fMap[2][1]; \
+        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
-    register float m22 = xfm.fMap[2][2]; \
+        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
-    register float m23 = xfm.fMap[2][3]; \
+        \
-    register float m_wgt = wgt; \
+        hbuf = _mm_hadd_ps(_x, _y); \
-    register float srcX, srcY, srcZ;
+        hbuf = _mm_hadd_ps(hbuf, hbuf); \
-
+        _mm_store_ps(hack, hbuf); \
        dst.fX += hack[0]; \
        dst.fY += hack[1]; \
        hbuf = _mm_hadd_ps(_z, _z); \
        hbuf = _mm_hadd_ps(hbuf, hbuf); \
        _mm_store_ps(hack, hbuf); \
        dst.fZ += hack[0];
 // SSE variant of MATRIXMULTVECTORADD:
 //   dst += (rows 0..2 of the matrix loaded by MATRIXMULTBEGIN) * (src.fX, src.fY, src.fZ, 0) * weight.
 // Must be used in the same scope as MATRIXMULTBEGIN, which declares the __m128
 // temporaries (mc0..mc2, mwt, msr, _x, _y, _z, hbuf) and the aligned `hack` buffer.
 // The fourth lane of msr is 0.f, so fMap[r][3] does not contribute to the result.
 // Each per-row product is summed across lanes with two _mm_hadd_ps passes: the
 // X and Y rows share one reduction (results land in hack[0] and hack[1]); the Z
 // row is reduced separately (result in hack[0]).
 #   define MATRIXMULTVECTORADD(dst, src) \
        msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
        \
        hbuf = _mm_hadd_ps(_x, _y); \
        hbuf = _mm_hadd_ps(hbuf, hbuf); \
        _mm_store_ps(hack, hbuf); \
        dst.fX += hack[0]; \
        dst.fY += hack[1]; \
        hbuf = _mm_hadd_ps(_z, _z); \
        hbuf = _mm_hadd_ps(hbuf, hbuf); \
        _mm_store_ps(hack, hbuf); \
        dst.fZ += hack[0];
 #else
 // Scalar (non-SSE) variant of MATRIXMULTBEGIN: copies rows 0..2 of xfm.fMap
 // and the blend weight into locals (m00..m23, m_wgt) and declares
 // srcX/srcY/srcZ. These names are consumed by MATRIXMULTPOINTADD and
 // MATRIXMULTVECTORADD, which must therefore be expanded in the same scope.
 #   define MATRIXMULTBEGIN(xfm, wgt) \
        float m00 = xfm.fMap[0][0]; \
        float m01 = xfm.fMap[0][1]; \
        float m02 = xfm.fMap[0][2]; \
        float m03 = xfm.fMap[0][3]; \
        float m10 = xfm.fMap[1][0]; \
        float m11 = xfm.fMap[1][1]; \
        float m12 = xfm.fMap[1][2]; \
        float m13 = xfm.fMap[1][3]; \
        float m20 = xfm.fMap[2][0]; \
        float m21 = xfm.fMap[2][1]; \
        float m22 = xfm.fMap[2][2]; \
        float m23 = xfm.fMap[2][3]; \
        float m_wgt = wgt; \
        float srcX, srcY, srcZ;
 #   define MATRIXMULTPOINTADD(dst, src) \
        srcX = src.fX; \
        srcY = src.fY; \
@ -10547,7 +10589,6 @@ void plDXPipeline::LoadResources()
        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
 #   define MATRIXMULTVECTORADD(dst, src) \
        srcX = src.fX; \
        srcY = src.fY; \
@ -10556,6 +10597,7 @@ void plDXPipeline::LoadResources()
        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
 #endif // HAVE_SSE
 // inlTESTPOINT /////////////////////////////////////////
 // Update mins and maxs if destP is outside.