Merge pull request #332 from Hoikas/simd

More Skinning Improvements
12 years ago · 0008c55fc8
7 changed files with 201 additions and 61 deletions
--- a/Sources/Plasma/CoreLib/CMakeLists.txt
+++ b/Sources/Plasma/CoreLib/CMakeLists.txt
@ -74,6 +74,7 @@ endif(UNIX)
 set(CoreLib_HEADERS
    HeadSpin.h
    hsAlignedAllocator.hpp
    hsBiExpander.h
    hsBitVector.h
    hsBounds.h
--- a/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp
+++ b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp
@ -0,0 +1,129 @@
 /*==LICENSE==*
 CyanWorlds.com Engine - MMOG client, server and tools
 Copyright (C) 2011  Cyan Worlds, Inc.
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 Additional permissions under GNU GPL version 3 section 7
 If you modify this Program, or any covered work, by linking or
 combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
 NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
 JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
 (or a modified version of those libraries),
 containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
 PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
 JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
 licensors of this Program grant you additional
 permission to convey the resulting work. Corresponding Source for a
 non-source form of such a combination shall include the source code for
 the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
 work.
 You can contact Cyan Worlds, Inc. by email legal@cyan.com
 or by snail mail at:
      Cyan Worlds, Inc.
      14617 N Newport Hwy
      Mead, WA   99021
 *==LICENSE==*/
 #ifndef _HS_ALIGNED_ALLOCATOR_H
 #define _HS_ALIGNED_ALLOCATOR_H
 #include "HeadSpin.h"
 #include <cstddef>
 #include <cstdlib>
 #include <new>
 #include <stdexcept>
 /**
 * An aligned allocator for storing SIMD ready values in STL containers.
 *
 * Every block returned by allocate() starts on an ALIGNMENT-byte boundary.
 * The allocator holds no state, so all instances compare equal and memory
 * obtained from one instance may be released through any other.
 *
 * \tparam T         element type stored by the container
 * \tparam ALIGNMENT required alignment of each allocation, in bytes
 * \remarks Based on https://gist.github.com/donny-dont/1471329
 */
 template<class T, size_t ALIGNMENT=16>
 class hsAlignedAllocator
 {
    // Assignment stays private (as in the original design); it still has to
    // return a value because the return type is non-void.
    hsAlignedAllocator& operator=(const hsAlignedAllocator&) { return *this; }

 public:
    /**
    * Rebinds this allocator to another element type.
    * The second parameter gets its own name (re-using ALIGNMENT would shadow
    * the enclosing template parameter, which is ill-formed) and defaults to
    * the enclosing allocator's alignment so the request survives a rebind.
    */
    template <typename U, size_t OTHER_ALIGNMENT=ALIGNMENT>
    struct rebind
    {
        typedef hsAlignedAllocator<U, OTHER_ALIGNMENT> other;
    };

    typedef T* pointer;
    typedef const T* const_pointer;
    typedef T& reference;
    typedef const T& const_reference;
    typedef T value_type;
    typedef size_t size_type;
    typedef ptrdiff_t difference_type;

    hsAlignedAllocator() { }
    hsAlignedAllocator(const hsAlignedAllocator&) { }
    template <typename U> hsAlignedAllocator(const hsAlignedAllocator<U, ALIGNMENT>&) { }
    ~hsAlignedAllocator() { }

    /** Returns the address of \a r. */
    pointer address(reference r) const { return &r; }
    const_pointer address(const_reference r) const { return &r; }

    /**
    * Allocates uninitialized, ALIGNMENT-aligned storage for \a size elements.
    * \param size number of elements (not bytes) to make room for
    * \param hint ignored; present only to satisfy the allocator interface
    * \return pointer to the storage, or nullptr when \a size is zero
    * \throws std::length_error if \a size exceeds max_size()
    * \throws std::bad_alloc if the underlying aligned allocation fails
    */
    pointer allocate(size_type size, const_pointer hint=nullptr)
    {
        (void)hint;

        if (size == 0)
            return nullptr;
        // Guards the size * sizeof(value_type) multiplication below.
        if (size > max_size())
            throw std::length_error("integer overflow");

 #ifdef HS_BUILD_FOR_WIN32
        void* ptr = _aligned_malloc(size * sizeof(value_type), ALIGNMENT);
 #else
        void* ptr = nullptr;
        // posix_memalign reports failure through its return code; rely on
        // that rather than on the output pointer being left untouched.
        if (posix_memalign(&ptr, ALIGNMENT, size * sizeof(value_type)) != 0)
            ptr = nullptr;
 #endif // HS_BUILD_FOR_WIN32

        if (!ptr)
            throw std::bad_alloc();
        return static_cast<pointer>(ptr);
    }

    /** Copy-constructs a value_type from \a t in the raw storage at \a p. */
    void construct(T* const p, const_reference t) const
    {
        void * const pv = static_cast<void *>(p);
        new (pv) value_type(t);
    }

    /**
    * Releases storage previously returned by allocate().
    * \param ptr  block to release
    * \param size ignored; present only to satisfy the allocator interface
    */
    void deallocate(pointer ptr, size_type size)
    {
        (void)size;
 #ifdef HS_BUILD_FOR_WIN32
        // Memory from _aligned_malloc must be returned with _aligned_free.
        _aligned_free(ptr);
 #else
        free(ptr);
 #endif // HS_BUILD_FOR_WIN32
    }

    /** Runs the destructor of the object at \a p without freeing its storage. */
    void destroy(T* const p) const
    {
        p->~T();
    }

    /** Largest element count whose byte size still fits in size_t. */
    size_type max_size() const
    {
        return static_cast<size_t>(-1) / sizeof(value_type);
    }

    /** Stateless allocator: every instance is interchangeable. */
    bool operator==(const hsAlignedAllocator&) const { return true; }
    bool operator!=(const hsAlignedAllocator&) const { return false; }
 };
 #endif // _HS_ALIGNED_ALLOCATOR_H
--- a/Sources/Plasma/CoreLib/hsMatrix44.h
+++ b/Sources/Plasma/CoreLib/hsMatrix44.h
@ -61,7 +61,11 @@ struct hsMatrix44 {
        kView
    };
    float            fMap[4][4];
-    uint32_t         fFlags;
+    union
    {
        uint8_t      alignment[16];
        uint32_t     fFlags;
    };
    hsMatrix44() : fFlags(0) {}
    hsMatrix44(const hsScalarTriple &translate, const hsQuat &rotate);
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
@ -1045,10 +1045,10 @@ void    plDrawableSpans::Read( hsStream* s, hsResMgr* mgr )
    /// Read in the matrix palette (if any)
    count = s->ReadLE32();
-    fLocalToWorlds.SetCount(count);
+    fLocalToWorlds.resize(count);
-    fWorldToLocals.SetCount(count);
+    fWorldToLocals.resize(count);
-    fLocalToBones.SetCount(count);
+    fLocalToBones.resize(count);
-    fBoneToLocals.SetCount(count);
+    fBoneToLocals.resize(count);
    for( i = 0; i < count; i++ )
    {
        fLocalToWorlds[i].Read(s);
@ -2209,16 +2209,12 @@ uint32_t  plDrawableSpans::AppendDIMatrixSpans(int n)
    if( fNeedCleanup )
        IRemoveGarbage();
-    uint32_t baseIdx = fLocalToWorlds.GetCount();
+    uint32_t baseIdx = fLocalToWorlds.size();
-    fLocalToWorlds.Expand(baseIdx + n);
+    fLocalToWorlds.resize(baseIdx + n);
-    fLocalToWorlds.SetCount(baseIdx + n);
+    fWorldToLocals.resize(baseIdx + n);
    fWorldToLocals.Expand(baseIdx + n);
    fWorldToLocals.SetCount(baseIdx + n);
-    fLocalToBones.Expand(baseIdx + n);
+    fLocalToBones.resize(baseIdx + n);
-    fLocalToBones.SetCount(baseIdx + n);
+    fBoneToLocals.resize(baseIdx + n);
    fBoneToLocals.Expand(baseIdx + n);
    fBoneToLocals.SetCount(baseIdx + n);
    int i;
    for( i = baseIdx; i < baseIdx + n; i++ )
@ -2267,7 +2263,7 @@ uint32_t plDrawableSpans::FindBoneBaseMatrix(const hsTArray<hsMatrix44>& initL2B
        // runtime, a sharable bone pallete won't be found by scanning fSpans.
        // We have to do a larger search through all bone matrices.
        int i;
-        for( i = 0; i + initL2B.GetCount() < fLocalToBones.GetCount(); i++ )
+        for( i = 0; i + initL2B.GetCount() < fLocalToBones.size(); i++ )
        {
            int j;
            for( j = 0; j < initL2B.GetCount(); j++ )
@ -2894,7 +2890,7 @@ void    plDrawableSpans::ICleanupMatrices()
        }
    }
-    for( j = 0; j < fLocalToWorlds.GetCount(); j++ )
+    for( j = 0; j < fLocalToWorlds.size(); j++ )
    {
        if( !usedMatrices.IsBitSet(j) )
        {
@ -2910,7 +2906,7 @@ void    plDrawableSpans::ICleanupMatrices()
                    }
                }
            }
-            for( i = j+1; i < fLocalToWorlds.GetCount(); i++ )
+            for( i = j+1; i < fLocalToWorlds.size(); i++ )
            {
                fLocalToWorlds[i] = fLocalToWorlds[i-1];
                fWorldToLocals[i] = fWorldToLocals[i-1];
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
@ -63,13 +63,14 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #ifndef _plDrawableSpans_h
 #define _plDrawableSpans_h
-
+#include "hsAlignedAllocator.hpp"
 #include "hsBitVector.h"
 #include "hsTemplates.h"
 #include "plDrawable.h"
 #include "hsBounds.h"
 #include "hsMatrix44.h"
 #include "plSpanTypes.h"
 #include <vector>
 class plPipeline;
 class plMessage;
@ -131,11 +132,11 @@ class plDrawableSpans : public plDrawable
        hsMatrix44          fLocalToWorld;
        hsMatrix44          fWorldToLocal;
-        hsTArray<hsMatrix44>    fLocalToWorlds;
+        std::vector<hsMatrix44, hsAlignedAllocator<hsMatrix44>> fLocalToWorlds; // used in SIMD skinning
-        hsTArray<hsMatrix44>    fWorldToLocals;
+        std::vector<hsMatrix44> fWorldToLocals;
-        hsTArray<hsMatrix44>    fLocalToBones;
+        std::vector<hsMatrix44> fLocalToBones;
-        hsTArray<hsMatrix44>    fBoneToLocals;
+        std::vector<hsMatrix44> fBoneToLocals;
        hsTArray<hsGMaterial *> fMaterials;
@ -283,7 +284,7 @@ class plDrawableSpans : public plDrawable
        virtual uint32_t                     GetNumSpans( void ) const { return fSpans.GetCount(); }
        virtual const hsTArray<plSpan *>    &GetSpanArray( void ) const { return fSpans; }
-        hsMatrix44* GetMatrixPalette(int baseMatrix) const { return &fLocalToWorlds[baseMatrix]; }
+        hsMatrix44* GetMatrixPalette(int baseMatrix) const { return const_cast<hsMatrix44*>(&fLocalToWorlds[baseMatrix]); }
        const hsMatrix44& GetPaletteMatrix(int i) const { return fLocalToWorlds[i]; }
        void SetInitialBone(int i, const hsMatrix44& l2b, const hsMatrix44& b2l);
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp
@ -163,7 +163,7 @@ void    plDrawableSpans::Write( hsStream* s, hsResMgr* mgr )
            fSourceSpans[ i ]->Write( s );
    }
-    count = fLocalToWorlds.GetCount();
+    count = fLocalToWorlds.size();
    s->WriteLE32(count);
    for( i = 0; i < count; i++ )
    {
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@ -203,10 +203,10 @@ void plReleaseObject(IUnknown* x)
 //// Local Static Stuff ///////////////////////////////////////////////////////
 /// Macros for getting/setting data in a D3D vertex buffer
-inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple& point )
+inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple* point )
 {
    register float* dst = (float*)ptr;
-    register const float* src = (float*)&point.fX;
+    register const float* src = (float*)&point->fX;
    *dst++ = *src++;
    *dst++ = *src++;
    *dst++ = *src++;
@ -217,10 +217,10 @@ inline uint8_t* inlStuffUInt32( uint8_t* ptr, const uint32_t uint )
    *(uint32_t*)ptr = uint;
    return ptr + sizeof(uint);
 }
-inline uint8_t* inlExtractPoint( const uint8_t* ptr, const hsScalarTriple& pt )
+inline uint8_t* inlExtractPoint( const uint8_t* ptr, hsScalarTriple* pt )
 {
    register const float* src = (float*)ptr;
-    register float* dst = (float*)&pt.fX;
+    register float* dst = (float*)&pt->fX;
    *dst++ = *src++;
    *dst++ = *src++;
    *dst++ = *src++;
@ -10617,14 +10617,13 @@ inline void inlTESTPOINT(const hsPoint3& destP,
 // SSE3 version
 #ifdef HS_SSE3
 #define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
-        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
+        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \
-        ALIGN(16) float hack[4]; \
+        mc0 = _mm_load_ps(xfm.fMap[0]); \
-        mc0 = _mm_loadu_ps(xfm.fMap[0]); \
+        mc1 = _mm_load_ps(xfm.fMap[1]); \
-        mc1 = _mm_loadu_ps(xfm.fMap[1]); \
+        mc2 = _mm_load_ps(xfm.fMap[2]); \
        mc2 = _mm_loadu_ps(xfm.fMap[2]); \
        mwt = _mm_set_ps1(wgt);
-#define MATRIXMULTPOINTADD_SSE3(dst, src) \
+#define MATRIXMULTBUFADD_SSE3(dst, src) \
-        msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
+        msr = _mm_load_ps(src); \
        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
@ -10632,10 +10631,9 @@ inline void inlTESTPOINT(const hsPoint3& destP,
        hbuf1 = _mm_hadd_ps(_x, _y); \
        hbuf2 = _mm_hadd_ps(_z, _z); \
        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-        _mm_store_ps(hack, hbuf1); \
+        _dst = _mm_load_ps(dst); \
-        dst.fX += hack[0]; \
+        _dst = _mm_add_ps(_dst, hbuf1); \
-        dst.fY += hack[1]; \
+        _mm_store_ps(dst, _dst);
        dst.fZ += hack[2];
 #define MATRIXMULTVECTORADD_SSE3(dst, src) \
        msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
@ -10645,10 +10643,13 @@ inline void inlTESTPOINT(const hsPoint3& destP,
        hbuf1 = _mm_hadd_ps(_x, _y); \
        hbuf2 = _mm_hadd_ps(_z, _z); \
        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-        _mm_store_ps(hack, hbuf1); \
+        { \
-        dst.fX += hack[0]; \
+            ALIGN(16) float hack[4]; \
-        dst.fY += hack[1]; \
+            _mm_store_ps(hack, hbuf1); \
-        dst.fZ += hack[2];
+            dst.fX += hack[0]; \
            dst.fY += hack[1]; \
            dst.fZ += hack[2]; \
        }
 #endif
 // CPU-optimized functions requiring dispatch
@ -10656,11 +10657,17 @@ hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_ve
 // Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
 #define BLENDVERTSTART \
-    uint8_t     numUVs, numWeights; \
+    ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \
-    uint32_t    i, j, indices, color, specColor, uvChanSize; \
+    ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \
-    float       weights[ 4 ], weightSum; \
+    ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \
-    hsPoint3    pt, tempPt, destPt; \
+    hsPoint3*       pt = reinterpret_cast<hsPoint3*>(pt_buf); \
-    hsVector3   vec, tempNorm, destNorm; \
+    hsPoint3*       destPt = reinterpret_cast<hsPoint3*>(destPt_buf); \
    hsVector3*      vec = reinterpret_cast<hsVector3*>(vec_buf); \
    hsVector3*      destNorm = reinterpret_cast<hsVector3*>(destNorm_buf); \
    \
    uint8_t         numUVs, numWeights; \
    uint32_t        i, j, indices, color, specColor, uvChanSize; \
    float           weights[ 4 ], weightSum; \
    \
    /* Get some counts */\
    switch( format & plGBufferGroup::kSkinWeightMask ) \
@ -10706,8 +10713,9 @@ hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_ve
            src = inlExtractUInt32( src, specColor ); \
            \
            /* Blend */\
-            destPt.Set( 0, 0, 0 ); \
+            destPt->Set(0.f, 0.f, 0.f); \
-            destNorm.Set( 0, 0, 0 ); \
+            destPt_buf[3] = 1.f; \
            destNorm->Set(0.f, 0.f, 0.f); \
            for( j = 0; j < numWeights + 1; j++ ) \
            { \
                if( weights[ j ] ) \
@ -10772,15 +10780,16 @@ hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_ve
            uint8_t k; \
            for( k = 0; k < numUVs; k++ ) \
            { \
-                src = inlExtractPoint( src, srcUVWs[k] ); \
+                src = inlExtractPoint( src, &srcUVWs[k] ); \
            } \
            memcpy( dstUVWs, srcUVWs, uvChanSize); \
            dstUVWs[loChan].Set(0,0,0); \
            dstUVWs[hiChan].Set(0,0,0); \
            \
            /* Blend */\
-            destPt.Set( 0, 0, 0 ); \
+            destPt->Set(0.f, 0.f, 0.f); \
-            destNorm.Set( 0, 0, 0 ); \
+            destPt_buf[3] = 1.f; \
            destNorm->Set(0.f, 0.f, 0.f); \
            for( j = 0; j < numWeights + 1; j++ ) \
            { \
                if( weights[ j ] ) \
@ -10823,13 +10832,13 @@ void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
    BLENDVERTSTART
                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
-                    MATRIXMULTPOINTADD_FPU(destPt, pt);
+                    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
-                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
+                    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
    BLENDVERTMID
                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
-                    MATRIXMULTPOINTADD_FPU(destPt, pt);
+                    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
-                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
+                    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
                    MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
                    MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
@ -10846,13 +10855,13 @@ void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
    BLENDVERTSTART
                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
-                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
+                    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
-                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+                    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
    BLENDVERTMID
                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
-                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
+                    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
-                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+                    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
                    MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
                    MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
    BLENDVERTEND