Merge pull request #336 from Hoikas/weight-format

Improve skinning performance.
12 years ago · b86ab9d69e
2 changed files with 236 additions and 307 deletions
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@ -203,40 +203,60 @@ void plReleaseObject(IUnknown* x)
 //// Local Static Stuff ///////////////////////////////////////////////////////
 /// Macros for getting/setting data in a D3D vertex buffer
-inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple* point )
// inlCopy<T>: copies one T from the byte cursor `src` to the byte cursor `dst`
// and advances both cursors (taken by reference) by sizeof(T).
// The access goes through T pointers, so both cursors are presumably expected
// to be suitably aligned for T -- TODO confirm against callers.
// '+' lines are the new implementation; '-' lines are the removed
// inlStuffPoint body that the diff view interleaved with it.
+template<typename T>
 static inline void inlCopy(uint8_t*& src, uint8_t*& dst)
 {
-    register float* dst = (float*)ptr;
+    T* src_ptr = reinterpret_cast<T*>(src);
-    register const float* src = (float*)&point->fX;
+    T* dst_ptr = reinterpret_cast<T*>(dst);
-    *dst++ = *src++;
+    *dst_ptr = *src_ptr;
-    *dst++ = *src++;
+    src += sizeof(T);
-    *dst++ = *src++;
+    dst += sizeof(T);
    // NOTE(review): the unmarked return below looks like a leftover of the
    // removed inlStuffPoint (inlCopy is declared void) -- confirm it is not
    // part of the new body.
    return (uint8_t*)dst;
 }
-inline uint8_t* inlStuffUInt32( uint8_t* ptr, const uint32_t uint )
+
 // Reads one T from the byte cursor `src` into *val and returns the cursor
 // advanced by sizeof(T). The read goes through a T pointer, so `src` is
 // presumably expected to be suitably aligned for T -- TODO confirm.
 template<typename T>
 static inline const uint8_t* inlExtract(const uint8_t* src, T* val)
 {
    *val = *reinterpret_cast<const T*>(src);
    return src + sizeof(T);
 }
 // inlExtract specialization for hsPoint3: reads three floats from `src` into
 // the storage at `val`, then writes 1.f into a FOURTH float slot right after
 // them, and returns `src` advanced by three floats.
 // NOTE(review): the fourth write goes past three floats, so `val` must point
 // into storage at least four floats wide (IBlendVertBuffer passes a float[4]
 // reinterpreted as hsPoint3*) -- confirm no caller passes a bare hsPoint3.
 // No storage-class specifier here: `static` is not permitted on an explicit
 // specialization in standard C++; the specialization takes its linkage from
 // the primary template.
 template<>
 inline const uint8_t* inlExtract<hsPoint3>(const uint8_t* src, hsPoint3* val)
 {
-    *(uint32_t*)ptr = uint;
+    const float* src_ptr = reinterpret_cast<const float*>(src);
-    return ptr + sizeof(uint);
+    float* dst_ptr = reinterpret_cast<float*>(val);
    *dst_ptr++ = *src_ptr++;
    *dst_ptr++ = *src_ptr++;
    *dst_ptr++ = *src_ptr++;
    // w = 1 so the translation column takes effect in the SSE dot products
    *dst_ptr = 1.f;
    return reinterpret_cast<const uint8_t*>(src_ptr);
 }
-inline uint8_t* inlExtractPoint( const uint8_t* ptr, hsScalarTriple* pt )
+
 // inlExtract specialization for hsVector3: reads three floats from `src` into
 // the storage at `val`, then writes 0.f into a FOURTH float slot right after
 // them, and returns `src` advanced by three floats.
 // NOTE(review): the fourth write goes past three floats, so `val` must point
 // into storage at least four floats wide (IBlendVertBuffer passes a float[4]
 // reinterpreted as hsVector3*) -- confirm no caller passes a bare hsVector3.
 // No storage-class specifier here: `static` is not permitted on an explicit
 // specialization in standard C++; the specialization takes its linkage from
 // the primary template.
 template<>
 inline const uint8_t* inlExtract<hsVector3>(const uint8_t* src, hsVector3* val)
 {
-    register const float* src = (float*)ptr;
+    const float* src_ptr = reinterpret_cast<const float*>(src);
-    register float* dst = (float*)&pt->fX;
+    float* dst_ptr = reinterpret_cast<float*>(val);
-    *dst++ = *src++;
+    *dst_ptr++ = *src_ptr++;
-    *dst++ = *src++;
+    *dst_ptr++ = *src_ptr++;
-    *dst++ = *src++;
+    *dst_ptr++ = *src_ptr++;
-    return (uint8_t*)src;
+    *dst_ptr = 0.f;
    return reinterpret_cast<const uint8_t*>(src_ptr);
 }
-inline uint8_t* inlExtractFloat( const uint8_t*& ptr, float& f )
+
 // inlSkip<T, N>: advances the byte cursor `src` (taken by reference) by
 // sizeof(T) * N bytes without reading anything.
 // '+' lines are the new implementation; '-' lines are the removed
 // inlExtractFloat body that the diff view interleaved with it.
 template<typename T, size_t N>
 static inline void inlSkip(uint8_t*& src)
 {
-    register const float* src = (float*)ptr;
+    src += sizeof(T) * N;
    // NOTE(review): the two unmarked lines below look like leftovers of the
    // removed inlExtractFloat (`f` is not declared here and inlSkip is void)
    // -- confirm they are not part of the new body.
    f = *src++;
    return (uint8_t*)src;
 }
-inline uint8_t* inlExtractUInt32( const uint8_t*& ptr, uint32_t& uint )
+
 // inlStuff<T>: stores *val at the byte cursor `dst` (through a T pointer) and
 // returns the cursor advanced by one T.
 // '+' lines are the new implementation; '-' lines are the removed
 // inlExtractUInt32 body that the diff view interleaved with it.
 template<typename T>
 static inline uint8_t* inlStuff(uint8_t* dst, const T* val)
 {
-    const uint32_t* src = (uint32_t*)ptr;
+    T* ptr = reinterpret_cast<T*>(dst);
-    uint = *src++;
+    *ptr++ = *val;
-    return (uint8_t*)src;
+    return reinterpret_cast<uint8_t*>(ptr);
 }
 inline DWORD F2DW( FLOAT f ) 
@ -9960,6 +9980,30 @@ void plDXPipeline::IFillStaticVertexBufferRef(plDXVertexBufferRef *ref, plGBuffe
    ref->SetDirty(false);
 }
 // Fills ref->fData from vertex buffer `idx` of `group`, one record per vertex
 // for ref->fCount vertices. Each output record is position, normal, diffuse,
 // specular and the UVW channels; the skin weights and (when the format has
 // kSkinIndices) the packed indices present in the source are skipped over.
 void plDXPipeline::IFillVolatileVertexBufferRef(plDXVertexBufferRef* ref, plGBufferGroup* group, uint32_t idx)
 {
    uint8_t* out = ref->fData;
    uint8_t* in = group->GetVertBufferData(idx);

    // Three floats per UV channel.
    const size_t uvBytes = plGBufferGroup::CalcNumUVs(group->GetVertexFormat()) * sizeof(float) * 3;
    const uint8_t weightCount = (group->GetVertexFormat() & plGBufferGroup::kSkinWeightMask) >> 4;
    const size_t weightBytes = weightCount * sizeof(float);

    for (uint32_t vert = 0; vert < ref->fCount; ++vert) {
        // Position ("pre-pos" in the original comment)
        inlCopy<hsPoint3>(in, out);

        // Source-only skinning data: weights, then optional packed indices
        in += weightBytes;
        if (group->GetVertexFormat() & plGBufferGroup::kSkinIndices) {
            inlSkip<uint32_t, 1>(in);
        }

        // Normal ("pre-normal"), then the diffuse and specular colors
        inlCopy<hsVector3>(in, out);
        inlCopy<uint32_t>(in, out);
        inlCopy<uint32_t>(in, out);

        // UVW channels are copied through as one run of bytes
        memcpy(out, in, uvBytes);
        in += uvBytes;
        out += uvBytes;
    }
 }
 // OpenAccess ////////////////////////////////////////////////////////////////////////////////////////
 // Lock the managed buffer and setup the accessSpan to point into the buffers data.
 bool plDXPipeline::OpenAccess(plAccessSpan& dst, plDrawableSpans* drawable, const plVertexSpan* span, bool readOnly)
@ -10114,6 +10158,7 @@ void plDXPipeline::CheckVertexBufferRef(plGBufferGroup* owner, uint32_t idx)
        if( !vRef->fData && (vRef->fFormat != owner->GetVertexFormat()) )
        {
            vRef->fData = new uint8_t[vRef->fCount * vRef->fVertexSize];
            IFillVolatileVertexBufferRef(vRef, owner, idx);
        }
    }
 }
@ -10581,292 +10626,177 @@ inline void inlTESTPOINT(const hsPoint3& destP,
 //  format, blends them into the destination buffer given without the blending
 //  info.
-// FPU version
+// Scalar skinning kernel: adds one transform's weighted contribution to a
+// position accumulator and a normal accumulator.
+//   xfm     - transform; only rows 0..2 of xfm.fMap are read
+//   wgt     - weight multiplied into this transform's result
+//   pt_src  - source position; elements 0..2 are read
+//   pt_dst  - position accumulator; rows 0..2 of xfm applied to (pt, 1),
+//             i.e. including the translation column, times wgt, are ADDED
+//   vec_src - source normal; elements 0..2 are read
+//   vec_dst - normal accumulator; same as above but without the translation
+//             column
+// Results are accumulated with +=, on top of whatever the destinations hold.
+// The matrix entries and source components are bound by reference, so the
+// destinations must not alias the sources or the matrix.
+// '-' lines are the removed *_FPU macros that the diff view interleaved here.
+static inline void ISkinVertexFPU(const hsMatrix44& xfm, float wgt,
-#define MATRIXMULTBEGIN_FPU(xfm, wgt) \
+                                  const float* pt_src, float* pt_dst,
-        float m00 = xfm.fMap[0][0]; \
+                                  const float* vec_src, float* vec_dst)
-        float m01 = xfm.fMap[0][1]; \
+{
-        float m02 = xfm.fMap[0][2]; \
+    const float& m00 = xfm.fMap[0][0];
-        float m03 = xfm.fMap[0][3]; \
+    const float& m01 = xfm.fMap[0][1];
-        float m10 = xfm.fMap[1][0]; \
+    const float& m02 = xfm.fMap[0][2];
-        float m11 = xfm.fMap[1][1]; \
+    const float& m03 = xfm.fMap[0][3];
-        float m12 = xfm.fMap[1][2]; \
+    const float& m10 = xfm.fMap[1][0];
-        float m13 = xfm.fMap[1][3]; \
+    const float& m11 = xfm.fMap[1][1];
-        float m20 = xfm.fMap[2][0]; \
+    const float& m12 = xfm.fMap[1][2];
-        float m21 = xfm.fMap[2][1]; \
+    const float& m13 = xfm.fMap[1][3];
-        float m22 = xfm.fMap[2][2]; \
+    const float& m20 = xfm.fMap[2][0];
-        float m23 = xfm.fMap[2][3]; \
+    const float& m21 = xfm.fMap[2][1];
-        float m_wgt = wgt; \
+    const float& m22 = xfm.fMap[2][2];
-        float srcX, srcY, srcZ;
+    const float& m23 = xfm.fMap[2][3];
-#define MATRIXMULTPOINTADD_FPU(dst, src) \
+
-        srcX = src.fX; \
+    // position
-        srcY = src.fY; \
+    {
-        srcZ = src.fZ; \
+        const float& srcX = pt_src[0];
-        \
+        const float& srcY = pt_src[1];
-        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
+        const float& srcZ = pt_src[2];
-        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
+
-        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
+        pt_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * wgt;
-#define MATRIXMULTVECTORADD_FPU(dst, src) \
+        pt_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * wgt;
-        srcX = src.fX; \
+        pt_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * wgt;
-        srcY = src.fY; \
+    }
-        srcZ = src.fZ; \
+
-        \
+    // normal
-        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
+    {
-        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
+        const float& srcX = vec_src[0];
-        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
+        const float& srcY = vec_src[1];
-
+        const float& srcZ = vec_src[2];
-// SSE3 version
+
        vec_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02) * wgt;
        vec_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12) * wgt;
        // Row 2 of the matrix produces the Z component, so it accumulates into
        // index 2 (matching pt_dst[2] above and dst.fZ in the removed macro).
        vec_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22) * wgt;
    }
 }
 // ISkinDpSSE3 (compiled only when HS_SSE3 is defined): SSE3 helper that adds
 // one weighted matrix * vector product to a four-float accumulator.
 //   src           - four floats, loaded with _mm_load_ps (16-byte aligned)
 //   dst           - four-float accumulator, loaded and stored with
 //                   _mm_load_ps / _mm_store_ps (16-byte aligned)
 //   mc0, mc1, mc2 - the three matrix rows
 //   mwt           - the weight broadcast to all four lanes
 // Each row is multiplied lane-wise by src and by the weight; the horizontal
 // adds then collapse the products into (row0.src, row1.src, row2.src,
 // row2.src) * weight, which is added to dst. Lane 3 of dst therefore also
 // receives the row-2 sum.
 // '+' lines are the new function; '-' lines and the unmarked backslash-
 // continued lines appear to be the removed *_SSE3 macros interleaved by the
 // diff view -- NOTE(review): confirm against the actual file.
 #ifdef HS_SSE3
-#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
+static inline void ISkinDpSSE3(const float* src, float* dst, const __m128& mc0,
-        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \
+                               const __m128& mc1, const __m128& mc2, const __m128& mwt)
-        mc0 = _mm_load_ps(xfm.fMap[0]); \
+{
-        mc1 = _mm_load_ps(xfm.fMap[1]); \
+    __m128 msr = _mm_load_ps(src);
-        mc2 = _mm_load_ps(xfm.fMap[2]); \
+    __m128 _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt);
-        mwt = _mm_set_ps1(wgt);
+    __m128 _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt);
-#define MATRIXMULTBUFADD_SSE3(dst, src) \
+    __m128 _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt);
-        msr = _mm_load_ps(src); \
+
-        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
+    __m128 hbuf1 = _mm_hadd_ps(_x, _y);
-        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
+    __m128 hbuf2 = _mm_hadd_ps(_z, _z);
-        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
+    hbuf1 = _mm_hadd_ps(hbuf1, hbuf2);
-        \
+    __m128 _dst = _mm_load_ps(dst);
-        hbuf1 = _mm_hadd_ps(_x, _y); \
+    _dst = _mm_add_ps(_dst, hbuf1);
        hbuf2 = _mm_hadd_ps(_z, _z); \
        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
        _dst = _mm_load_ps(dst); \
        _dst = _mm_add_ps(_dst, hbuf1); \
    _mm_store_ps(dst, _dst);
 #define MATRIXMULTVECTORADD_SSE3(dst, src) \
        msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
        \
        hbuf1 = _mm_hadd_ps(_x, _y); \
        hbuf2 = _mm_hadd_ps(_z, _z); \
        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
        { \
            ALIGN(16) float hack[4]; \
            _mm_store_ps(hack, hbuf1); \
            dst.fX += hack[0]; \
            dst.fY += hack[1]; \
            dst.fZ += hack[2]; \
 }
-#endif
+#endif // HS_SSE3
-// CPU-optimized functions requiring dispatch
// ISkinVertexSSE3: SSE3 skinning kernel with the same parameter list as
// ISkinVertexFPU. It loads rows 0..2 of xfm.fMap with _mm_load_ps, broadcasts
// wgt to all four lanes, and calls ISkinDpSSE3 first for the position
// (pt_src -> pt_dst) and then for the normal (vec_src -> vec_dst).
// Because of the aligned loads/stores, the matrix rows and all four buffers
// must be 16-byte aligned and four floats wide.
// NOTE(review): the body sits inside #ifdef HS_SSE3, so without that define
// this function does nothing -- confirm the dispatcher can never select it in
// such a build.
// Only the '+' lines and the unmarked "{" belong to this function. The '-'
// lines and the unmarked BLENDVERTSTART/BLENDVERTMID macro text between the
// signature and the body appear to be removed code interleaved by the diff
// view -- confirm against the actual file.
+static inline void ISkinVertexSSE3(const hsMatrix44& xfm, float wgt,
-hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3);
+                                   const float* pt_src, float* pt_dst,
-
+                                   const float* vec_src, float* vec_dst)
 // Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
 #define BLENDVERTSTART \
    ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \
    ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \
    ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \
    hsPoint3*       pt = reinterpret_cast<hsPoint3*>(pt_buf); \
    hsPoint3*       destPt = reinterpret_cast<hsPoint3*>(destPt_buf); \
    hsVector3*      vec = reinterpret_cast<hsVector3*>(vec_buf); \
    hsVector3*      destNorm = reinterpret_cast<hsVector3*>(destNorm_buf); \
    \
    uint8_t         numUVs, numWeights; \
    uint32_t        i, j, indices, color, specColor, uvChanSize; \
    float           weights[ 4 ], weightSum; \
    \
    /* Get some counts */\
    switch( format & plGBufferGroup::kSkinWeightMask ) \
    { \
        case plGBufferGroup::kSkin1Weight:  numWeights = 1; break; \
        case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \
        case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \
        default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \
    } \
    \
    numUVs = plGBufferGroup::CalcNumUVs( format ); \
    uvChanSize = numUVs * sizeof( float ) * 3; \
    \
    /* localUVWChans is bump mapping tangent space vectors, which need to
    // be skinned like the normal, as opposed to passed through like 
    // garden variety UVW coordinates.
    // There are no localUVWChans that I know of in production assets (i.e.
    // the avatar is not skinned).*/\
    if( !localUVWChans ) \
    { \
        /* Copy whilst blending */\
        for( i = 0; i < count; i++ ) \
        { \
            /* Extract data */\
            src = inlExtractPoint( src, pt ); \
            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
            { \
                src = inlExtractFloat( src, weights[ j ] ); \
                weightSum += weights[ j ]; \
            } \
            weights[ j ] = 1 - weightSum; \
            \
            if( format & plGBufferGroup::kSkinIndices ) \
            { \
                src = inlExtractUInt32( src, indices ); \
            } \
            else \
            { \
                indices = 1 << 8; \
            } \
            src = inlExtractPoint( src, vec ); \
            src = inlExtractUInt32( src, color ); \
            src = inlExtractUInt32( src, specColor ); \
            \
            /* Blend */\
            destPt->Set(0.f, 0.f, 0.f); \
            destPt_buf[3] = 1.f; \
            destNorm->Set(0.f, 0.f, 0.f); \
            for( j = 0; j < numWeights + 1; j++ ) \
            { \
                if( weights[ j ] ) \
 {
-                    /*
+#ifdef HS_SSE3
-                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
+    __m128 mc0 = _mm_load_ps(xfm.fMap[0]);
    __m128 mc1 = _mm_load_ps(xfm.fMap[1]);
    __m128 mc2 = _mm_load_ps(xfm.fMap[2]);
    __m128 mwt = _mm_set_ps1(wgt);
-                    MATRIXMULTPOINTADD(destPt, pt);
+    ISkinDpSSE3(pt_src, pt_dst, mc0, mc1, mc2, mwt);
-                    MATRIXMULTVECTORADD(destNorm, vec);
+    ISkinDpSSE3(vec_src, vec_dst, mc0, mc1, mc2, mwt);
-                    */
+#endif // HS_SSE3
-#define BLENDVERTMID \
+}
                } \
                \
                indices >>= 8; \
            } \
            /* Probably don't really need to renormalize this. There errors are
            // going to be subtle and "smooth".*/\
            /* hsFastMath::NormalizeAppr(destNorm);*/ \
            \
            /* Slam data into position now */\
            dest = inlStuffPoint( dest, destPt ); \
            dest = inlStuffPoint( dest, destNorm ); \
            dest = inlStuffUInt32( dest, color ); \
            dest = inlStuffUInt32( dest, specColor ); \
            memcpy( dest, src, uvChanSize ); \
            src += uvChanSize; \
            dest += uvChanSize; \
        } \
    } \
    else \
    { \
        uint8_t hiChan = localUVWChans >> 8; \
        uint8_t loChan = localUVWChans & 0xff; \
        /* Copy whilst blending */\
        for( i = 0; i < count; i++ ) \
        { \
            hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
            hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \
            \
            /* Extract data */\
            src = inlExtractPoint( src, pt ); \
            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
            { \
                src = inlExtractFloat( src, weights[ j ] ); \
                weightSum += weights[ j ]; \
            } \
            weights[ j ] = 1 - weightSum; \
            \
            if( format & plGBufferGroup::kSkinIndices ) \
            { \
                src = inlExtractUInt32( src, indices ); \
            } \
            else \
            { \
                indices = 1 << 8; \
            } \
            \
            src = inlExtractPoint( src, vec ); \
            src = inlExtractUInt32( src, color ); \
            src = inlExtractUInt32( src, specColor ); \
            \
            uint8_t k; \
            for( k = 0; k < numUVs; k++ ) \
            { \
                src = inlExtractPoint( src, &srcUVWs[k] ); \
            } \
            memcpy( dstUVWs, srcUVWs, uvChanSize); \
            dstUVWs[loChan].Set(0,0,0); \
            dstUVWs[hiChan].Set(0,0,0); \
            \
            /* Blend */\
            destPt->Set(0.f, 0.f, 0.f); \
            destPt_buf[3] = 1.f; \
            destNorm->Set(0.f, 0.f, 0.f); \
            for( j = 0; j < numWeights + 1; j++ ) \
            { \
                if( weights[ j ] ) \
                { \
                    /*
                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
-                    MATRIXMULTPOINTADD(destPt, pt);
+#ifdef HS_SSE41
-                    MATRIXMULTVECTORADD(destNorm, vec);
+static inline void ISkinDpSSE41(const float* src, float* dst, const __m128& mc0,
-                    MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]);
+                                const __m128& mc1, const __m128& mc2, const __m128& mwt)
                    MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]);
                    */
 #define BLENDVERTEND \
                } \
                \
                indices >>= 8; \
            } \
            /* Probably don't really need to renormalize this. There errors are
            // going to be subtle and "smooth". */\
            /* hsFastMath::NormalizeAppr(destNorm); */\
            /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\
            /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\
            \
            /* Slam data into position now */\
            dest = inlStuffPoint( dest, destPt ); \
            dest = inlStuffPoint( dest, destNorm ); \
            dest = inlStuffUInt32( dest, color ); \
            dest = inlStuffUInt32( dest, specColor ); \
            memcpy( dest, dstUVWs, uvChanSize ); \
            dest += uvChanSize; \
        } \
    }
 void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
                                          hsMatrix44* matrixPalette, int numMatrices,
                                          const uint8_t *src, uint8_t format, uint32_t srcStride,
                                          uint8_t *dest, uint32_t destStride, uint32_t count,
                                          uint16_t localUVWChans )
 {
-    BLENDVERTSTART
    // ISkinDpSSE41 body (compiled only when HS_SSE41 is defined): SSE4.1
    // helper that adds one weighted matrix * vector product to a four-float
    // accumulator. src and dst go through _mm_load_ps / _mm_store_ps, so both
    // must be 16-byte aligned and four floats wide; mc0..mc2 are the matrix
    // rows and mwt is the weight broadcast to all lanes.
    // Each DP_F4_* mask makes _mm_dp_ps multiply all four lanes (high nibble
    // 0xF) and deposit the sum in a single lane (low nibble: X = lane 0,
    // Y = lane 1, Z = lane 2), leaving the other lanes zero. OR-ing the three
    // results gives (row0.src, row1.src, row2.src, 0), which is scaled by the
    // weight and added to dst.
    // Only the '+' lines and the unmarked "{" belong to this function. The
    // '-' lines, the BLENDVERTEND macro text and the blend_vert_buffer_fpu
    // signature above appear to be removed code interleaved by the diff view
    // -- NOTE(review): confirm against the actual file.
+    enum { DP_F4_X = 0xF1, DP_F4_Y = 0xF2, DP_F4_Z = 0xF4 };
                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
-                    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
+    __m128 msr = _mm_load_ps(src);
-                    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
+    __m128 _r =        _mm_dp_ps(msr, mc0, DP_F4_X);
-    BLENDVERTMID
+    _r = _mm_or_ps(_r, _mm_dp_ps(msr, mc1, DP_F4_Y));
-                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
+    _r = _mm_or_ps(_r, _mm_dp_ps(msr, mc2, DP_F4_Z));
-                    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
+    __m128 _dst = _mm_load_ps(dst);
-                    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
+    _dst = _mm_add_ps(_dst, _mm_mul_ps(_r, mwt));
-                    MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
+    _mm_store_ps(dst, _dst);
-                    MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
+}
 #endif // HS_SSE41
 // ISkinVertexSSE41: SSE4.1 skinning kernel with the same parameter list as
 // ISkinVertexFPU. It loads rows 0..2 of xfm.fMap with _mm_load_ps, broadcasts
 // wgt to all four lanes, and calls ISkinDpSSE41 first for the position
 // (pt_src -> pt_dst) and then for the normal (vec_src -> vec_dst).
 // Because of the aligned loads/stores, the matrix rows and all four buffers
 // must be 16-byte aligned and four floats wide.
 // NOTE(review): the body sits inside #ifdef HS_SSE41, so without that define
 // this function does nothing -- confirm the dispatcher can never select it in
 // such a build.
 static inline void ISkinVertexSSE41(const hsMatrix44& xfm, float wgt,
                                     const float* pt_src, float* pt_dst,
                                     const float* vec_src, float* vec_dst)
 {
 #ifdef HS_SSE41
    __m128 mc0 = _mm_load_ps(xfm.fMap[0]);
    __m128 mc1 = _mm_load_ps(xfm.fMap[1]);
    __m128 mc2 = _mm_load_ps(xfm.fMap[2]);
    __m128 mwt = _mm_set_ps1(wgt);
    // '-' line below is removed code interleaved by the diff view
-    BLENDVERTEND
+    ISkinDpSSE41(pt_src, pt_dst, mc0, mc1, mc2, mwt);
    ISkinDpSSE41(vec_src, vec_dst, mc0, mc1, mc2, mwt);
 #endif // HS_SSE41
 }
-void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
+typedef void(*skin_vert_ptr)(const hsMatrix44&, float, const float*, float*, const float*, float*);
-                                           hsMatrix44* matrixPalette, int numMatrices,
+
 // IBlendVertBuffer<T>: skins `count` vertices from `src` into `dest`, using
 // the kernel T (a skin_vert_ptr) for each weighted matrix.
 //   matrixPalette - matrices selected by the per-vertex packed indices
 //   src           - per vertex: position, numWeights float weights, optional
 //                   packed uint32 indices (if format has kSkinIndices),
 //                   normal, two uint32 colors, UVW channels
 //   format        - vertex format; supplies weight count, index flag, UV count
 //   dest          - per vertex: position and normal are written; the two
 //                   colors and the UVWs are only stepped over
 //   localUVWChans - must be 0 (asserted); skinned UVWs are not supported
 // span, numMatrices, srcStride and destStride are not used in this body.
 // NOTE(review): since colors and UVWs in dest are skipped rather than
 // written, dest is presumably pre-filled elsewhere (e.g. by
 // IFillVolatileVertexBufferRef) -- confirm. Palette indices are not checked
 // against numMatrices.
 // '-' lines are the removed blend_vert_buffer_sse3 body interleaved by the
 // diff view.
 template<skin_vert_ptr T>
 static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMatrices,
                             const uint8_t* src, uint8_t format, uint32_t srcStride,
                             uint8_t* dest, uint32_t destStride, uint32_t count,
                             uint16_t localUVWChans)
 {
-#ifdef HS_SSE3
+    ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f };
-    BLENDVERTSTART
+    ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f };
-                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
+    hsPoint3*       pt = reinterpret_cast<hsPoint3*>(pt_buf);
-
+    hsVector3*      vec = reinterpret_cast<hsVector3*>(vec_buf);
-                    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
+
-                    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
+    uint32_t        indices;
-    BLENDVERTMID
+    float           weights[4];
-                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
+
-
+    // Dropped support for localUVWChans at templatization of code
-                    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
+    hsAssert(localUVWChans == 0, "support for skinned UVWs dropped. reimplement me?");
-                    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
+    const size_t uvChanSize = plGBufferGroup::CalcNumUVs(format) * sizeof(float) * 3;
-                    MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
+    uint8_t numWeights = (format & plGBufferGroup::kSkinWeightMask) >> 4;
-                    MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
+
-    BLENDVERTEND
+    for (uint32_t i = 0; i < count; ++i) {
-#endif // HS_SSE3
+        // Extract data
        // The hsPoint3/hsVector3 extractors fill all four floats of
        // pt_buf/vec_buf (w = 1 for the point, w = 0 for the vector).
        src = inlExtract<hsPoint3>(src, pt);
        float weightSum = 0.f;
        for (uint8_t j = 0; j < numWeights; ++j) {
            src = inlExtract<float>(src, &weights[j]);
            weightSum += weights[j];
        }
        // The last weight is implicit: whatever remains to reach 1.
        weights[numWeights] = 1.f - weightSum;
        if (format & plGBufferGroup::kSkinIndices)
            src = inlExtract<uint32_t>(src, &indices);
        else
            // No indices stored: first weight uses matrix 0, second matrix 1.
            indices = 1 << 8;
        src = inlExtract<hsVector3>(src, vec);
        // Destination buffers (float4 for SSE alignment)
        ALIGN(16) float destNorm_buf[] = { 0.f, 0.f, 0.f, 0.f };
        ALIGN(16) float destPt_buf[] = { 0.f, 0.f, 0.f, 1.f };
        // Blend
        // One palette index per byte of `indices`, lowest byte first;
        // zero weights are skipped.
        for (uint32_t j = 0; j < numWeights + 1; ++j) {
            if (weights[j])
                T(matrixPalette[indices & 0xFF], weights[j], pt_buf, destPt_buf, vec_buf, destNorm_buf);
            indices >>= 8;
        }
        // Probably don't really need to renormalize this. The errors are
        // going to be subtle and "smooth".
        /* hsFastMath::NormalizeAppr(destNorm); */
        // Slam data into position now
        dest = inlStuff<hsPoint3>(dest, reinterpret_cast<hsPoint3*>(destPt_buf));
        dest = inlStuff<hsVector3>(dest, reinterpret_cast<hsVector3*>(destNorm_buf));
        // Jump past colors and UVWs
        dest += sizeof(uint32_t) * 2 + uvChanSize;
        src  += sizeof(uint32_t) * 2 + uvChanSize;
    }
 }
 // CPU-optimized functions requiring dispatch
 // Registers the FPU, SSE3 and SSE4.1 instantiations of IBlendVertBuffer; the
 // literal 0 arguments leave the remaining slots without an implementation.
 // NOTE(review): the slot order presumably follows hsFunctionDispatcher's CPU
 // feature levels -- confirm against its constructor. The SSE kernels compile
 // to empty functions when HS_SSE3 / HS_SSE41 are not defined.
 hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(
    IBlendVertBuffer<ISkinVertexFPU>, 0, 0, IBlendVertBuffer<ISkinVertexSSE3>, 0,
    IBlendVertBuffer<ISkinVertexSSE41>);
 // ISetPipeConsts //////////////////////////////////////////////////////////////////
 // A shader can request that the pipeline fill in certain constants that are indeterminate
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
@ -354,6 +354,7 @@ protected:
    void            ICheckStaticVertexBuffer(plDXVertexBufferRef* vRef, plGBufferGroup* owner, uint32_t idx);
    void            ICheckIndexBuffer(plDXIndexBufferRef* iRef);
    void            IFillStaticVertexBufferRef(plDXVertexBufferRef *ref, plGBufferGroup *group, uint32_t idx);
    void            IFillVolatileVertexBufferRef(plDXVertexBufferRef* ref, plGBufferGroup* group, uint32_t idx);
    void            IFillIndexBufferRef(plDXIndexBufferRef* iRef, plGBufferGroup* owner, uint32_t idx);
    void            ISetupVertexBufferRef(plGBufferGroup* owner, uint32_t idx, plDXVertexBufferRef* vRef);
    void            ISetupIndexBufferRef(plGBufferGroup* owner, uint32_t idx, plDXIndexBufferRef* iRef);
@ -804,8 +805,6 @@ public:
    //  CPU-optimized functions
 protected:
    typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
    static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
    static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
    static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
 };