
Cleanup macro-mayhem

This converts the VERTBLEND macros to some clever templates. This code
should be much more maintainable.
Adam Johnson 12 years ago
parent commit aa7df368f1

  1. 419  Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
  2.   2  Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
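
For context on the approach: the old code pasted one near-identical loop body per instruction set out of BLENDVERT* macro fragments, while the new code writes the shared per-vertex loop once as a function template whose skinning kernel is a non-type template parameter (a plain function pointer), so each instantiation inlines its kernel at compile time. A minimal standalone sketch of that pattern, using hypothetical demo names rather than the engine's:

#include <cstdio>

// Kernel signature, mirroring the role of skin_vert_ptr in the diff below.
typedef void (*kernel_ptr)(float& dst, float src, float wgt);

void KernelScalar(float& dst, float src, float wgt) { dst += src * wgt; }
void KernelDouble(float& dst, float src, float wgt) { dst += src * wgt * 2.f; }

// The shared loop is written once; T is a compile-time constant, so each
// instantiation inlines its kernel -- no macro pasting, no indirect call.
template <kernel_ptr T>
void BlendBuffer(float* dst, const float* src, float wgt, int count)
{
    for (int i = 0; i < count; ++i)
        T(dst[i], src[i], wgt);
}

int main()
{
    const float src[3] = { 1.f, 2.f, 3.f };
    float dst[3] = { 0.f, 0.f, 0.f };
    BlendBuffer<KernelScalar>(dst, src, 0.5f, 3);       // one specialization
    BlendBuffer<KernelDouble>(dst, src, 0.5f, 3);       // another
    std::printf("%g %g %g\n", dst[0], dst[1], dst[2]);  // 1.5 3 4.5
}

Each instantiation is its own specialized loop; IBlendVertBuffer<ISkinVertexFPU> and IBlendVertBuffer<ISkinVertexSSE3> in the diff below work the same way.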

419  Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

@@ -10581,293 +10581,154 @@ inline void inlTESTPOINT(const hsPoint3& destP,
 // format, blends them into the destination buffer given without the blending
 // info.
-// FPU version
-#define MATRIXMULTBEGIN_FPU(xfm, wgt) \
-    float m00 = xfm.fMap[0][0]; \
-    float m01 = xfm.fMap[0][1]; \
-    float m02 = xfm.fMap[0][2]; \
-    float m03 = xfm.fMap[0][3]; \
-    float m10 = xfm.fMap[1][0]; \
-    float m11 = xfm.fMap[1][1]; \
-    float m12 = xfm.fMap[1][2]; \
-    float m13 = xfm.fMap[1][3]; \
-    float m20 = xfm.fMap[2][0]; \
-    float m21 = xfm.fMap[2][1]; \
-    float m22 = xfm.fMap[2][2]; \
-    float m23 = xfm.fMap[2][3]; \
-    float m_wgt = wgt; \
-    float srcX, srcY, srcZ;
-
-#define MATRIXMULTPOINTADD_FPU(dst, src) \
-    srcX = src.fX; \
-    srcY = src.fY; \
-    srcZ = src.fZ; \
-    \
-    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
-    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
-    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
-
-#define MATRIXMULTVECTORADD_FPU(dst, src) \
-    srcX = src.fX; \
-    srcY = src.fY; \
-    srcZ = src.fZ; \
-    \
-    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
-    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
-    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
-
-// SSE3 version
-#ifdef HS_SSE3
-#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
-    __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \
-    mc0 = _mm_load_ps(xfm.fMap[0]); \
-    mc1 = _mm_load_ps(xfm.fMap[1]); \
-    mc2 = _mm_load_ps(xfm.fMap[2]); \
-    mwt = _mm_set_ps1(wgt);
-
-#define MATRIXMULTBUFADD_SSE3(dst, src) \
-    msr = _mm_load_ps(src); \
-    _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
-    _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
-    _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
-    \
-    hbuf1 = _mm_hadd_ps(_x, _y); \
-    hbuf2 = _mm_hadd_ps(_z, _z); \
-    hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-    _dst = _mm_load_ps(dst); \
-    _dst = _mm_add_ps(_dst, hbuf1); \
-    _mm_store_ps(dst, _dst);
-
-#define MATRIXMULTVECTORADD_SSE3(dst, src) \
-    msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
-    _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
-    _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
-    _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
-    \
-    hbuf1 = _mm_hadd_ps(_x, _y); \
-    hbuf2 = _mm_hadd_ps(_z, _z); \
-    hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-    { \
-        ALIGN(16) float hack[4]; \
-        _mm_store_ps(hack, hbuf1); \
-        dst.fX += hack[0]; \
-        dst.fY += hack[1]; \
-        dst.fZ += hack[2]; \
-    }
-#endif
-
-// CPU-optimized functions requiring dispatch
-hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3);
-
-// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
-#define BLENDVERTSTART \
-    ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \
-    ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \
-    ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \
-    hsPoint3* pt = reinterpret_cast<hsPoint3*>(pt_buf); \
-    hsPoint3* destPt = reinterpret_cast<hsPoint3*>(destPt_buf); \
-    hsVector3* vec = reinterpret_cast<hsVector3*>(vec_buf); \
-    hsVector3* destNorm = reinterpret_cast<hsVector3*>(destNorm_buf); \
-    \
-    uint8_t numUVs, numWeights; \
-    uint32_t i, j, indices, color, specColor, uvChanSize; \
-    float weights[ 4 ], weightSum; \
-    \
-    /* Get some counts */\
-    switch( format & plGBufferGroup::kSkinWeightMask ) \
-    { \
-        case plGBufferGroup::kSkin1Weight: numWeights = 1; break; \
-        case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \
-        case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \
-        default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \
-    } \
-    \
-    numUVs = plGBufferGroup::CalcNumUVs( format ); \
-    uvChanSize = numUVs * sizeof( float ) * 3; \
-    \
-    /* localUVWChans is bump mapping tangent space vectors, which need to
-    // be skinned like the normal, as opposed to passed through like
-    // garden variety UVW coordinates.
-    // There are no localUVWChans that I know of in production assets (i.e.
-    // the avatar is not skinned).*/\
-    if( !localUVWChans ) \
-    { \
-        /* Copy whilst blending */\
-        for( i = 0; i < count; i++ ) \
-        { \
-            /* Extract data */\
-            src = inlExtractPoint( src, pt ); \
-            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
-            { \
-                src = inlExtractFloat( src, weights[ j ] ); \
-                weightSum += weights[ j ]; \
-            } \
-            weights[ j ] = 1 - weightSum; \
-            \
-            if( format & plGBufferGroup::kSkinIndices ) \
-            { \
-                src = inlExtractUInt32( src, indices ); \
-            } \
-            else \
-            { \
-                indices = 1 << 8; \
-            } \
-            src = inlExtractPoint( src, vec ); \
-            src = inlExtractUInt32( src, color ); \
-            src = inlExtractUInt32( src, specColor ); \
-            \
-            /* Blend */\
-            destPt->Set(0.f, 0.f, 0.f); \
-            destPt_buf[3] = 1.f; \
-            destNorm->Set(0.f, 0.f, 0.f); \
-            for( j = 0; j < numWeights + 1; j++ ) \
-            { \
-                if( weights[ j ] ) \
-                {
-
-/*
-                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
-                    MATRIXMULTPOINTADD(destPt, pt);
-                    MATRIXMULTVECTORADD(destNorm, vec);
-*/
-
-#define BLENDVERTMID \
-                } \
-                \
-                indices >>= 8; \
-            } \
-            /* Probably don't really need to renormalize this. There errors are
-            // going to be subtle and "smooth".*/\
-            /* hsFastMath::NormalizeAppr(destNorm);*/ \
-            \
-            /* Slam data into position now */\
-            dest = inlStuffPoint( dest, destPt ); \
-            dest = inlStuffPoint( dest, destNorm ); \
-            dest = inlStuffUInt32( dest, color ); \
-            dest = inlStuffUInt32( dest, specColor ); \
-            memcpy( dest, src, uvChanSize ); \
-            src += uvChanSize; \
-            dest += uvChanSize; \
-        } \
-    } \
-    else \
-    { \
-        uint8_t hiChan = localUVWChans >> 8; \
-        uint8_t loChan = localUVWChans & 0xff; \
-        /* Copy whilst blending */\
-        for( i = 0; i < count; i++ ) \
-        { \
-            hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
-            hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \
-            \
-            /* Extract data */\
-            src = inlExtractPoint( src, pt ); \
-            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
-            { \
-                src = inlExtractFloat( src, weights[ j ] ); \
-                weightSum += weights[ j ]; \
-            } \
-            weights[ j ] = 1 - weightSum; \
-            \
-            if( format & plGBufferGroup::kSkinIndices ) \
-            { \
-                src = inlExtractUInt32( src, indices ); \
-            } \
-            else \
-            { \
-                indices = 1 << 8; \
-            } \
-            \
-            src = inlExtractPoint( src, vec ); \
-            src = inlExtractUInt32( src, color ); \
-            src = inlExtractUInt32( src, specColor ); \
-            \
-            uint8_t k; \
-            for( k = 0; k < numUVs; k++ ) \
-            { \
-                src = inlExtractPoint( src, &srcUVWs[k] ); \
-            } \
-            memcpy( dstUVWs, srcUVWs, uvChanSize); \
-            dstUVWs[loChan].Set(0,0,0); \
-            dstUVWs[hiChan].Set(0,0,0); \
-            \
-            /* Blend */\
-            destPt->Set(0.f, 0.f, 0.f); \
-            destPt_buf[3] = 1.f; \
-            destNorm->Set(0.f, 0.f, 0.f); \
-            for( j = 0; j < numWeights + 1; j++ ) \
-            { \
-                if( weights[ j ] ) \
-                { \
-
-/*
-                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
-                    MATRIXMULTPOINTADD(destPt, pt);
-                    MATRIXMULTVECTORADD(destNorm, vec);
-                    MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]);
-                    MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]);
-*/
-
-#define BLENDVERTEND \
-                } \
-                \
-                indices >>= 8; \
-            } \
-            /* Probably don't really need to renormalize this. There errors are
-            // going to be subtle and "smooth". */\
-            /* hsFastMath::NormalizeAppr(destNorm); */\
-            /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\
-            /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\
-            \
-            /* Slam data into position now */\
-            dest = inlStuffPoint( dest, destPt ); \
-            dest = inlStuffPoint( dest, destNorm ); \
-            dest = inlStuffUInt32( dest, color ); \
-            dest = inlStuffUInt32( dest, specColor ); \
-            memcpy( dest, dstUVWs, uvChanSize ); \
-            dest += uvChanSize; \
-        } \
-    }
-
-void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
-                                          hsMatrix44* matrixPalette, int numMatrices,
-                                          const uint8_t *src, uint8_t format, uint32_t srcStride,
-                                          uint8_t *dest, uint32_t destStride, uint32_t count,
-                                          uint16_t localUVWChans )
-{
-    BLENDVERTSTART
-    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
-    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
-    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
-    BLENDVERTMID
-    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
-    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
-    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
-    MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
-    MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
-    BLENDVERTEND
-}
-
-void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
-                                           hsMatrix44* matrixPalette, int numMatrices,
-                                           const uint8_t *src, uint8_t format, uint32_t srcStride,
-                                           uint8_t *dest, uint32_t destStride, uint32_t count,
-                                           uint16_t localUVWChans )
-{
-#ifdef HS_SSE3
-    BLENDVERTSTART
-    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
-    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
-    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
-    BLENDVERTMID
-    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
-    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
-    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
-    MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
-    MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
-    BLENDVERTEND
-#endif // HS_SSE3
-}
+static inline void ISkinVertexFPU(const hsMatrix44& xfm, float wgt,
+                                  const float* pt_src, float* pt_dst,
+                                  const float* vec_src, float* vec_dst)
+{
+    const float& m00 = xfm.fMap[0][0];
+    const float& m01 = xfm.fMap[0][1];
+    const float& m02 = xfm.fMap[0][2];
+    const float& m03 = xfm.fMap[0][3];
+    const float& m10 = xfm.fMap[1][0];
+    const float& m11 = xfm.fMap[1][1];
+    const float& m12 = xfm.fMap[1][2];
+    const float& m13 = xfm.fMap[1][3];
+    const float& m20 = xfm.fMap[2][0];
+    const float& m21 = xfm.fMap[2][1];
+    const float& m22 = xfm.fMap[2][2];
+    const float& m23 = xfm.fMap[2][3];
+
+    // position
+    {
+        const float& srcX = pt_src[0];
+        const float& srcY = pt_src[1];
+        const float& srcZ = pt_src[2];
+
+        pt_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * wgt;
+        pt_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * wgt;
+        pt_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * wgt;
+    }
+
+    // normal
+    {
+        const float& srcX = vec_src[0];
+        const float& srcY = vec_src[1];
+        const float& srcZ = vec_src[2];
+
+        vec_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02) * wgt;
+        vec_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12) * wgt;
+        vec_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22) * wgt;
+    }
+}
+
+#ifdef HS_SSE3
+static inline void ISkinDpSSE3(const float* src, float* dst, const __m128& mc0,
+                               const __m128& mc1, const __m128& mc2, const __m128& mwt)
+{
+    __m128 msr = _mm_load_ps(src);
+    __m128 _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt);
+    __m128 _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt);
+    __m128 _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt);
+
+    __m128 hbuf1 = _mm_hadd_ps(_x, _y);
+    __m128 hbuf2 = _mm_hadd_ps(_z, _z);
+    hbuf1 = _mm_hadd_ps(hbuf1, hbuf2);
+    __m128 _dst = _mm_load_ps(dst);
+    _dst = _mm_add_ps(_dst, hbuf1);
+    _mm_store_ps(dst, _dst);
+}
+#endif // HS_SSE3
+
+static inline void ISkinVertexSSE3(const hsMatrix44& xfm, float wgt,
+                                   const float* pt_src, float* pt_dst,
+                                   const float* vec_src, float* vec_dst)
+{
+#ifdef HS_SSE3
+    __m128 mc0 = _mm_load_ps(xfm.fMap[0]);
+    __m128 mc1 = _mm_load_ps(xfm.fMap[1]);
+    __m128 mc2 = _mm_load_ps(xfm.fMap[2]);
+    __m128 mwt = _mm_set_ps1(wgt);
+
+    ISkinDpSSE3(pt_src, pt_dst, mc0, mc1, mc2, mwt);
+    ISkinDpSSE3(vec_src, vec_dst, mc0, mc1, mc2, mwt);
+#endif // HS_SSE3
+}
+
+typedef void(*skin_vert_ptr)(const hsMatrix44&, float, const float*, float*, const float*, float*);
+
+template<skin_vert_ptr T>
+static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMatrices,
+                             const uint8_t* src, uint8_t format, uint32_t srcStride,
+                             uint8_t* dest, uint32_t destStride, uint32_t count,
+                             uint16_t localUVWChans)
+{
+    ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f };
+    ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f };
+    ALIGN(16) float destPt_buf[4], destNorm_buf[4];
+    hsPoint3* pt = reinterpret_cast<hsPoint3*>(pt_buf);
+    hsPoint3* destPt = reinterpret_cast<hsPoint3*>(destPt_buf);
+    hsVector3* vec = reinterpret_cast<hsVector3*>(vec_buf);
+    hsVector3* destNorm = reinterpret_cast<hsVector3*>(destNorm_buf);
+
+    uint8_t numUVs = plGBufferGroup::CalcNumUVs(format);
+    uint32_t uvChanSize = numUVs * sizeof(float) * 3;
+    uint8_t numWeights = (format & plGBufferGroup::kSkinWeightMask) >> 4;
+    uint32_t indices, color, specColor;
+    float weights[4];
+
+    // Dropped support for localUVWChans at templatization of code
+    hsAssert(localUVWChans == 0, "support for skinned UVWs dropped. reimplement me?");
+
+    for (uint32_t i = 0; i < count; ++i) {
+        // Extract data
+        src = inlExtractPoint( src, pt );
+        float weightSum = 0.f;
+        for (uint8_t j = 0; j < numWeights; ++j) {
+            src = inlExtractFloat(src, weights[j]);
+            weightSum += weights[j];
+        }
+        weights[numWeights] = 1.f - weightSum;
+
+        if (format & plGBufferGroup::kSkinIndices)
+            src = inlExtractUInt32( src, indices );
+        else
+            indices = 1 << 8;
+        src = inlExtractPoint( src, vec );
+        src = inlExtractUInt32( src, color );
+        src = inlExtractUInt32( src, specColor );
+
+        // Blend
+        destPt->Set(0.f, 0.f, 0.f);
+        destPt_buf[3] = 1.f;
+        destNorm->Set(0.f, 0.f, 0.f);
+        destNorm_buf[3] = 0.f;
+        for (uint32_t j = 0; j < numWeights + 1; ++j) {
+            if (weights[j])
+                T(matrixPalette[indices & 0xFF], weights[j], pt_buf, destPt_buf, vec_buf, destNorm_buf);
+            indices >>= 8;
+        }
+        // Probably don't really need to renormalize this. The errors are
+        // going to be subtle and "smooth".
+        /* hsFastMath::NormalizeAppr(destNorm); */
+
+        // Slam data into position now
+        dest = inlStuffPoint( dest, destPt );
+        dest = inlStuffPoint( dest, destNorm );
+        dest = inlStuffUInt32( dest, color );
+        dest = inlStuffUInt32( dest, specColor );
+        memcpy( dest, src, uvChanSize );
+        src += uvChanSize;
+        dest += uvChanSize;
+    }
+}
+
+// CPU-optimized functions requiring dispatch
+hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(
+    IBlendVertBuffer<ISkinVertexFPU>, 0, 0, IBlendVertBuffer<ISkinVertexSSE3>);

 // ISetPipeConsts //////////////////////////////////////////////////////////////////
 // A shader can request that the pipeline fill in certain constants that are indeterminate
 // until the pipeline is about to render the object the shader is applied to. For example,
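
A note on the SSE3 helper in the hunk above: ISkinDpSSE3 evaluates three dot products at once. Each _mm_mul_ps forms the component-wise products of one matrix row with the vertex, kept in homogeneous form (w = 1 for points, so the translation column contributes; w = 0 for normals, so it drops out), and the _mm_hadd_ps chain folds the lanes into {dotX, dotY, dotZ, dotZ}. A standalone sketch of that reduction with made-up values (alignas standing in for the engine's ALIGN macro):

// Demo of the ISkinDpSSE3 reduction; build with e.g. -msse3.
#include <pmmintrin.h>   // SSE3: _mm_hadd_ps
#include <cstdio>

int main()
{
    // One matrix row per register; vertex kept homogeneous with w = 1,
    // so the translation column (last element of each row) participates.
    alignas(16) float row0[4] = { 1.f, 0.f, 0.f, 10.f };
    alignas(16) float row1[4] = { 0.f, 1.f, 0.f, 20.f };
    alignas(16) float row2[4] = { 0.f, 0.f, 1.f, 30.f };
    alignas(16) float src[4]  = { 1.f, 2.f, 3.f, 1.f };
    alignas(16) float dst[4]  = { 0.f, 0.f, 0.f, 0.f };

    __m128 mwt = _mm_set_ps1(0.5f);                  // blend weight
    __m128 msr = _mm_load_ps(src);
    __m128 _x = _mm_mul_ps(_mm_mul_ps(_mm_load_ps(row0), msr), mwt);
    __m128 _y = _mm_mul_ps(_mm_mul_ps(_mm_load_ps(row1), msr), mwt);
    __m128 _z = _mm_mul_ps(_mm_mul_ps(_mm_load_ps(row2), msr), mwt);

    // hadd(a, b) = {a0+a1, a2+a3, b0+b1, b2+b3}; two folds leave
    // {row0.src, row1.src, row2.src, row2.src} * wgt in one register.
    __m128 hbuf1 = _mm_hadd_ps(_x, _y);
    __m128 hbuf2 = _mm_hadd_ps(_z, _z);
    hbuf1 = _mm_hadd_ps(hbuf1, hbuf2);

    _mm_store_ps(dst, _mm_add_ps(_mm_load_ps(dst), hbuf1));
    std::printf("%g %g %g\n", dst[0], dst[1], dst[2]);   // 5.5 11 16.5
}

The duplicated dot product in the fourth lane accumulates into dst[3], which never reaches the output vertex (inlStuffPoint writes three floats), so it is harmless.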

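Also worth noting when reading IBlendVertBuffer: up to four matrix-palette indices arrive packed one per byte of a single uint32_t, walked with indices & 0xFF and indices >>= 8, and the 1 << 8 default (used when there is no kSkinIndices stream) selects matrix 0 for the explicit weight and matrix 1 for the implicit 1 - weightSum weight. A tiny illustration of the walk:

#include <cstdint>
#include <cstdio>

int main()
{
    // No kSkinIndices stream: 1 << 8 puts matrix 0 in the low byte for the
    // explicit weight and matrix 1 in the next byte for the implicit one.
    uint32_t indices = 1 << 8;
    float weights[2] = { 0.75f, 0.25f };   // 0.25 = 1 - weightSum
    for (int j = 0; j < 2; ++j) {
        std::printf("weight %g -> matrixPalette[%u]\n",
                    weights[j], (unsigned)(indices & 0xFF));
        indices >>= 8;
    }
}
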
2  Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h

@@ -804,8 +804,6 @@ public:
     // CPU-optimized functions
 protected:
     typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
-    static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
-    static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
     static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
 };
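
hsFunctionDispatcher itself is not part of this diff; judging only from the call site in plDXPipeline.cpp, it appears to pick the best implementation the CPU supports once, from candidates ordered by instruction-set level, with null slots meaning "not provided". A purely hypothetical sketch of such a dispatcher (assumed API, not the engine's):

#include <cstdio>

// Hypothetical sketch -- the real hsFunctionDispatcher is defined elsewhere
// in the engine. Assumed semantics: (fpu, sse1, sse2, sse3) candidates,
// matching the blend_vert_buffer(fpu, 0, 0, sse3) construction above.
template <typename FuncPtr>
class FunctionDispatcherSketch
{
public:
    FunctionDispatcherSketch(FuncPtr fpu, FuncPtr sse1, FuncPtr sse2, FuncPtr sse3)
        : fFunc(fpu)
    {
        if (sse1 && CpuHasSSE(1)) fFunc = sse1;
        if (sse2 && CpuHasSSE(2)) fFunc = sse2;
        if (sse3 && CpuHasSSE(3)) fFunc = sse3;
    }

    FuncPtr Get() const { return fFunc; }

private:
    // Stub for the sketch; real code would query CPUID once at startup.
    static bool CpuHasSSE(int /*level*/) { return true; }

    FuncPtr fFunc;
};

typedef void (*greet_ptr)();
void GreetFPU()  { std::printf("fpu\n"); }
void GreetSSE3() { std::printf("sse3\n"); }

int main()
{
    FunctionDispatcherSketch<greet_ptr> greet(GreetFPU, 0, 0, GreetSSE3);
    greet.Get()();   // prints "sse3" under the permissive stub above
}

The benefit of resolving the function pointer once at construction is that per-call dispatch is a single indirect call, with no feature checks on the hot path.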
