diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp index d4ad28d4..4ee901ef 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp @@ -10581,293 +10581,154 @@ inline void inlTESTPOINT(const hsPoint3& destP, // format, blends them into the destination buffer given without the blending // info. -// FPU version -#define MATRIXMULTBEGIN_FPU(xfm, wgt) \ - float m00 = xfm.fMap[0][0]; \ - float m01 = xfm.fMap[0][1]; \ - float m02 = xfm.fMap[0][2]; \ - float m03 = xfm.fMap[0][3]; \ - float m10 = xfm.fMap[1][0]; \ - float m11 = xfm.fMap[1][1]; \ - float m12 = xfm.fMap[1][2]; \ - float m13 = xfm.fMap[1][3]; \ - float m20 = xfm.fMap[2][0]; \ - float m21 = xfm.fMap[2][1]; \ - float m22 = xfm.fMap[2][2]; \ - float m23 = xfm.fMap[2][3]; \ - float m_wgt = wgt; \ - float srcX, srcY, srcZ; -#define MATRIXMULTPOINTADD_FPU(dst, src) \ - srcX = src.fX; \ - srcY = src.fY; \ - srcZ = src.fZ; \ - \ - dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \ - dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \ - dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt; -#define MATRIXMULTVECTORADD_FPU(dst, src) \ - srcX = src.fX; \ - srcY = src.fY; \ - srcZ = src.fZ; \ - \ - dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \ - dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \ - dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt; - -// SSE3 version +static inline void ISkinVertexFPU(const hsMatrix44& xfm, float wgt, + const float* pt_src, float* pt_dst, + const float* vec_src, float* vec_dst) +{ + const float& m00 = xfm.fMap[0][0]; + const float& m01 = xfm.fMap[0][1]; + const float& m02 = xfm.fMap[0][2]; + const float& m03 = xfm.fMap[0][3]; + const float& m10 = xfm.fMap[1][0]; + const float& m11 = xfm.fMap[1][1]; + const float& m12 = xfm.fMap[1][2]; + const float& m13 = xfm.fMap[1][3]; + const 
float& m20 = xfm.fMap[2][0]; + const float& m21 = xfm.fMap[2][1]; + const float& m22 = xfm.fMap[2][2]; + const float& m23 = xfm.fMap[2][3]; + + // position + { + const float& srcX = pt_src[0]; + const float& srcY = pt_src[1]; + const float& srcZ = pt_src[2]; + + pt_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * wgt; + pt_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * wgt; + pt_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * wgt; + } + + // normal + { + const float& srcX = vec_src[0]; + const float& srcY = vec_src[1]; + const float& srcZ = vec_src[2]; + + vec_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02) * wgt; + vec_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12) * wgt; + vec_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22) * wgt; /* fix: z row; was vec_dst[1], which double-added Y and left Z stale */ + } +} + #ifdef HS_SSE3 -#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \ - __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \ - mc0 = _mm_load_ps(xfm.fMap[0]); \ - mc1 = _mm_load_ps(xfm.fMap[1]); \ - mc2 = _mm_load_ps(xfm.fMap[2]); \ - mwt = _mm_set_ps1(wgt); -#define MATRIXMULTBUFADD_SSE3(dst, src) \ - msr = _mm_load_ps(src); \ - _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ - _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ - _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ - \ - hbuf1 = _mm_hadd_ps(_x, _y); \ - hbuf2 = _mm_hadd_ps(_z, _z); \ - hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ - _dst = _mm_load_ps(dst); \ - _dst = _mm_add_ps(_dst, hbuf1); \ - _mm_store_ps(dst, _dst); -#define MATRIXMULTVECTORADD_SSE3(dst, src) \ - msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \ - _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ - _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ - _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ - \ - hbuf1 = _mm_hadd_ps(_x, _y); \ - hbuf2 = _mm_hadd_ps(_z, _z); \ - hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ - { \ - ALIGN(16) float hack[4]; \ - _mm_store_ps(hack, hbuf1); \ - dst.fX += hack[0]; \ - dst.fY += hack[1]; \ - dst.fZ += hack[2]; \ - } -#endif +static inline void ISkinDpSSE3(const float* 
src, float* dst, const __m128& mc0, + const __m128& mc1, const __m128& mc2, const __m128& mwt) +{ + __m128 msr = _mm_load_ps(src); + __m128 _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); + __m128 _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); + __m128 _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); -// CPU-optimized functions requiring dispatch -hsFunctionDispatcher plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3); - -// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication -#define BLENDVERTSTART \ - ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \ - ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \ - ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \ - hsPoint3* pt = reinterpret_cast(pt_buf); \ - hsPoint3* destPt = reinterpret_cast(destPt_buf); \ - hsVector3* vec = reinterpret_cast(vec_buf); \ - hsVector3* destNorm = reinterpret_cast(destNorm_buf); \ - \ - uint8_t numUVs, numWeights; \ - uint32_t i, j, indices, color, specColor, uvChanSize; \ - float weights[ 4 ], weightSum; \ - \ - /* Get some counts */\ - switch( format & plGBufferGroup::kSkinWeightMask ) \ - { \ - case plGBufferGroup::kSkin1Weight: numWeights = 1; break; \ - case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \ - case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \ - default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \ - } \ - \ - numUVs = plGBufferGroup::CalcNumUVs( format ); \ - uvChanSize = numUVs * sizeof( float ) * 3; \ - \ - /* localUVWChans is bump mapping tangent space vectors, which need to - // be skinned like the normal, as opposed to passed through like - // garden variety UVW coordinates. - // There are no localUVWChans that I know of in production assets (i.e. 
- // the avatar is not skinned).*/\ - if( !localUVWChans ) \ - { \ - /* Copy whilst blending */\ - for( i = 0; i < count; i++ ) \ - { \ - /* Extract data */\ - src = inlExtractPoint( src, pt ); \ - for( j = 0, weightSum = 0; j < numWeights; j++ ) \ - { \ - src = inlExtractFloat( src, weights[ j ] ); \ - weightSum += weights[ j ]; \ - } \ - weights[ j ] = 1 - weightSum; \ - \ - if( format & plGBufferGroup::kSkinIndices ) \ - { \ - src = inlExtractUInt32( src, indices ); \ - } \ - else \ - { \ - indices = 1 << 8; \ - } \ - src = inlExtractPoint( src, vec ); \ - src = inlExtractUInt32( src, color ); \ - src = inlExtractUInt32( src, specColor ); \ - \ - /* Blend */\ - destPt->Set(0.f, 0.f, 0.f); \ - destPt_buf[3] = 1.f; \ - destNorm->Set(0.f, 0.f, 0.f); \ - for( j = 0; j < numWeights + 1; j++ ) \ - { \ - if( weights[ j ] ) \ - { - /* - MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTPOINTADD(destPt, pt); - MATRIXMULTVECTORADD(destNorm, vec); - */ -#define BLENDVERTMID \ - } \ - \ - indices >>= 8; \ - } \ - /* Probably don't really need to renormalize this. 
There errors are - // going to be subtle and "smooth".*/\ - /* hsFastMath::NormalizeAppr(destNorm);*/ \ - \ - /* Slam data into position now */\ - dest = inlStuffPoint( dest, destPt ); \ - dest = inlStuffPoint( dest, destNorm ); \ - dest = inlStuffUInt32( dest, color ); \ - dest = inlStuffUInt32( dest, specColor ); \ - memcpy( dest, src, uvChanSize ); \ - src += uvChanSize; \ - dest += uvChanSize; \ - } \ - } \ - else \ - { \ - uint8_t hiChan = localUVWChans >> 8; \ - uint8_t loChan = localUVWChans & 0xff; \ - /* Copy whilst blending */\ - for( i = 0; i < count; i++ ) \ - { \ - hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \ - hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \ - \ - /* Extract data */\ - src = inlExtractPoint( src, pt ); \ - for( j = 0, weightSum = 0; j < numWeights; j++ ) \ - { \ - src = inlExtractFloat( src, weights[ j ] ); \ - weightSum += weights[ j ]; \ - } \ - weights[ j ] = 1 - weightSum; \ - \ - if( format & plGBufferGroup::kSkinIndices ) \ - { \ - src = inlExtractUInt32( src, indices ); \ - } \ - else \ - { \ - indices = 1 << 8; \ - } \ - \ - src = inlExtractPoint( src, vec ); \ - src = inlExtractUInt32( src, color ); \ - src = inlExtractUInt32( src, specColor ); \ - \ - uint8_t k; \ - for( k = 0; k < numUVs; k++ ) \ - { \ - src = inlExtractPoint( src, &srcUVWs[k] ); \ - } \ - memcpy( dstUVWs, srcUVWs, uvChanSize); \ - dstUVWs[loChan].Set(0,0,0); \ - dstUVWs[hiChan].Set(0,0,0); \ - \ - /* Blend */\ - destPt->Set(0.f, 0.f, 0.f); \ - destPt_buf[3] = 1.f; \ - destNorm->Set(0.f, 0.f, 0.f); \ - for( j = 0; j < numWeights + 1; j++ ) \ - { \ - if( weights[ j ] ) \ - { \ - /* - MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTPOINTADD(destPt, pt); - MATRIXMULTVECTORADD(destNorm, vec); - MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]); - MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]); - */ -#define BLENDVERTEND \ - } \ - \ - indices >>= 8; \ - } \ - /* Probably don't really need to renormalize 
this. There errors are - // going to be subtle and "smooth". */\ - /* hsFastMath::NormalizeAppr(destNorm); */\ - /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\ - /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\ - \ - /* Slam data into position now */\ - dest = inlStuffPoint( dest, destPt ); \ - dest = inlStuffPoint( dest, destNorm ); \ - dest = inlStuffUInt32( dest, color ); \ - dest = inlStuffUInt32( dest, specColor ); \ - memcpy( dest, dstUVWs, uvChanSize ); \ - dest += uvChanSize; \ - } \ - } - -void plDXPipeline::blend_vert_buffer_fpu( plSpan* span, - hsMatrix44* matrixPalette, int numMatrices, - const uint8_t *src, uint8_t format, uint32_t srcStride, - uint8_t *dest, uint32_t destStride, uint32_t count, - uint16_t localUVWChans ) -{ - BLENDVERTSTART - MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTPOINTADD_FPU((*destPt), (*pt)); - MATRIXMULTVECTORADD_FPU((*destNorm), (*vec)); - BLENDVERTMID - MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTPOINTADD_FPU((*destPt), (*pt)); - MATRIXMULTVECTORADD_FPU((*destNorm), (*vec)); - MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]); - MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]); - - BLENDVERTEND -} - -void plDXPipeline::blend_vert_buffer_sse3( plSpan* span, - hsMatrix44* matrixPalette, int numMatrices, - const uint8_t *src, uint8_t format, uint32_t srcStride, - uint8_t *dest, uint32_t destStride, uint32_t count, - uint16_t localUVWChans ) + __m128 hbuf1 = _mm_hadd_ps(_x, _y); + __m128 hbuf2 = _mm_hadd_ps(_z, _z); + hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); + __m128 _dst = _mm_load_ps(dst); + _dst = _mm_add_ps(_dst, hbuf1); + _mm_store_ps(dst, _dst); +} +#endif // HS_SSE3 + +static inline void ISkinVertexSSE3(const hsMatrix44& xfm, float wgt, + const float* pt_src, float* pt_dst, + const float* vec_src, float* vec_dst) { #ifdef HS_SSE3 - BLENDVERTSTART - MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); - - 
MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf); - MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf); - BLENDVERTMID - MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf); - MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf); - MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]); - MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]); - BLENDVERTEND + __m128 mc0 = _mm_load_ps(xfm.fMap[0]); + __m128 mc1 = _mm_load_ps(xfm.fMap[1]); + __m128 mc2 = _mm_load_ps(xfm.fMap[2]); + __m128 mwt = _mm_set_ps1(wgt); + + ISkinDpSSE3(pt_src, pt_dst, mc0, mc1, mc2, mwt); + ISkinDpSSE3(vec_src, vec_dst, mc0, mc1, mc2, mwt); #endif // HS_SSE3 } +typedef void(*skin_vert_ptr)(const hsMatrix44&, float, const float*, float*, const float*, float*); + +template +static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMatrices, + const uint8_t* src, uint8_t format, uint32_t srcStride, + uint8_t* dest, uint32_t destStride, uint32_t count, + uint16_t localUVWChans) +{ + ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; + ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; + ALIGN(16) float destPt_buf[4], destNorm_buf[4]; + hsPoint3* pt = reinterpret_cast(pt_buf); + hsPoint3* destPt = reinterpret_cast(destPt_buf); + hsVector3* vec = reinterpret_cast(vec_buf); + hsVector3* destNorm = reinterpret_cast(destNorm_buf); + + uint8_t numUVs; + uint32_t indices, color, specColor, uvChanSize; + float weights[4]; + + numUVs = plGBufferGroup::CalcNumUVs(format); + uvChanSize = numUVs * sizeof(float) * 3; + uint8_t numWeights = (format & plGBufferGroup::kSkinWeightMask) >> 4; + + // Dropped support for localUVWChans at templatization of code + hsAssert(localUVWChans == 0, "support for skinned UVWs dropped. 
reimplement me?"); + + for (uint32_t i = 0; i < count; ++i) { + // Extract data + src = inlExtractPoint( src, pt ); + + float weightSum = 0.f; + for (uint8_t j = 0; j < numWeights; ++j) { + src = inlExtractFloat(src, weights[j]); + weightSum += weights[j]; + } + weights[numWeights] = 1.f - weightSum; + + if (format & plGBufferGroup::kSkinIndices) + src = inlExtractUInt32( src, indices ); + else + indices = 1 << 8; + src = inlExtractPoint( src, vec ); + src = inlExtractUInt32( src, color ); + src = inlExtractUInt32( src, specColor ); + + // Blend + destPt->Set(0.f, 0.f, 0.f); + destPt_buf[3] = 1.f; + destNorm->Set(0.f, 0.f, 0.f); + destNorm_buf[3] = 0.f; + for (uint32_t j = 0; j < numWeights + 1; ++j) { + if (weights[j]) + T(matrixPalette[indices & 0xFF], weights[j], pt_buf, destPt_buf, vec_buf, destNorm_buf); + indices >>= 8; + } + // Probably don't really need to renormalize this. There errors are + // going to be subtle and "smooth". + /* hsFastMath::NormalizeAppr(destNorm); */ + + // Slam data into position now + dest = inlStuffPoint( dest, destPt ); + dest = inlStuffPoint( dest, destNorm ); + dest = inlStuffUInt32( dest, color ); + dest = inlStuffUInt32( dest, specColor ); + memcpy( dest, src, uvChanSize ); + src += uvChanSize; + dest += uvChanSize; + } +} + +// CPU-optimized functions requiring dispatch +hsFunctionDispatcher plDXPipeline::blend_vert_buffer( + IBlendVertBuffer, 0, 0, IBlendVertBuffer); + // ISetPipeConsts ////////////////////////////////////////////////////////////////// // A shader can request that the pipeline fill in certain constants that are indeterminate // until the pipeline is about to render the object the shader is applied to. 
For example, diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h index b14cc03c..0cc8dacf 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h @@ -804,8 +804,6 @@ public: // CPU-optimized functions protected: typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); - static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); - static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); static hsFunctionDispatcher blend_vert_buffer; };