diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp index d4ad28d4..4ee901ef 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp @@ -10581,293 +10581,154 @@ inline void inlTESTPOINT(const hsPoint3& destP, // format, blends them into the destination buffer given without the blending // info. -// FPU version -#define MATRIXMULTBEGIN_FPU(xfm, wgt) \ - float m00 = xfm.fMap[0][0]; \ - float m01 = xfm.fMap[0][1]; \ - float m02 = xfm.fMap[0][2]; \ - float m03 = xfm.fMap[0][3]; \ - float m10 = xfm.fMap[1][0]; \ - float m11 = xfm.fMap[1][1]; \ - float m12 = xfm.fMap[1][2]; \ - float m13 = xfm.fMap[1][3]; \ - float m20 = xfm.fMap[2][0]; \ - float m21 = xfm.fMap[2][1]; \ - float m22 = xfm.fMap[2][2]; \ - float m23 = xfm.fMap[2][3]; \ - float m_wgt = wgt; \ - float srcX, srcY, srcZ; -#define MATRIXMULTPOINTADD_FPU(dst, src) \ - srcX = src.fX; \ - srcY = src.fY; \ - srcZ = src.fZ; \ - \ - dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \ - dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \ - dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt; -#define MATRIXMULTVECTORADD_FPU(dst, src) \ - srcX = src.fX; \ - srcY = src.fY; \ - srcZ = src.fZ; \ - \ - dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \ - dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \ - dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt; - -// SSE3 version +static inline void ISkinVertexFPU(const hsMatrix44& xfm, float wgt, + const float* pt_src, float* pt_dst, + const float* vec_src, float* vec_dst) +{ + const float& m00 = xfm.fMap[0][0]; + const float& m01 = xfm.fMap[0][1]; + const float& m02 = xfm.fMap[0][2]; + const float& m03 = xfm.fMap[0][3]; + const float& m10 = xfm.fMap[1][0]; + const float& m11 = xfm.fMap[1][1]; + const float& m12 = xfm.fMap[1][2]; + const float& m13 = xfm.fMap[1][3]; + const 
float& m20 = xfm.fMap[2][0]; + const float& m21 = xfm.fMap[2][1]; + const float& m22 = xfm.fMap[2][2]; + const float& m23 = xfm.fMap[2][3]; + + // position + { + const float& srcX = pt_src[0]; + const float& srcY = pt_src[1]; + const float& srcZ = pt_src[2]; + + pt_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * wgt; + pt_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * wgt; + pt_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * wgt; + } + + // normal + { + const float& srcX = vec_src[0]; + const float& srcY = vec_src[1]; + const float& srcZ = vec_src[2]; + + vec_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02) * wgt; + vec_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12) * wgt; + vec_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22) * wgt; /* fix: z row; was vec_dst[1], which double-added Y and left Z stale */ + } +} + #ifdef HS_SSE3 -#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \ - __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \ - mc0 = _mm_load_ps(xfm.fMap[0]); \ - mc1 = _mm_load_ps(xfm.fMap[1]); \ - mc2 = _mm_load_ps(xfm.fMap[2]); \ - mwt = _mm_set_ps1(wgt); -#define MATRIXMULTBUFADD_SSE3(dst, src) \ - msr = _mm_load_ps(src); \ - _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ - _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ - _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ - \ - hbuf1 = _mm_hadd_ps(_x, _y); \ - hbuf2 = _mm_hadd_ps(_z, _z); \ - hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ - _dst = _mm_load_ps(dst); \ - _dst = _mm_add_ps(_dst, hbuf1); \ - _mm_store_ps(dst, _dst); -#define MATRIXMULTVECTORADD_SSE3(dst, src) \ - msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \ - _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ - _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ - _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ - \ - hbuf1 = _mm_hadd_ps(_x, _y); \ - hbuf2 = _mm_hadd_ps(_z, _z); \ - hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ - { \ - ALIGN(16) float hack[4]; \ - _mm_store_ps(hack, hbuf1); \ - dst.fX += hack[0]; \ - dst.fY += hack[1]; \ - dst.fZ += hack[2]; \ - } -#endif +static inline void ISkinDpSSE3(const float* 
src, float* dst, const __m128& mc0, + const __m128& mc1, const __m128& mc2, const __m128& mwt) +{ + __m128 msr = _mm_load_ps(src); + __m128 _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); + __m128 _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); + __m128 _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); -// CPU-optimized functions requiring dispatch -hsFunctionDispatcher plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3); - -// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication -#define BLENDVERTSTART \ - ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \ - ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \ - ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \ - hsPoint3* pt = reinterpret_cast(pt_buf); \ - hsPoint3* destPt = reinterpret_cast(destPt_buf); \ - hsVector3* vec = reinterpret_cast(vec_buf); \ - hsVector3* destNorm = reinterpret_cast(destNorm_buf); \ - \ - uint8_t numUVs, numWeights; \ - uint32_t i, j, indices, color, specColor, uvChanSize; \ - float weights[ 4 ], weightSum; \ - \ - /* Get some counts */\ - switch( format & plGBufferGroup::kSkinWeightMask ) \ - { \ - case plGBufferGroup::kSkin1Weight: numWeights = 1; break; \ - case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \ - case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \ - default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \ - } \ - \ - numUVs = plGBufferGroup::CalcNumUVs( format ); \ - uvChanSize = numUVs * sizeof( float ) * 3; \ - \ - /* localUVWChans is bump mapping tangent space vectors, which need to - // be skinned like the normal, as opposed to passed through like - // garden variety UVW coordinates. - // There are no localUVWChans that I know of in production assets (i.e. 
- // the avatar is not skinned).*/\ - if( !localUVWChans ) \ - { \ - /* Copy whilst blending */\ - for( i = 0; i < count; i++ ) \ - { \ - /* Extract data */\ - src = inlExtractPoint( src, pt ); \ - for( j = 0, weightSum = 0; j < numWeights; j++ ) \ - { \ - src = inlExtractFloat( src, weights[ j ] ); \ - weightSum += weights[ j ]; \ - } \ - weights[ j ] = 1 - weightSum; \ - \ - if( format & plGBufferGroup::kSkinIndices ) \ - { \ - src = inlExtractUInt32( src, indices ); \ - } \ - else \ - { \ - indices = 1 << 8; \ - } \ - src = inlExtractPoint( src, vec ); \ - src = inlExtractUInt32( src, color ); \ - src = inlExtractUInt32( src, specColor ); \ - \ - /* Blend */\ - destPt->Set(0.f, 0.f, 0.f); \ - destPt_buf[3] = 1.f; \ - destNorm->Set(0.f, 0.f, 0.f); \ - for( j = 0; j < numWeights + 1; j++ ) \ - { \ - if( weights[ j ] ) \ - { - /* - MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTPOINTADD(destPt, pt); - MATRIXMULTVECTORADD(destNorm, vec); - */ -#define BLENDVERTMID \ - } \ - \ - indices >>= 8; \ - } \ - /* Probably don't really need to renormalize this. 
There errors are - // going to be subtle and "smooth".*/\ - /* hsFastMath::NormalizeAppr(destNorm);*/ \ - \ - /* Slam data into position now */\ - dest = inlStuffPoint( dest, destPt ); \ - dest = inlStuffPoint( dest, destNorm ); \ - dest = inlStuffUInt32( dest, color ); \ - dest = inlStuffUInt32( dest, specColor ); \ - memcpy( dest, src, uvChanSize ); \ - src += uvChanSize; \ - dest += uvChanSize; \ - } \ - } \ - else \ - { \ - uint8_t hiChan = localUVWChans >> 8; \ - uint8_t loChan = localUVWChans & 0xff; \ - /* Copy whilst blending */\ - for( i = 0; i < count; i++ ) \ - { \ - hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \ - hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \ - \ - /* Extract data */\ - src = inlExtractPoint( src, pt ); \ - for( j = 0, weightSum = 0; j < numWeights; j++ ) \ - { \ - src = inlExtractFloat( src, weights[ j ] ); \ - weightSum += weights[ j ]; \ - } \ - weights[ j ] = 1 - weightSum; \ - \ - if( format & plGBufferGroup::kSkinIndices ) \ - { \ - src = inlExtractUInt32( src, indices ); \ - } \ - else \ - { \ - indices = 1 << 8; \ - } \ - \ - src = inlExtractPoint( src, vec ); \ - src = inlExtractUInt32( src, color ); \ - src = inlExtractUInt32( src, specColor ); \ - \ - uint8_t k; \ - for( k = 0; k < numUVs; k++ ) \ - { \ - src = inlExtractPoint( src, &srcUVWs[k] ); \ - } \ - memcpy( dstUVWs, srcUVWs, uvChanSize); \ - dstUVWs[loChan].Set(0,0,0); \ - dstUVWs[hiChan].Set(0,0,0); \ - \ - /* Blend */\ - destPt->Set(0.f, 0.f, 0.f); \ - destPt_buf[3] = 1.f; \ - destNorm->Set(0.f, 0.f, 0.f); \ - for( j = 0; j < numWeights + 1; j++ ) \ - { \ - if( weights[ j ] ) \ - { \ - /* - MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTPOINTADD(destPt, pt); - MATRIXMULTVECTORADD(destNorm, vec); - MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]); - MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]); - */ -#define BLENDVERTEND \ - } \ - \ - indices >>= 8; \ - } \ - /* Probably don't really need to renormalize 
this. There errors are - // going to be subtle and "smooth". */\ - /* hsFastMath::NormalizeAppr(destNorm); */\ - /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\ - /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\ - \ - /* Slam data into position now */\ - dest = inlStuffPoint( dest, destPt ); \ - dest = inlStuffPoint( dest, destNorm ); \ - dest = inlStuffUInt32( dest, color ); \ - dest = inlStuffUInt32( dest, specColor ); \ - memcpy( dest, dstUVWs, uvChanSize ); \ - dest += uvChanSize; \ - } \ - } - -void plDXPipeline::blend_vert_buffer_fpu( plSpan* span, - hsMatrix44* matrixPalette, int numMatrices, - const uint8_t *src, uint8_t format, uint32_t srcStride, - uint8_t *dest, uint32_t destStride, uint32_t count, - uint16_t localUVWChans ) -{ - BLENDVERTSTART - MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTPOINTADD_FPU((*destPt), (*pt)); - MATRIXMULTVECTORADD_FPU((*destNorm), (*vec)); - BLENDVERTMID - MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTPOINTADD_FPU((*destPt), (*pt)); - MATRIXMULTVECTORADD_FPU((*destNorm), (*vec)); - MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]); - MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]); - - BLENDVERTEND -} - -void plDXPipeline::blend_vert_buffer_sse3( plSpan* span, - hsMatrix44* matrixPalette, int numMatrices, - const uint8_t *src, uint8_t format, uint32_t srcStride, - uint8_t *dest, uint32_t destStride, uint32_t count, - uint16_t localUVWChans ) + __m128 hbuf1 = _mm_hadd_ps(_x, _y); + __m128 hbuf2 = _mm_hadd_ps(_z, _z); + hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); + __m128 _dst = _mm_load_ps(dst); + _dst = _mm_add_ps(_dst, hbuf1); + _mm_store_ps(dst, _dst); +} +#endif // HS_SSE3 + +static inline void ISkinVertexSSE3(const hsMatrix44& xfm, float wgt, + const float* pt_src, float* pt_dst, + const float* vec_src, float* vec_dst) { #ifdef HS_SSE3 - BLENDVERTSTART - MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); - - 
MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf); - MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf); - BLENDVERTMID - MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); - - MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf); - MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf); - MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]); - MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]); - BLENDVERTEND + __m128 mc0 = _mm_load_ps(xfm.fMap[0]); + __m128 mc1 = _mm_load_ps(xfm.fMap[1]); + __m128 mc2 = _mm_load_ps(xfm.fMap[2]); + __m128 mwt = _mm_set_ps1(wgt); + + ISkinDpSSE3(pt_src, pt_dst, mc0, mc1, mc2, mwt); + ISkinDpSSE3(vec_src, vec_dst, mc0, mc1, mc2, mwt); #endif // HS_SSE3 } +typedef void(*skin_vert_ptr)(const hsMatrix44&, float, const float*, float*, const float*, float*); + +template +static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMatrices, + const uint8_t* src, uint8_t format, uint32_t srcStride, + uint8_t* dest, uint32_t destStride, uint32_t count, + uint16_t localUVWChans) +{ + ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; + ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; + ALIGN(16) float destPt_buf[4], destNorm_buf[4]; + hsPoint3* pt = reinterpret_cast(pt_buf); + hsPoint3* destPt = reinterpret_cast(destPt_buf); + hsVector3* vec = reinterpret_cast(vec_buf); + hsVector3* destNorm = reinterpret_cast(destNorm_buf); + + uint8_t numUVs; + uint32_t indices, color, specColor, uvChanSize; + float weights[4]; + + numUVs = plGBufferGroup::CalcNumUVs(format); + uvChanSize = numUVs * sizeof(float) * 3; + uint8_t numWeights = (format & plGBufferGroup::kSkinWeightMask) >> 4; + + // Dropped support for localUVWChans at templatization of code + hsAssert(localUVWChans == 0, "support for skinned UVWs dropped. 
reimplement me?"); + + for (uint32_t i = 0; i < count; ++i) { + // Extract data + src = inlExtractPoint( src, pt ); + + float weightSum = 0.f; + for (uint8_t j = 0; j < numWeights; ++j) { + src = inlExtractFloat(src, weights[j]); + weightSum += weights[j]; + } + weights[numWeights] = 1.f - weightSum; + + if (format & plGBufferGroup::kSkinIndices) + src = inlExtractUInt32( src, indices ); + else + indices = 1 << 8; + src = inlExtractPoint( src, vec ); + src = inlExtractUInt32( src, color ); + src = inlExtractUInt32( src, specColor ); + + // Blend + destPt->Set(0.f, 0.f, 0.f); + destPt_buf[3] = 1.f; + destNorm->Set(0.f, 0.f, 0.f); + destNorm_buf[3] = 0.f; + for (uint32_t j = 0; j < numWeights + 1; ++j) { + if (weights[j]) + T(matrixPalette[indices & 0xFF], weights[j], pt_buf, destPt_buf, vec_buf, destNorm_buf); + indices >>= 8; + } + // Probably don't really need to renormalize this. There errors are + // going to be subtle and "smooth". + /* hsFastMath::NormalizeAppr(destNorm); */ + + // Slam data into position now + dest = inlStuffPoint( dest, destPt ); + dest = inlStuffPoint( dest, destNorm ); + dest = inlStuffUInt32( dest, color ); + dest = inlStuffUInt32( dest, specColor ); + memcpy( dest, src, uvChanSize ); + src += uvChanSize; + dest += uvChanSize; + } +} + +// CPU-optimized functions requiring dispatch +hsFunctionDispatcher plDXPipeline::blend_vert_buffer( + IBlendVertBuffer, 0, 0, IBlendVertBuffer); + // ISetPipeConsts ////////////////////////////////////////////////////////////////// // A shader can request that the pipeline fill in certain constants that are indeterminate // until the pipeline is about to render the object the shader is applied to. 
For example, diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h index b14cc03c..0cc8dacf 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h @@ -804,8 +804,6 @@ public: // CPU-optimized functions protected: typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); - static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); - static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); static hsFunctionDispatcher blend_vert_buffer; };