diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp index 413e98db..9bc0373f 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp @@ -10525,17 +10525,76 @@ void plDXPipeline::LoadResources() plNetClientApp::StaticDebugMsg("End Device Reload"); } -// Sorry about this, but it really did speed up the skinning. -// Just some macros for the inner loop of IBlendVertsIntoBuffer. +// inlTESTPOINT ///////////////////////////////////////// +// Update mins and maxs if destP is outside. +inline void inlTESTPOINT(const hsPoint3& destP, + float& minX, float& minY, float& minZ, + float& maxX, float& maxY, float& maxZ) +{ + if( destP.fX < minX ) + minX = destP.fX; + else if( destP.fX > maxX ) + maxX = destP.fX; + + if( destP.fY < minY ) + minY = destP.fY; + else if( destP.fY > maxY ) + maxY = destP.fY; + + if( destP.fZ < minZ ) + minZ = destP.fZ; + else if( destP.fZ > maxZ ) + maxZ = destP.fZ; +} + +//// IBlendVertsIntoBuffer //////////////////////////////////////////////////// +// Given a pointer into a buffer of verts that have blending data in the D3D +// format, blends them into the destination buffer given without the blending +// info. + +// FPU version +#define MATRIXMULTBEGIN_FPU(xfm, wgt) \ + float m00 = xfm.fMap[0][0]; \ + float m01 = xfm.fMap[0][1]; \ + float m02 = xfm.fMap[0][2]; \ + float m03 = xfm.fMap[0][3]; \ + float m10 = xfm.fMap[1][0]; \ + float m11 = xfm.fMap[1][1]; \ + float m12 = xfm.fMap[1][2]; \ + float m13 = xfm.fMap[1][3]; \ + float m20 = xfm.fMap[2][0]; \ + float m21 = xfm.fMap[2][1]; \ + float m22 = xfm.fMap[2][2]; \ + float m23 = xfm.fMap[2][3]; \ + float m_wgt = wgt; \ + float srcX, srcY, srcZ; +#define MATRIXMULTPOINTADD_FPU(dst, src) \ + srcX = src.fX; \ + srcY = src.fY; \ + srcZ = src.fZ; \ + \ + dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \ + dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \ + dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt; +#define MATRIXMULTVECTORADD_FPU(dst, src) \ + srcX = src.fX; \ + srcY = src.fY; \ + srcZ = src.fZ; \ + \ + dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \ + dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \ + dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt; + +// SSE3 version #ifdef HS_SSE3 -# define MATRIXMULTBEGIN(xfm, wgt) \ +#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \ __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \ ALIGN(16) float hack[4]; \ mc0 = _mm_loadu_ps(xfm.fMap[0]); \ mc1 = _mm_loadu_ps(xfm.fMap[1]); \ mc2 = _mm_loadu_ps(xfm.fMap[2]); \ mwt = _mm_set_ps1(wgt); -# define MATRIXMULTPOINTADD(dst, src) \ +#define MATRIXMULTPOINTADD_SSE3(dst, src) \ msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \ _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ @@ -10548,7 +10607,7 @@ void plDXPipeline::LoadResources() dst.fX += hack[0]; \ dst.fY += hack[1]; \ dst.fZ += hack[2]; -# define MATRIXMULTVECTORADD(dst, src) \ +#define MATRIXMULTVECTORADD_SSE3(dst, src) \ msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \ _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ @@ -10561,250 +10620,214 @@ void plDXPipeline::LoadResources() dst.fX += hack[0]; \ dst.fY += hack[1]; \ dst.fZ += hack[2]; -#else -# define MATRIXMULTBEGIN(xfm, wgt) \ - float m00 = xfm.fMap[0][0]; \ - float m01 = xfm.fMap[0][1]; \ - float m02 = xfm.fMap[0][2]; \ - float m03 = xfm.fMap[0][3]; \ - float m10 = xfm.fMap[1][0]; \ - float m11 = xfm.fMap[1][1]; \ - float m12 = xfm.fMap[1][2]; \ - float m13 = xfm.fMap[1][3]; \ - float m20 = xfm.fMap[2][0]; \ - float m21 = xfm.fMap[2][1]; \ - float m22 = xfm.fMap[2][2]; \ - float m23 = xfm.fMap[2][3]; \ - float m_wgt = wgt; \ - float srcX, srcY, srcZ; -# define MATRIXMULTPOINTADD(dst, src) \ - srcX = src.fX; \ - srcY = src.fY; \ - srcZ = src.fZ; \ - \ - dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \ - dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \ - dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt; -# define MATRIXMULTVECTORADD(dst, src) \ - srcX = src.fX; \ - srcY = src.fY; \ - srcZ = src.fZ; \ - \ - dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \ - dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \ - dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt; -#endif // HAVE_SSE - -// inlTESTPOINT ///////////////////////////////////////// -// Update mins and maxs if destP is outside. -inline void inlTESTPOINT(const hsPoint3& destP, - float& minX, float& minY, float& minZ, - float& maxX, float& maxY, float& maxZ) -{ - if( destP.fX < minX ) - minX = destP.fX; - else if( destP.fX > maxX ) - maxX = destP.fX; - - if( destP.fY < minY ) - minY = destP.fY; - else if( destP.fY > maxY ) - maxY = destP.fY; - - if( destP.fZ < minZ ) - minZ = destP.fZ; - else if( destP.fZ > maxZ ) - maxZ = destP.fZ; -} - -//// IBlendVertsIntoBuffer //////////////////////////////////////////////////// -// Given a pointer into a buffer of verts that have blending data in the D3D -// format, blends them into the destination buffer given without the blending -// info. - -void plDXPipeline::IBlendVertsIntoBuffer( plSpan* span, - hsMatrix44* matrixPalette, int numMatrices, - const uint8_t *src, uint8_t format, uint32_t srcStride, - uint8_t *dest, uint32_t destStride, uint32_t count, - uint16_t localUVWChans ) -{ - uint8_t numUVs, numWeights; - uint32_t i, j, indices, color, specColor, uvChanSize; - float weights[ 4 ], weightSum; - hsPoint3 pt, tempPt, destPt; - hsVector3 vec, tempNorm, destNorm; - - - /// Get some counts - switch( format & plGBufferGroup::kSkinWeightMask ) - { - case plGBufferGroup::kSkin1Weight: numWeights = 1; break; - case plGBufferGroup::kSkin2Weights: numWeights = 2; break; - case plGBufferGroup::kSkin3Weights: numWeights = 3; break; - default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); - } - - numUVs = plGBufferGroup::CalcNumUVs( format ); - uvChanSize = numUVs * sizeof( float ) * 3; - -//#define MF_RECALC_BOUNDS -#ifdef MF_RECALC_BOUNDS - float minX = 1.e33f; - float minY = 1.e33f; - float minZ = 1.e33f; - - float maxX = -1.e33f; - float maxY = -1.e33f; - float maxZ = -1.e33f; -#endif // MF_RECALC_BOUNDS +#endif - // localUVWChans is bump mapping tangent space vectors, which need to +// CPU-optimized functions requiring dispatch +hsFunctionDispatcher plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3); + +// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication +#define BLENDVERTSTART \ + uint8_t numUVs, numWeights; \ + uint32_t i, j, indices, color, specColor, uvChanSize; \ + float weights[ 4 ], weightSum; \ + hsPoint3 pt, tempPt, destPt; \ + hsVector3 vec, tempNorm, destNorm; \ + \ + /* Get some counts */\ + switch( format & plGBufferGroup::kSkinWeightMask ) \ + { \ + case plGBufferGroup::kSkin1Weight: numWeights = 1; break; \ + case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \ + case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \ + default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \ + } \ + \ + numUVs = plGBufferGroup::CalcNumUVs( format ); \ + uvChanSize = numUVs * sizeof( float ) * 3; \ + \ + /* localUVWChans is bump mapping tangent space vectors, which need to // be skinned like the normal, as opposed to passed through like // garden variety UVW coordinates. // There are no localUVWChans that I know of in production assets (i.e. - // the avatar is not skinned). - if( !localUVWChans ) - { - /// Copy whilst blending - for( i = 0; i < count; i++ ) - { - // Extract data - src = inlExtractPoint( src, pt ); - for( j = 0, weightSum = 0; j < numWeights; j++ ) - { - src = inlExtractFloat( src, weights[ j ] ); - weightSum += weights[ j ]; - } - weights[ j ] = 1 - weightSum; - - if( format & plGBufferGroup::kSkinIndices ) - { - src = inlExtractUInt32( src, indices ); - } - else - { - indices = 1 << 8; - } - src = inlExtractPoint( src, vec ); - src = inlExtractUInt32( src, color ); - src = inlExtractUInt32( src, specColor ); - - // Blend - destPt.Set( 0, 0, 0 ); - destNorm.Set( 0, 0, 0 ); - for( j = 0; j < numWeights + 1; j++ ) - { - if( weights[ j ] ) + // the avatar is not skinned).*/\ + if( !localUVWChans ) \ + { \ + /* Copy whilst blending */\ + for( i = 0; i < count; i++ ) \ + { \ + /* Extract data */\ + src = inlExtractPoint( src, pt ); \ + for( j = 0, weightSum = 0; j < numWeights; j++ ) \ + { \ + src = inlExtractFloat( src, weights[ j ] ); \ + weightSum += weights[ j ]; \ + } \ + weights[ j ] = 1 - weightSum; \ + \ + if( format & plGBufferGroup::kSkinIndices ) \ + { \ + src = inlExtractUInt32( src, indices ); \ + } \ + else \ + { \ + indices = 1 << 8; \ + } \ + src = inlExtractPoint( src, vec ); \ + src = inlExtractUInt32( src, color ); \ + src = inlExtractUInt32( src, specColor ); \ + \ + /* Blend */\ + destPt.Set( 0, 0, 0 ); \ + destNorm.Set( 0, 0, 0 ); \ + for( j = 0; j < numWeights + 1; j++ ) \ + { \ + if( weights[ j ] ) \ { + /* MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); MATRIXMULTPOINTADD(destPt, pt); MATRIXMULTVECTORADD(destNorm, vec); - } - - indices >>= 8; - } - // Probably don't really need to renormalize this. There errors are - // going to be subtle and "smooth". -// hsFastMath::NormalizeAppr(destNorm); - -#ifdef MF_RECALC_BOUNDS - inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ); -#endif // MF_RECALC_BOUNDS - - // Slam data into position now - dest = inlStuffPoint( dest, destPt ); - dest = inlStuffPoint( dest, destNorm ); - dest = inlStuffUInt32( dest, color ); - dest = inlStuffUInt32( dest, specColor ); - memcpy( dest, src, uvChanSize ); - src += uvChanSize; - dest += uvChanSize; - } - } - else - { - uint8_t hiChan = localUVWChans >> 8; - uint8_t loChan = localUVWChans & 0xff; - /// Copy whilst blending - for( i = 0; i < count; i++ ) - { - hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; - hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; - - // Extract data - src = inlExtractPoint( src, pt ); - for( j = 0, weightSum = 0; j < numWeights; j++ ) - { - src = inlExtractFloat( src, weights[ j ] ); - weightSum += weights[ j ]; - } - weights[ j ] = 1 - weightSum; - - if( format & plGBufferGroup::kSkinIndices ) - { - src = inlExtractUInt32( src, indices ); - } - else - { - indices = 1 << 8; - } - - src = inlExtractPoint( src, vec ); - src = inlExtractUInt32( src, color ); - src = inlExtractUInt32( src, specColor ); - - uint8_t k; - for( k = 0; k < numUVs; k++ ) - { - src = inlExtractPoint( src, srcUVWs[k] ); - } - memcpy( dstUVWs, srcUVWs, uvChanSize); - dstUVWs[loChan].Set(0,0,0); - dstUVWs[hiChan].Set(0,0,0); - - // Blend - destPt.Set( 0, 0, 0 ); - destNorm.Set( 0, 0, 0 ); - for( j = 0; j < numWeights + 1; j++ ) - { - if( weights[ j ] ) - { + */ +#define BLENDVERTMID \ + } \ + \ + indices >>= 8; \ + } \ + /* Probably don't really need to renormalize this. There errors are + // going to be subtle and "smooth".*/\ + /* hsFastMath::NormalizeAppr(destNorm);*/ \ + \ + /* Slam data into position now */\ + dest = inlStuffPoint( dest, destPt ); \ + dest = inlStuffPoint( dest, destNorm ); \ + dest = inlStuffUInt32( dest, color ); \ + dest = inlStuffUInt32( dest, specColor ); \ + memcpy( dest, src, uvChanSize ); \ + src += uvChanSize; \ + dest += uvChanSize; \ + } \ + } \ + else \ + { \ + uint8_t hiChan = localUVWChans >> 8; \ + uint8_t loChan = localUVWChans & 0xff; \ + /* Copy whilst blending */\ + for( i = 0; i < count; i++ ) \ + { \ + hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \ + hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \ + \ + /* Extract data */\ + src = inlExtractPoint( src, pt ); \ + for( j = 0, weightSum = 0; j < numWeights; j++ ) \ + { \ + src = inlExtractFloat( src, weights[ j ] ); \ + weightSum += weights[ j ]; \ + } \ + weights[ j ] = 1 - weightSum; \ + \ + if( format & plGBufferGroup::kSkinIndices ) \ + { \ + src = inlExtractUInt32( src, indices ); \ + } \ + else \ + { \ + indices = 1 << 8; \ + } \ + \ + src = inlExtractPoint( src, vec ); \ + src = inlExtractUInt32( src, color ); \ + src = inlExtractUInt32( src, specColor ); \ + \ + uint8_t k; \ + for( k = 0; k < numUVs; k++ ) \ + { \ + src = inlExtractPoint( src, srcUVWs[k] ); \ + } \ + memcpy( dstUVWs, srcUVWs, uvChanSize); \ + dstUVWs[loChan].Set(0,0,0); \ + dstUVWs[hiChan].Set(0,0,0); \ + \ + /* Blend */\ + destPt.Set( 0, 0, 0 ); \ + destNorm.Set( 0, 0, 0 ); \ + for( j = 0; j < numWeights + 1; j++ ) \ + { \ + if( weights[ j ] ) \ + { \ + /* MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); MATRIXMULTPOINTADD(destPt, pt); MATRIXMULTVECTORADD(destNorm, vec); MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]); MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]); - } - - indices >>= 8; - } - // Probably don't really need to renormalize this. There errors are - // going to be subtle and "smooth". -// hsFastMath::NormalizeAppr(destNorm); -// hsFastMath::NormalizeAppr(dstUVWs[loChan]); -// hsFastMath::NormalizeAppr(dstUVWs[hiChan]); - -#ifdef MF_RECALC_BOUNDS - inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ); -#endif // MF_RECALC_BOUNDS - - // Slam data into position now - dest = inlStuffPoint( dest, destPt ); - dest = inlStuffPoint( dest, destNorm ); - dest = inlStuffUInt32( dest, color ); - dest = inlStuffUInt32( dest, specColor ); - memcpy( dest, dstUVWs, uvChanSize ); - dest += uvChanSize; - } - } -#ifdef MF_RECALC_BOUNDS - hsBounds3Ext wBnd; - wBnd.Reset(&hsPoint3(minX, minY, minZ)); - wBnd.Union(&hsPoint3(maxX, maxY, maxZ)); - span->fWorldBounds = wBnd; -#endif // MF_RECALC_BOUNDS + */ +#define BLENDVERTEND \ + } \ + \ + indices >>= 8; \ + } \ + /* Probably don't really need to renormalize this. There errors are + // going to be subtle and "smooth". */\ + /* hsFastMath::NormalizeAppr(destNorm); */\ + /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\ + /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\ + \ + /* Slam data into position now */\ + dest = inlStuffPoint( dest, destPt ); \ + dest = inlStuffPoint( dest, destNorm ); \ + dest = inlStuffUInt32( dest, color ); \ + dest = inlStuffUInt32( dest, specColor ); \ + memcpy( dest, dstUVWs, uvChanSize ); \ + dest += uvChanSize; \ + } \ + } + +void plDXPipeline::blend_vert_buffer_fpu( plSpan* span, + hsMatrix44* matrixPalette, int numMatrices, + const uint8_t *src, uint8_t format, uint32_t srcStride, + uint8_t *dest, uint32_t destStride, uint32_t count, + uint16_t localUVWChans ) +{ + BLENDVERTSTART + MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); + + MATRIXMULTPOINTADD_FPU(destPt, pt); + MATRIXMULTVECTORADD_FPU(destNorm, vec); + BLENDVERTMID + MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); + + MATRIXMULTPOINTADD_FPU(destPt, pt); + MATRIXMULTVECTORADD_FPU(destNorm, vec); + MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]); + MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]); + + BLENDVERTEND +} + +void plDXPipeline::blend_vert_buffer_sse3( plSpan* span, + hsMatrix44* matrixPalette, int numMatrices, + const uint8_t *src, uint8_t format, uint32_t srcStride, + uint8_t *dest, uint32_t destStride, uint32_t count, + uint16_t localUVWChans ) +{ +#ifdef HS_SSE3 + BLENDVERTSTART + MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); + + MATRIXMULTPOINTADD_SSE3(destPt, pt); + MATRIXMULTVECTORADD_SSE3(destNorm, vec); + BLENDVERTMID + MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); + + MATRIXMULTPOINTADD_SSE3(destPt, pt); + MATRIXMULTVECTORADD_SSE3(destNorm, vec); + MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]); + MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]); + BLENDVERTEND +#endif // HS_SSE3 } // ISetPipeConsts ////////////////////////////////////////////////////////////////// diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h index cdc4e2e4..1d83fce9 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h @@ -465,7 +465,8 @@ protected: void IBlendVertsIntoBuffer( plSpan* span, hsMatrix44* matrixPalette, int numMatrices, const uint8_t *src, uint8_t format, uint32_t srcStride, - uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans ); + uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans ) + { blend_vert_buffer.call(span, matrixPalette, numMatrices, src, format, srcStride, dest, destStride, count, localUVWChans); }; hsBool ISoftwareVertexBlend( plDrawableSpans* drawable, const hsTArray& visList ); @@ -734,7 +735,7 @@ public: virtual void GetDepth(float& hither, float& yon) const; virtual void SetDepth(float hither, float yon); - virtual float GetZBiasScale() const; + virtual float GetZBiasScale() const; virtual void SetZBiasScale(float scale); virtual const hsMatrix44& GetWorldToCamera() const; @@ -798,6 +799,13 @@ public: virtual int GetMaxAnisotropicSamples(); virtual int GetMaxAntiAlias(int Width, int Height, int ColorDepth); + + // CPU-optimized functions +protected: + typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); + static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); + static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); + static hsFunctionDispatcher blend_vert_buffer; };