diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp index 412da2c1..d4ad28d4 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp @@ -203,10 +203,10 @@ void plReleaseObject(IUnknown* x) //// Local Static Stuff /////////////////////////////////////////////////////// /// Macros for getting/setting data in a D3D vertex buffer -inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple& point ) +inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple* point ) { register float* dst = (float*)ptr; - register const float* src = (float*)&point.fX; + register const float* src = (float*)&point->fX; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; @@ -10617,8 +10617,7 @@ inline void inlTESTPOINT(const hsPoint3& destP, // SSE3 version #ifdef HS_SSE3 #define MATRIXMULTBEGIN_SSE3(xfm, wgt) \ - __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \ - ALIGN(16) float hack[4]; \ + __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \ mc0 = _mm_load_ps(xfm.fMap[0]); \ mc1 = _mm_load_ps(xfm.fMap[1]); \ mc2 = _mm_load_ps(xfm.fMap[2]); \ @@ -10632,10 +10631,9 @@ inline void inlTESTPOINT(const hsPoint3& destP, hbuf1 = _mm_hadd_ps(_x, _y); \ hbuf2 = _mm_hadd_ps(_z, _z); \ hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ - _mm_store_ps(hack, hbuf1); \ - dst.fX += hack[0]; \ - dst.fY += hack[1]; \ - dst.fZ += hack[2]; + _dst = _mm_load_ps(dst); \ + _dst = _mm_add_ps(_dst, hbuf1); \ + _mm_store_ps(dst, _dst); #define MATRIXMULTVECTORADD_SSE3(dst, src) \ msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \ _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ @@ -10645,10 +10643,13 @@ inline void inlTESTPOINT(const hsPoint3& destP, hbuf1 = _mm_hadd_ps(_x, _y); \ hbuf2 = _mm_hadd_ps(_z, _z); \ hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ - _mm_store_ps(hack, hbuf1); \ - dst.fX += hack[0]; \ - dst.fY += hack[1]; \ - dst.fZ += hack[2]; + { \ + ALIGN(16) float hack[4]; \ + _mm_store_ps(hack, hbuf1); \ + dst.fX += hack[0]; \ + dst.fY += hack[1]; \ + dst.fZ += hack[2]; \ + } #endif // CPU-optimized functions requiring dispatch @@ -10656,15 +10657,17 @@ hsFunctionDispatcher plDXPipeline::blend_ve // Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication #define BLENDVERTSTART \ - uint8_t numUVs, numWeights; \ - uint32_t i, j, indices, color, specColor, uvChanSize; \ - float weights[ 4 ], weightSum; \ ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \ ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \ + ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \ hsPoint3* pt = reinterpret_cast(pt_buf); \ + hsPoint3* destPt = reinterpret_cast(destPt_buf); \ hsVector3* vec = reinterpret_cast(vec_buf); \ - hsPoint3 destPt; \ - hsVector3 destNorm; \ + hsVector3* destNorm = reinterpret_cast(destNorm_buf); \ + \ + uint8_t numUVs, numWeights; \ + uint32_t i, j, indices, color, specColor, uvChanSize; \ + float weights[ 4 ], weightSum; \ \ /* Get some counts */\ switch( format & plGBufferGroup::kSkinWeightMask ) \ @@ -10710,8 +10713,9 @@ hsFunctionDispatcher plDXPipeline::blend_ve src = inlExtractUInt32( src, specColor ); \ \ /* Blend */\ - destPt.Set( 0, 0, 0 ); \ - destNorm.Set( 0, 0, 0 ); \ + destPt->Set(0.f, 0.f, 0.f); \ + destPt_buf[3] = 1.f; \ + destNorm->Set(0.f, 0.f, 0.f); \ for( j = 0; j < numWeights + 1; j++ ) \ { \ if( weights[ j ] ) \ @@ -10783,8 +10787,9 @@ hsFunctionDispatcher plDXPipeline::blend_ve dstUVWs[hiChan].Set(0,0,0); \ \ /* Blend */\ - destPt.Set( 0, 0, 0 ); \ - destNorm.Set( 0, 0, 0 ); \ + destPt->Set(0.f, 0.f, 0.f); \ + destPt_buf[3] = 1.f; \ + destNorm->Set(0.f, 0.f, 0.f); \ for( j = 0; j < numWeights + 1; j++ ) \ { \ if( weights[ j ] ) \ @@ -10827,13 +10832,13 @@ void plDXPipeline::blend_vert_buffer_fpu( plSpan* span, BLENDVERTSTART MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); - MATRIXMULTPOINTADD_FPU(destPt, (*pt)); - MATRIXMULTVECTORADD_FPU(destNorm, (*vec)); + MATRIXMULTPOINTADD_FPU((*destPt), (*pt)); + MATRIXMULTVECTORADD_FPU((*destNorm), (*vec)); BLENDVERTMID MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); - MATRIXMULTPOINTADD_FPU(destPt, (*pt)); - MATRIXMULTVECTORADD_FPU(destNorm, (*vec)); + MATRIXMULTPOINTADD_FPU((*destPt), (*pt)); + MATRIXMULTVECTORADD_FPU((*destNorm), (*vec)); MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]); MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]); @@ -10850,13 +10855,13 @@ void plDXPipeline::blend_vert_buffer_sse3( plSpan* span, BLENDVERTSTART MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); - MATRIXMULTBUFADD_SSE3(destPt, pt_buf); - MATRIXMULTBUFADD_SSE3(destNorm, vec_buf); + MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf); + MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf); BLENDVERTMID MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); - MATRIXMULTBUFADD_SSE3(destPt, pt_buf); - MATRIXMULTBUFADD_SSE3(destNorm, vec_buf); + MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf); + MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf); MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]); MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]); BLENDVERTEND