diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp index d54fe5c9..d3e878ea 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp @@ -10699,6 +10699,38 @@ static inline void ISkinVertexSSE3(const hsMatrix44& xfm, float wgt, #endif // HS_SSE3 } +#ifdef HS_SSE41 /* ISkinDpSSE41: accumulates into dst[0..2] the dot products of the four floats at src with mc0, mc1 and mc2, each scaled by mwt; lane 3 of the product is zero, so dst[3] only has zero added to it. */ +static inline void ISkinDpSSE41(const float* src, float* dst, const __m128& mc0, + const __m128& mc1, const __m128& mc2, const __m128& mwt) +{ + enum { DP_F4_X = 0xF1, DP_F4_Y = 0xF2, DP_F4_Z = 0xF4 }; /* _mm_dp_ps immediates: high nibble 0xF multiplies all four lanes; low nibble picks the single output lane (0, 1 or 2) and leaves the other output lanes zero. */ + + __m128 msr = _mm_load_ps(src); /* aligned load: src must be 16-byte aligned */ + __m128 _r = _mm_dp_ps(msr, mc0, DP_F4_X); + _r = _mm_or_ps(_r, _mm_dp_ps(msr, mc1, DP_F4_Y)); + _r = _mm_or_ps(_r, _mm_dp_ps(msr, mc2, DP_F4_Z)); /* bitwise OR is enough to merge: each dot product sits in its own lane with zeros elsewhere */ + + __m128 _dst = _mm_load_ps(dst); /* aligned load: dst must be 16-byte aligned as well */ + _dst = _mm_add_ps(_dst, _mm_mul_ps(_r, mwt)); /* read-modify-write: the weighted result is summed onto what dst already holds */ + _mm_store_ps(dst, _dst); +} +#endif // HS_SSE41 + +static inline void ISkinVertexSSE41(const hsMatrix44& xfm, float wgt, + const float* pt_src, float* pt_dst, + const float* vec_src, float* vec_dst) +{ /* The body is compiled only when HS_SSE41 is defined; without it this function does nothing. */ +#ifdef HS_SSE41 + __m128 mc0 = _mm_load_ps(xfm.fMap[0]); + __m128 mc1 = _mm_load_ps(xfm.fMap[1]); + __m128 mc2 = _mm_load_ps(xfm.fMap[2]); /* only fMap[0..2] are read; fMap[3] is not used here */ + __m128 mwt = _mm_set_ps1(wgt); /* wgt replicated into all four lanes */ + + ISkinDpSSE41(pt_src, pt_dst, mc0, mc1, mc2, mwt); + ISkinDpSSE41(vec_src, vec_dst, mc0, mc1, mc2, mwt); /* both streams use the same full four-lane dot product; NOTE(review): presumably the 4th float of each source decides whether the 4th element of each fMap row contributes - confirm with the caller */ +#endif // HS_SSE41 +} + typedef void(*skin_vert_ptr)(const hsMatrix44&, float, const float*, float*, const float*, float*); template @@ -10763,7 +10795,8 @@ static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMat // CPU-optimized functions requiring dispatch hsFunctionDispatcher plDXPipeline::blend_vert_buffer( - IBlendVertBuffer, 0, 0, IBlendVertBuffer); + IBlendVertBuffer, 0, 0, IBlendVertBuffer, 0, + IBlendVertBuffer); /* NOTE(review): the template arguments of IBlendVertBuffer and hsFunctionDispatcher are not visible in this extract; the meaning of each positional slot, including the two added arguments, should be checked against hsFunctionDispatcher. */ // ISetPipeConsts ////////////////////////////////////////////////////////////////// // A shader can request that the pipeline fill in certain constants that are indeterminate