Browse Source

SSE 4.1 Skinning

This appears to offer no benefit on my Wolfdale CPU, but I expect it will
be useful on more recent ones.
Adam Johnson 12 years ago
parent
commit
ea31db3305
  1. 35
      Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

35
Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

@ -10699,6 +10699,38 @@ static inline void ISkinVertexSSE3(const hsMatrix44& xfm, float wgt,
#endif // HS_SSE3 #endif // HS_SSE3
} }
#ifdef HS_SSE41
// Accumulates a weighted 3-component transform of one 4-float vector.
//
// For the four floats at src, computes the 4-wide dot product against mc0,
// mc1 and mc2, places those three sums in lanes 0, 1 and 2 (lane 3 of the
// intermediate is zero), multiplies by mwt, and adds the result onto the four
// floats already stored at dst.
//
//   src            four floats to transform; read with _mm_load_ps
//   dst            four floats updated in place; accessed with
//                  _mm_load_ps / _mm_store_ps
//   mc0, mc1, mc2  the three 4-float vectors dotted against src
//   mwt            per-lane multiplier applied before accumulation
//
// Both src and dst go through the aligned load/store intrinsics, so callers
// must hand in pointers that satisfy their alignment requirement.
static inline void ISkinDpSSE41(const float* src, float* dst, const __m128& mc0,
                                const __m128& mc1, const __m128& mc2, const __m128& mwt)
{
    // _mm_dp_ps immediates: the high nibble (0xF) includes all four lanes in
    // the sum; the low nibble picks the one output lane that receives it.
    enum { kSumToLane0 = 0xF1, kSumToLane1 = 0xF2, kSumToLane2 = 0xF4 };

    const __m128 source = _mm_load_ps(src);

    // Each dot product lands in its own lane with zeros everywhere else, so a
    // bitwise OR is enough to merge the three results into one register.
    const __m128 dot0 = _mm_dp_ps(source, mc0, kSumToLane0);
    const __m128 dot1 = _mm_dp_ps(source, mc1, kSumToLane1);
    const __m128 dot2 = _mm_dp_ps(source, mc2, kSumToLane2);
    const __m128 transformed = _mm_or_ps(_mm_or_ps(dot0, dot1), dot2);

    // dst += transformed * mwt
    const __m128 previous = _mm_load_ps(dst);
    _mm_store_ps(dst, _mm_add_ps(previous, _mm_mul_ps(transformed, mwt)));
}
#endif // HS_SSE41
// SSE 4.1 variant of the per-vertex skinning step.
//
// Loads xfm.fMap[0], xfm.fMap[1] and xfm.fMap[2] plus a broadcast of wgt once,
// then runs ISkinDpSSE41 twice with those same values: first on
// pt_src -> pt_dst, then on vec_src -> vec_dst. Each destination is
// accumulated into, not overwritten.
//
//   xfm      matrix whose fMap[0..2] are read with _mm_load_ps
//   wgt      scalar weight, replicated across all four lanes
//   pt_src   / pt_dst   source and accumulator for the first call
//   vec_src  / vec_dst  source and accumulator for the second call
//
// When HS_SSE41 is not defined the body is compiled out and this function
// does nothing.
static inline void ISkinVertexSSE41(const hsMatrix44& xfm, float wgt,
                                    const float* pt_src, float* pt_dst,
                                    const float* vec_src, float* vec_dst)
{
#ifdef HS_SSE41
    const __m128 weight = _mm_set_ps1(wgt);

    const __m128 map0 = _mm_load_ps(xfm.fMap[0]);
    const __m128 map1 = _mm_load_ps(xfm.fMap[1]);
    const __m128 map2 = _mm_load_ps(xfm.fMap[2]);

    ISkinDpSSE41(pt_src, pt_dst, map0, map1, map2, weight);
    ISkinDpSSE41(vec_src, vec_dst, map0, map1, map2, weight);
#endif // HS_SSE41
}
typedef void(*skin_vert_ptr)(const hsMatrix44&, float, const float*, float*, const float*, float*); typedef void(*skin_vert_ptr)(const hsMatrix44&, float, const float*, float*, const float*, float*);
template<skin_vert_ptr T> template<skin_vert_ptr T>
@ -10763,7 +10795,8 @@ static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMat
// CPU-optimized functions requiring dispatch // CPU-optimized functions requiring dispatch
hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer( hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(
IBlendVertBuffer<ISkinVertexFPU>, 0, 0, IBlendVertBuffer<ISkinVertexSSE3>); IBlendVertBuffer<ISkinVertexFPU>, 0, 0, IBlendVertBuffer<ISkinVertexSSE3>, 0,
IBlendVertBuffer<ISkinVertexSSE41>);
// ISetPipeConsts ////////////////////////////////////////////////////////////////// // ISetPipeConsts //////////////////////////////////////////////////////////////////
// A shader can request that the pipeline fill in certain constants that are indeterminate // A shader can request that the pipeline fill in certain constants that are indeterminate

Loading…
Cancel
Save