diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
index d4ad28d4..d3e878ea 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@@ -203,40 +203,60 @@ void plReleaseObject(IUnknown* x)
 //// Local Static Stuff ///////////////////////////////////////////////////////
 
 /// Macros for getting/setting data in a D3D vertex buffer
-inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple* point )
+template<typename T>
+static inline void inlCopy(uint8_t*& src, uint8_t*& dst)
 {
-    register float* dst = (float*)ptr;
-    register const float* src = (float*)&point->fX;
-    *dst++ = *src++;
-    *dst++ = *src++;
-    *dst++ = *src++;
-    return (uint8_t*)dst;
+    T* src_ptr = reinterpret_cast<T*>(src);
+    T* dst_ptr = reinterpret_cast<T*>(dst);
+    *dst_ptr = *src_ptr;
+    src += sizeof(T);
+    dst += sizeof(T);
 }
-inline uint8_t* inlStuffUInt32( uint8_t* ptr, const uint32_t uint )
+
+template<typename T>
+static inline const uint8_t* inlExtract(const uint8_t* src, T* val)
+{
+    const T* ptr = reinterpret_cast<const T*>(src);
+    *val = *ptr++;
+    return reinterpret_cast<const uint8_t*>(ptr);
+}
+
+template<>
+static inline const uint8_t* inlExtract(const uint8_t* src, hsPoint3* val)
 {
-    *(uint32_t*)ptr = uint;
-    return ptr + sizeof(uint);
+    const float* src_ptr = reinterpret_cast<const float*>(src);
+    float* dst_ptr = reinterpret_cast<float*>(val);
+    *dst_ptr++ = *src_ptr++;
+    *dst_ptr++ = *src_ptr++;
+    *dst_ptr++ = *src_ptr++;
+    *dst_ptr = 1.f;
+    return reinterpret_cast<const uint8_t*>(src_ptr);
 }
-inline uint8_t* inlExtractPoint( const uint8_t* ptr, hsScalarTriple* pt )
+
+template<>
+static inline const uint8_t* inlExtract(const uint8_t* src, hsVector3* val)
 {
-    register const float* src = (float*)ptr;
-    register float* dst = (float*)&pt->fX;
-    *dst++ = *src++;
-    *dst++ = *src++;
-    *dst++ = *src++;
-    return (uint8_t*)src;
+    const float* src_ptr = reinterpret_cast<const float*>(src);
+    float* dst_ptr = reinterpret_cast<float*>(val);
+    *dst_ptr++ = *src_ptr++;
+    *dst_ptr++ = *src_ptr++;
+    *dst_ptr++ = *src_ptr++;
+    *dst_ptr = 0.f;
+    return reinterpret_cast<const uint8_t*>(src_ptr);
 }
-inline uint8_t* inlExtractFloat( const uint8_t*& ptr, float& f )
+
+template<typename T, size_t N>
+static inline void inlSkip(uint8_t*& src)
 {
-    register const float* src = (float*)ptr;
-    f = *src++;
-    return (uint8_t*)src;
+    src += sizeof(T) * N;
 }
-inline uint8_t* inlExtractUInt32( const uint8_t*& ptr, uint32_t& uint )
+
+template<typename T>
+static inline uint8_t* inlStuff(uint8_t* dst, const T* val)
 {
-    const uint32_t* src = (uint32_t*)ptr;
-    uint = *src++;
-    return (uint8_t*)src;
+    T* ptr = reinterpret_cast<T*>(dst);
+    *ptr++ = *val;
+    return reinterpret_cast<uint8_t*>(ptr);
 }
 
 inline DWORD F2DW( FLOAT f )
@@ -9960,6 +9980,30 @@ void plDXPipeline::IFillStaticVertexBufferRef(plDXVertexBufferRef *ref, plGBuffe
     ref->SetDirty(false);
 }
 
+void plDXPipeline::IFillVolatileVertexBufferRef(plDXVertexBufferRef* ref, plGBufferGroup* group, uint32_t idx)
+{
+    uint8_t* dst = ref->fData;
+    uint8_t* src = group->GetVertBufferData(idx);
+
+    size_t uvChanSize = plGBufferGroup::CalcNumUVs(group->GetVertexFormat()) * sizeof(float) * 3;
+    uint8_t numWeights = (group->GetVertexFormat() & plGBufferGroup::kSkinWeightMask) >> 4;
+
+    for (uint32_t i = 0; i < ref->fCount; ++i) {
+        inlCopy<hsPoint3>(src, dst); // pre-pos
+        src += numWeights * sizeof(float); // weights
+        if (group->GetVertexFormat() & plGBufferGroup::kSkinIndices)
+            inlSkip<uint32_t, 1>(src); // indices
+        inlCopy<hsVector3>(src, dst); // pre-normal
+        inlCopy<uint32_t>(src, dst); // diffuse
+        inlCopy<uint32_t>(src, dst); // specular
+
+        // UVWs
+        memcpy(dst, src, uvChanSize);
+        src += uvChanSize;
+        dst += uvChanSize;
+    }
+}
+
 // OpenAccess ////////////////////////////////////////////////////////////////////////////////////////
 // Lock the managed buffer and setup the accessSpan to point into the buffers data.
 bool plDXPipeline::OpenAccess(plAccessSpan& dst, plDrawableSpans* drawable, const plVertexSpan* span, bool readOnly)
@@ -10114,6 +10158,7 @@ void plDXPipeline::CheckVertexBufferRef(plGBufferGroup* owner, uint32_t idx)
         if( !vRef->fData && (vRef->fFormat != owner->GetVertexFormat()) )
         {
             vRef->fData = new uint8_t[vRef->fCount * vRef->fVertexSize];
+            IFillVolatileVertexBufferRef(vRef, owner, idx);
         }
     }
 }
@@ -10581,293 +10626,178 @@ inline void inlTESTPOINT(const hsPoint3& destP,
 // format, blends them into the destination buffer given without the blending
 // info.
 
-// FPU version
-#define MATRIXMULTBEGIN_FPU(xfm, wgt) \
-    float m00 = xfm.fMap[0][0]; \
-    float m01 = xfm.fMap[0][1]; \
-    float m02 = xfm.fMap[0][2]; \
-    float m03 = xfm.fMap[0][3]; \
-    float m10 = xfm.fMap[1][0]; \
-    float m11 = xfm.fMap[1][1]; \
-    float m12 = xfm.fMap[1][2]; \
-    float m13 = xfm.fMap[1][3]; \
-    float m20 = xfm.fMap[2][0]; \
-    float m21 = xfm.fMap[2][1]; \
-    float m22 = xfm.fMap[2][2]; \
-    float m23 = xfm.fMap[2][3]; \
-    float m_wgt = wgt; \
-    float srcX, srcY, srcZ;
-#define MATRIXMULTPOINTADD_FPU(dst, src) \
-    srcX = src.fX; \
-    srcY = src.fY; \
-    srcZ = src.fZ; \
-    \
-    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
-    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
-    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
-#define MATRIXMULTVECTORADD_FPU(dst, src) \
-    srcX = src.fX; \
-    srcY = src.fY; \
-    srcZ = src.fZ; \
-    \
-    dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
-    dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
-    dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
-
-// SSE3 version
+static inline void ISkinVertexFPU(const hsMatrix44& xfm, float wgt,
+                                  const float* pt_src, float* pt_dst,
+                                  const float* vec_src, float* vec_dst)
+{
+    const float& m00 = xfm.fMap[0][0];
+    const float& m01 = xfm.fMap[0][1];
+    const float& m02 = xfm.fMap[0][2];
+    const float& m03 = xfm.fMap[0][3];
+    const float& m10 = xfm.fMap[1][0];
+    const float& m11 = xfm.fMap[1][1];
+    const float& m12 = xfm.fMap[1][2];
+    const float& m13 = xfm.fMap[1][3];
+    const float& m20 = xfm.fMap[2][0];
+    const float& m21 = xfm.fMap[2][1];
+    const float& m22 = xfm.fMap[2][2];
+    const float& m23 = xfm.fMap[2][3];
+
+    // position
+    {
+        const float& srcX = pt_src[0];
+        const float& srcY = pt_src[1];
+        const float& srcZ = pt_src[2];
+
+        pt_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * wgt;
+        pt_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * wgt;
+        pt_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * wgt;
+    }
+
+    // normal
+    {
+        const float& srcX = vec_src[0];
+        const float& srcY = vec_src[1];
+        const float& srcZ = vec_src[2];
+
+        vec_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02) * wgt;
+        vec_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12) * wgt;
+        vec_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22) * wgt;
+    }
+}
+
 #ifdef HS_SSE3
-#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
-    __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \
-    mc0 = _mm_load_ps(xfm.fMap[0]); \
-    mc1 = _mm_load_ps(xfm.fMap[1]); \
-    mc2 = _mm_load_ps(xfm.fMap[2]); \
-    mwt = _mm_set_ps1(wgt);
-#define MATRIXMULTBUFADD_SSE3(dst, src) \
-    msr = _mm_load_ps(src); \
-    _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
-    _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
-    _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
-    \
-    hbuf1 = _mm_hadd_ps(_x, _y); \
-    hbuf2 = _mm_hadd_ps(_z, _z); \
-    hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-    _dst = _mm_load_ps(dst); \
-    _dst = _mm_add_ps(_dst, hbuf1); \
-    _mm_store_ps(dst, _dst);
-#define MATRIXMULTVECTORADD_SSE3(dst, src) \
-    msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
-    _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
-    _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
-    _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
-    \
-    hbuf1 = _mm_hadd_ps(_x, _y); \
-    hbuf2 = _mm_hadd_ps(_z, _z); \
-    hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-    { \
-        ALIGN(16) float hack[4]; \
-        _mm_store_ps(hack, hbuf1); \
-        dst.fX += hack[0]; \
-        dst.fY += hack[1]; \
-        dst.fZ += hack[2]; \
-    }
-#endif
+static inline void ISkinDpSSE3(const float* src, float* dst, const __m128& mc0,
+                               const __m128& mc1, const __m128& mc2, const __m128& mwt)
+{
+    __m128 msr = _mm_load_ps(src);
+    __m128 _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt);
+    __m128 _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt);
+    __m128 _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt);
 
-// CPU-optimized functions requiring dispatch
-hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3);
-
-// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
-#define BLENDVERTSTART \
-    ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \
-    ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \
-    ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \
-    hsPoint3* pt = reinterpret_cast<hsPoint3*>(pt_buf); \
-    hsPoint3* destPt = reinterpret_cast<hsPoint3*>(destPt_buf); \
-    hsVector3* vec = reinterpret_cast<hsVector3*>(vec_buf); \
-    hsVector3* destNorm = reinterpret_cast<hsVector3*>(destNorm_buf); \
-    \
-    uint8_t     numUVs, numWeights; \
-    uint32_t    i, j, indices, color, specColor, uvChanSize; \
-    float       weights[ 4 ], weightSum; \
-    \
-    /* Get some counts */\
-    switch( format & plGBufferGroup::kSkinWeightMask ) \
-    { \
-        case plGBufferGroup::kSkin1Weight:  numWeights = 1; break; \
-        case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \
-        case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \
-        default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \
-    } \
-    \
-    numUVs = plGBufferGroup::CalcNumUVs( format ); \
-    uvChanSize = numUVs * sizeof( float ) * 3; \
-    \
-    /* localUVWChans is bump mapping tangent space vectors, which need to
-    // be skinned like the normal, as opposed to passed through like
-    // garden variety UVW coordinates.
-    // There are no localUVWChans that I know of in production assets (i.e.
-    // the avatar is not skinned).*/\
-    if( !localUVWChans ) \
-    { \
-        /* Copy whilst blending */\
-        for( i = 0; i < count; i++ ) \
-        { \
-            /* Extract data */\
-            src = inlExtractPoint( src, pt ); \
-            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
-            { \
-                src = inlExtractFloat( src, weights[ j ] ); \
-                weightSum += weights[ j ]; \
-            } \
-            weights[ j ] = 1 - weightSum; \
-            \
-            if( format & plGBufferGroup::kSkinIndices ) \
-            { \
-                src = inlExtractUInt32( src, indices ); \
-            } \
-            else \
-            { \
-                indices = 1 << 8; \
-            } \
-            src = inlExtractPoint( src, vec ); \
-            src = inlExtractUInt32( src, color ); \
-            src = inlExtractUInt32( src, specColor ); \
-            \
-            /* Blend */\
-            destPt->Set(0.f, 0.f, 0.f); \
-            destPt_buf[3] = 1.f; \
-            destNorm->Set(0.f, 0.f, 0.f); \
-            for( j = 0; j < numWeights + 1; j++ ) \
-            { \
-                if( weights[ j ] ) \
-                {
-                    /*
-                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
-
-                    MATRIXMULTPOINTADD(destPt, pt);
-                    MATRIXMULTVECTORADD(destNorm, vec);
-                    */
-#define BLENDVERTMID \
-                } \
-                \
-                indices >>= 8; \
-            } \
-            /* Probably don't really need to renormalize this. There errors are
-            // going to be subtle and "smooth".*/\
-            /* hsFastMath::NormalizeAppr(destNorm);*/ \
-            \
-            /* Slam data into position now */\
-            dest = inlStuffPoint( dest, destPt ); \
-            dest = inlStuffPoint( dest, destNorm ); \
-            dest = inlStuffUInt32( dest, color ); \
-            dest = inlStuffUInt32( dest, specColor ); \
-            memcpy( dest, src, uvChanSize ); \
-            src += uvChanSize; \
-            dest += uvChanSize; \
-        } \
-    } \
-    else \
-    { \
-        uint8_t hiChan = localUVWChans >> 8; \
-        uint8_t loChan = localUVWChans & 0xff; \
-        /* Copy whilst blending */\
-        for( i = 0; i < count; i++ ) \
-        { \
-            hsVector3   srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
-            hsVector3   dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \
-            \
-            /* Extract data */\
-            src = inlExtractPoint( src, pt ); \
-            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
-            { \
-                src = inlExtractFloat( src, weights[ j ] ); \
-                weightSum += weights[ j ]; \
-            } \
-            weights[ j ] = 1 - weightSum; \
-            \
-            if( format & plGBufferGroup::kSkinIndices ) \
-            { \
-                src = inlExtractUInt32( src, indices ); \
-            } \
-            else \
-            { \
-                indices = 1 << 8; \
-            } \
-            \
-            src = inlExtractPoint( src, vec ); \
-            src = inlExtractUInt32( src, color ); \
-            src = inlExtractUInt32( src, specColor ); \
-            \
-            uint8_t k; \
-            for( k = 0; k < numUVs; k++ ) \
-            { \
-                src = inlExtractPoint( src, &srcUVWs[k] ); \
-            } \
-            memcpy( dstUVWs, srcUVWs, uvChanSize); \
-            dstUVWs[loChan].Set(0,0,0); \
-            dstUVWs[hiChan].Set(0,0,0); \
-            \
-            /* Blend */\
-            destPt->Set(0.f, 0.f, 0.f); \
-            destPt_buf[3] = 1.f; \
-            destNorm->Set(0.f, 0.f, 0.f); \
-            for( j = 0; j < numWeights + 1; j++ ) \
-            { \
-                if( weights[ j ] ) \
-                {
-                    /*
-                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
-
-                    MATRIXMULTPOINTADD(destPt, pt);
-                    MATRIXMULTVECTORADD(destNorm, vec);
-                    MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]);
-                    MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]);
-                    */
-#define BLENDVERTEND \
-                } \
-                \
-                indices >>= 8; \
-            } \
-            /* Probably don't really need to renormalize this. There errors are
-            // going to be subtle and "smooth". */\
-            /* hsFastMath::NormalizeAppr(destNorm); */\
-            /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\
-            /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\
-            \
-            /* Slam data into position now */\
-            dest = inlStuffPoint( dest, destPt ); \
-            dest = inlStuffPoint( dest, destNorm ); \
-            dest = inlStuffUInt32( dest, color ); \
-            dest = inlStuffUInt32( dest, specColor ); \
-            memcpy( dest, dstUVWs, uvChanSize ); \
-            dest += uvChanSize; \
-        } \
-    }
-
-void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
-                        hsMatrix44* matrixPalette, int numMatrices,
-                        const uint8_t *src, uint8_t format, uint32_t srcStride,
-                        uint8_t *dest, uint32_t destStride, uint32_t count,
-                        uint16_t localUVWChans )
-{
-    BLENDVERTSTART
-    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
-
-    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
-    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
-    BLENDVERTMID
-    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
-
-    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
-    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
-    MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
-    MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
-
-    BLENDVERTEND
-}
-
-void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
-                        hsMatrix44* matrixPalette, int numMatrices,
-                        const uint8_t *src, uint8_t format, uint32_t srcStride,
-                        uint8_t *dest, uint32_t destStride, uint32_t count,
-                        uint16_t localUVWChans )
+    __m128 hbuf1 = _mm_hadd_ps(_x, _y);
+    __m128 hbuf2 = _mm_hadd_ps(_z, _z);
+    hbuf1 = _mm_hadd_ps(hbuf1, hbuf2);
+    __m128 _dst = _mm_load_ps(dst);
+    _dst = _mm_add_ps(_dst, hbuf1);
+    _mm_store_ps(dst, _dst);
+}
+#endif // HS_SSE3
+
+static inline void ISkinVertexSSE3(const hsMatrix44& xfm, float wgt,
+                                   const float* pt_src, float* pt_dst,
+                                   const float* vec_src, float* vec_dst)
 {
 #ifdef HS_SSE3
-    BLENDVERTSTART
-    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
-
-    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
-    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
-    BLENDVERTMID
-    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
-
-    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
-    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
-    MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
-    MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
-    BLENDVERTEND
+    __m128 mc0 = _mm_load_ps(xfm.fMap[0]);
+    __m128 mc1 = _mm_load_ps(xfm.fMap[1]);
+    __m128 mc2 = _mm_load_ps(xfm.fMap[2]);
+    __m128 mwt = _mm_set_ps1(wgt);
+
+    ISkinDpSSE3(pt_src, pt_dst, mc0, mc1, mc2, mwt);
+    ISkinDpSSE3(vec_src, vec_dst, mc0, mc1, mc2, mwt);
 #endif // HS_SSE3
 }
 
+#ifdef HS_SSE41
+static inline void ISkinDpSSE41(const float* src, float* dst, const __m128& mc0,
+                                const __m128& mc1, const __m128& mc2, const __m128& mwt)
+{
+    enum { DP_F4_X = 0xF1, DP_F4_Y = 0xF2, DP_F4_Z = 0xF4 };
+
+    __m128 msr = _mm_load_ps(src);
+    __m128 _r = _mm_dp_ps(msr, mc0, DP_F4_X);
+    _r = _mm_or_ps(_r, _mm_dp_ps(msr, mc1, DP_F4_Y));
+    _r = _mm_or_ps(_r, _mm_dp_ps(msr, mc2, DP_F4_Z));
+
+    __m128 _dst = _mm_load_ps(dst);
+    _dst = _mm_add_ps(_dst, _mm_mul_ps(_r, mwt));
+    _mm_store_ps(dst, _dst);
+}
+#endif // HS_SSE41
+
+static inline void ISkinVertexSSE41(const hsMatrix44& xfm, float wgt,
+                                    const float* pt_src, float* pt_dst,
+                                    const float* vec_src, float* vec_dst)
+{
+#ifdef HS_SSE41
+    __m128 mc0 = _mm_load_ps(xfm.fMap[0]);
+    __m128 mc1 = _mm_load_ps(xfm.fMap[1]);
+    __m128 mc2 = _mm_load_ps(xfm.fMap[2]);
+    __m128 mwt = _mm_set_ps1(wgt);
+
+    ISkinDpSSE41(pt_src, pt_dst, mc0, mc1, mc2, mwt);
+    ISkinDpSSE41(vec_src, vec_dst, mc0, mc1, mc2, mwt);
+#endif // HS_SSE41
+}
+
+typedef void(*skin_vert_ptr)(const hsMatrix44&, float, const float*, float*, const float*, float*);
+
+template<skin_vert_ptr T>
+static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMatrices,
+                             const uint8_t* src, uint8_t format, uint32_t srcStride,
+                             uint8_t* dest, uint32_t destStride, uint32_t count,
+                             uint16_t localUVWChans)
+{
+    ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f };
+    ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f };
+    hsPoint3* pt = reinterpret_cast<hsPoint3*>(pt_buf);
+    hsVector3* vec = reinterpret_cast<hsVector3*>(vec_buf);
+
+    uint32_t indices;
+    float weights[4];
+
+    // Dropped support for localUVWChans at templatization of code
+    hsAssert(localUVWChans == 0, "support for skinned UVWs dropped. reimplement me?");
+    const size_t uvChanSize = plGBufferGroup::CalcNumUVs(format) * sizeof(float) * 3;
+    uint8_t numWeights = (format & plGBufferGroup::kSkinWeightMask) >> 4;
+
+    for (uint32_t i = 0; i < count; ++i) {
+        // Extract data
+        src = inlExtract(src, pt);
+
+        float weightSum = 0.f;
+        for (uint8_t j = 0; j < numWeights; ++j) {
+            src = inlExtract(src, &weights[j]);
+            weightSum += weights[j];
+        }
+        weights[numWeights] = 1.f - weightSum;
+
+        if (format & plGBufferGroup::kSkinIndices)
+            src = inlExtract(src, &indices);
+        else
+            indices = 1 << 8;
+        src = inlExtract(src, vec);
+
+        // Destination buffers (float4 for SSE alignment)
+        ALIGN(16) float destNorm_buf[] = { 0.f, 0.f, 0.f, 0.f };
+        ALIGN(16) float destPt_buf[] = { 0.f, 0.f, 0.f, 1.f };
+
+        // Blend
+        for (uint32_t j = 0; j < numWeights + 1; ++j) {
+            if (weights[j])
+                T(matrixPalette[indices & 0xFF], weights[j], pt_buf, destPt_buf, vec_buf, destNorm_buf);
+            indices >>= 8;
+        }
+        // Probably don't really need to renormalize this. The errors are
+        // going to be subtle and "smooth".
+        /* hsFastMath::NormalizeAppr(destNorm); */
+
+        // Slam data into position now
+        dest = inlStuff(dest, reinterpret_cast<hsPoint3*>(destPt_buf));
+        dest = inlStuff(dest, reinterpret_cast<hsVector3*>(destNorm_buf));
+
+        // Jump past colors and UVWs
+        dest += sizeof(uint32_t) * 2 + uvChanSize;
+        src += sizeof(uint32_t) * 2 + uvChanSize;
+    }
+}
+
+// CPU-optimized functions requiring dispatch
+hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(
+    IBlendVertBuffer<ISkinVertexFPU>, 0, 0, IBlendVertBuffer<ISkinVertexSSE3>, 0,
+    IBlendVertBuffer<ISkinVertexSSE41>);
+
 // ISetPipeConsts //////////////////////////////////////////////////////////////////
 // A shader can request that the pipeline fill in certain constants that are indeterminate
 // until the pipeline is about to render the object the shader is applied to. For example,
diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
index b14cc03c..0edf37dc 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
@@ -354,6 +354,7 @@ protected:
     void    ICheckStaticVertexBuffer(plDXVertexBufferRef* vRef, plGBufferGroup* owner, uint32_t idx);
     void    ICheckIndexBuffer(plDXIndexBufferRef* iRef);
     void    IFillStaticVertexBufferRef(plDXVertexBufferRef *ref, plGBufferGroup *group, uint32_t idx);
+    void    IFillVolatileVertexBufferRef(plDXVertexBufferRef* ref, plGBufferGroup* group, uint32_t idx);
     void    IFillIndexBufferRef(plDXIndexBufferRef* iRef, plGBufferGroup* owner, uint32_t idx);
     void    ISetupVertexBufferRef(plGBufferGroup* owner, uint32_t idx, plDXVertexBufferRef* vRef);
     void    ISetupIndexBufferRef(plGBufferGroup* owner, uint32_t idx, plDXIndexBufferRef* iRef);
@@ -804,8 +805,6 @@ public:
     // CPU-optimized functions
 protected:
     typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
-    static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
-    static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
 
     static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
 };
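
Note (editor's illustration only, not part of the patch): the .cpp changes above hinge on one pattern — the per-vertex skinning kernel becomes a non-type template parameter of a single blend loop, and the runtime dispatcher is handed one instantiation per CPU flavor. The sketch below shows that pattern in a compilable, self-contained form. Every name in it (Matrix, SkinVertFn, SkinVertScalar, BlendVerts, gBlendVerts) is a stand-in invented for the example; the real code uses hsMatrix44, skin_vert_ptr, ISkinVertexFPU/SSE3/SSE41, IBlendVertBuffer, and hsFunctionDispatcher, and its kernels read 16-byte-aligned float4 buffers rather than raw float3 arrays.

    // Sketch of the kernel-as-template-parameter + one-time dispatch pattern.
    #include <cstdio>

    struct Matrix { float m[3][4]; };

    // Per-vertex skinning kernel signature (plays the role of skin_vert_ptr).
    typedef void (*SkinVertFn)(const Matrix&, float wgt, const float* ptIn, float* ptOut);

    void SkinVertScalar(const Matrix& xfm, float wgt, const float* p, float* o)
    {
        // o += (M * p) * wgt, row-major 3x4 affine transform
        for (int r = 0; r < 3; ++r)
            o[r] += (p[0] * xfm.m[r][0] + p[1] * xfm.m[r][1] + p[2] * xfm.m[r][2] + xfm.m[r][3]) * wgt;
    }

    // The blend loop is written once; the kernel is a non-type template parameter,
    // so each instantiation can inline its kernel instead of making an indirect
    // call per vertex.
    template <SkinVertFn T>
    void BlendVerts(const Matrix* palette, const float* src, float* dst, int count)
    {
        for (int i = 0; i < count; ++i) {
            const float* p = src + i * 3;
            float* o = dst + i * 3;
            o[0] = o[1] = o[2] = 0.f;
            T(palette[0], 1.f, p, o);   // one weight/matrix only, for brevity
        }
    }

    // Stand-in for the dispatcher: pick an instantiation once (the real code
    // chooses between FPU, SSE3, and SSE4.1 builds based on detected CPU
    // features) and call through the stored pointer afterwards.
    typedef void (*BlendFn)(const Matrix*, const float*, float*, int);
    BlendFn gBlendVerts = BlendVerts<SkinVertScalar>;

    int main()
    {
        Matrix ident = {{{1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}}};
        float in[3] = { 1.f, 2.f, 3.f };
        float out[3];
        gBlendVerts(&ident, in, out, 1);
        std::printf("%g %g %g\n", out[0], out[1], out[2]);   // prints: 1 2 3
        return 0;
    }

The payoff of the template parameter over a plain function-pointer argument is that CPU-feature selection is paid for once, when the dispatcher is initialized, while the per-vertex inner loop of each IBlendVertBuffer instantiation can inline its kernel.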