Browse Source

Cleanup macro-mayhem

This converts the VERTBLEND macros to some clever templates. This code
should be much more maintainable.
Adam Johnson 11 years ago
parent
commit
aa7df368f1
  1. 421
      Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
  2. 2
      Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h

421
Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

@ -10581,293 +10581,154 @@ inline void inlTESTPOINT(const hsPoint3& destP,
// format, blends them into the destination buffer given without the blending
// info.
// FPU version
// Sets up the scalar (FPU) blend for one matrix/weight pair.
// Copies rows 0-2 (all four columns) of `xfm.fMap` into locals m00..m23,
// copies `wgt` into m_wgt, and declares the srcX/srcY/srcZ temporaries that
// MATRIXMULTPOINTADD_FPU and MATRIXMULTVECTORADD_FPU assign to.
// Row 3 of the matrix is not read. Because this expands to declarations, it
// can be used at most once per scope. `xfm` is expanded unparenthesized, so
// pass an expression that is safe to follow with `.fMap`.
#define MATRIXMULTBEGIN_FPU(xfm, wgt) \
float m00 = xfm.fMap[0][0]; \
float m01 = xfm.fMap[0][1]; \
float m02 = xfm.fMap[0][2]; \
float m03 = xfm.fMap[0][3]; \
float m10 = xfm.fMap[1][0]; \
float m11 = xfm.fMap[1][1]; \
float m12 = xfm.fMap[1][2]; \
float m13 = xfm.fMap[1][3]; \
float m20 = xfm.fMap[2][0]; \
float m21 = xfm.fMap[2][1]; \
float m22 = xfm.fMap[2][2]; \
float m23 = xfm.fMap[2][3]; \
float m_wgt = wgt; \
float srcX, srcY, srcZ;
// Accumulates (+=) into `dst` the point `src` transformed by the matrix cached
// by MATRIXMULTBEGIN_FPU and scaled by m_wgt. The column-3 terms (m03, m13,
// m23) are added, so the translation part of the matrix is applied.
// Requires MATRIXMULTBEGIN_FPU earlier in the same scope. `dst` and `src` are
// expanded several times and unparenthesized: pass parenthesized,
// side-effect-free expressions that expose fX/fY/fZ members.
#define MATRIXMULTPOINTADD_FPU(dst, src) \
srcX = src.fX; \
srcY = src.fY; \
srcZ = src.fZ; \
\
dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
// Accumulates (+=) into `dst` the vector `src` transformed by the upper-left
// 3x3 of the matrix cached by MATRIXMULTBEGIN_FPU and scaled by m_wgt.
// Unlike MATRIXMULTPOINTADD_FPU, the column-3 terms (m03, m13, m23) are not
// added, so no translation is applied.
// Requires MATRIXMULTBEGIN_FPU earlier in the same scope. `dst` and `src` are
// expanded several times and unparenthesized: pass parenthesized,
// side-effect-free expressions that expose fX/fY/fZ members.
#define MATRIXMULTVECTORADD_FPU(dst, src) \
srcX = src.fX; \
srcY = src.fY; \
srcZ = src.fZ; \
\
dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
// SSE3 version
// Scalar (FPU) skinning kernel for a single matrix/weight influence.
//
// Transforms one position and one normal by rows 0-2 of `xfm`, scales the
// results by `wgt`, and ACCUMULATES (+=) them into the destination buffers so
// the caller can sum several influences into the same output.
//
//   xfm      transform to apply; only fMap[0..2][0..3] is read
//   wgt      blend weight applied to both results
//   pt_src   source position, at least 3 floats (x, y, z)
//   pt_dst   position accumulator, at least 3 floats; the translation column
//            (fMap[r][3]) is included
//   vec_src  source normal, at least 3 floats (x, y, z)
//   vec_dst  normal accumulator, at least 3 floats; the translation column is
//            NOT included
static inline void ISkinVertexFPU(const hsMatrix44& xfm, float wgt,
                                  const float* pt_src, float* pt_dst,
                                  const float* vec_src, float* vec_dst)
{
    const float& m00 = xfm.fMap[0][0];
    const float& m01 = xfm.fMap[0][1];
    const float& m02 = xfm.fMap[0][2];
    const float& m03 = xfm.fMap[0][3];
    const float& m10 = xfm.fMap[1][0];
    const float& m11 = xfm.fMap[1][1];
    const float& m12 = xfm.fMap[1][2];
    const float& m13 = xfm.fMap[1][3];
    const float& m20 = xfm.fMap[2][0];
    const float& m21 = xfm.fMap[2][1];
    const float& m22 = xfm.fMap[2][2];
    const float& m23 = xfm.fMap[2][3];

    // position: full affine transform (rotation/scale plus translation)
    {
        const float& srcX = pt_src[0];
        const float& srcY = pt_src[1];
        const float& srcZ = pt_src[2];
        pt_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * wgt;
        pt_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * wgt;
        pt_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * wgt;
    }

    // normal: upper-left 3x3 only, no translation
    {
        const float& srcX = vec_src[0];
        const float& srcY = vec_src[1];
        const float& srcZ = vec_src[2];
        vec_dst[0] += (srcX * m00 + srcY * m01 + srcZ * m02) * wgt;
        vec_dst[1] += (srcX * m10 + srcY * m11 + srcZ * m12) * wgt;
        // Row 2 feeds the Z component; it must not be added to Y a second time.
        vec_dst[2] += (srcX * m20 + srcY * m21 + srcZ * m22) * wgt;
    }
}
#ifdef HS_SSE3
// Sets up the SSE3 blend for one matrix/weight pair.
// Declares the __m128 temporaries used by the MATRIXMULT*_SSE3 macros, loads
// rows 0-2 of `xfm.fMap` into mc0..mc2, and broadcasts `wgt` into all four
// lanes of mwt. Row 3 of the matrix is not read.
// _mm_load_ps is the aligned load, so each fMap row must be 16-byte aligned.
// Because this expands to declarations, it can be used at most once per scope.
#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
__m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \
mc0 = _mm_load_ps(xfm.fMap[0]); \
mc1 = _mm_load_ps(xfm.fMap[1]); \
mc2 = _mm_load_ps(xfm.fMap[2]); \
mwt = _mm_set_ps1(wgt);
// Accumulates into the 4-float buffer `dst` the weighted product of the cached
// matrix rows (mc0..mc2) with the 4-float buffer `src`.
// Each row is multiplied lane-wise by `src` and by the weight; the three
// horizontal adds then reduce those products so that lanes 0, 1 and 2 of
// hbuf1 hold the row-0, row-1 and row-2 dot products. Lane 3 receives the
// row-2 dot product again (hbuf2 is built from _z twice), so dst[3] is also
// modified and should not be relied on afterwards.
// Whether translation is applied depends on src[3] (it multiplies column 3).
// `dst` and `src` go through aligned load/store and must be 16-byte aligned.
// Requires MATRIXMULTBEGIN_SSE3 earlier in the same scope.
#define MATRIXMULTBUFADD_SSE3(dst, src) \
msr = _mm_load_ps(src); \
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
_y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
\
hbuf1 = _mm_hadd_ps(_x, _y); \
hbuf2 = _mm_hadd_ps(_z, _z); \
hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
_dst = _mm_load_ps(dst); \
_dst = _mm_add_ps(_dst, hbuf1); \
_mm_store_ps(dst, _dst);
// Accumulates (+=) into `dst` (an object with fX/fY/fZ members) the weighted
// product of the cached matrix rows (mc0..mc2) with the vector `src`.
// The source is packed as (fX, fY, fZ, 0): the fourth lane is zero, so the
// matrix's column 3 contributes nothing and no translation is applied.
// The reduced result is spilled to a 16-byte aligned temporary and only its
// first three lanes are added to dst, so `dst` itself need not be aligned.
// Requires MATRIXMULTBEGIN_SSE3 earlier in the same scope. `dst` and `src`
// are expanded several times and unparenthesized.
#define MATRIXMULTVECTORADD_SSE3(dst, src) \
msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
_y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
\
hbuf1 = _mm_hadd_ps(_x, _y); \
hbuf2 = _mm_hadd_ps(_z, _z); \
hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
{ \
ALIGN(16) float hack[4]; \
_mm_store_ps(hack, hbuf1); \
dst.fX += hack[0]; \
dst.fY += hack[1]; \
dst.fZ += hack[2]; \
}
#endif
static inline void ISkinDpSSE3(const float* src, float* dst, const __m128& mc0,
const __m128& mc1, const __m128& mc2, const __m128& mwt)
{
__m128 msr = _mm_load_ps(src);
__m128 _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt);
__m128 _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt);
__m128 _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt);
// CPU-optimized functions requiring dispatch
hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3);
// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
// Opening fragment of the vertex-blend function body. It is intentionally
// unbalanced: it leaves the non-localUVWChans per-vertex loop, the per-weight
// loop and the `if( weights[ j ] )` block open. The matrix-multiply macros for
// position and normal go next, followed by BLENDVERTMID.
//
// Expects these names in the enclosing scope: format, count, src, dest,
// localUVWChans (and matrixPalette for the multiply macros that follow).
//
// What it does, in order:
//  - declares 16-byte aligned scratch buffers and hsPoint3/hsVector3 views of
//    them, plus the loop/format locals;
//  - derives numWeights from the skin-weight bits of `format`;
//  - computes numUVs and the byte size of the UVW channels;
//  - for each vertex: reads position, numWeights weights (the final weight is
//    1 - sum of the others), the optional packed matrix indices, normal and
//    the two colors, then zeroes the destination accumulators.
//
// NOTE(review): the `default` switch case only asserts and does not assign
// numWeights, so numWeights is uninitialized on that path when the assert
// does not stop execution.
#define BLENDVERTSTART \
ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \
ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \
ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \
hsPoint3* pt = reinterpret_cast<hsPoint3*>(pt_buf); \
hsPoint3* destPt = reinterpret_cast<hsPoint3*>(destPt_buf); \
hsVector3* vec = reinterpret_cast<hsVector3*>(vec_buf); \
hsVector3* destNorm = reinterpret_cast<hsVector3*>(destNorm_buf); \
\
uint8_t numUVs, numWeights; \
uint32_t i, j, indices, color, specColor, uvChanSize; \
float weights[ 4 ], weightSum; \
\
/* Get some counts */\
switch( format & plGBufferGroup::kSkinWeightMask ) \
{ \
case plGBufferGroup::kSkin1Weight: numWeights = 1; break; \
case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \
case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \
default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \
} \
\
numUVs = plGBufferGroup::CalcNumUVs( format ); \
uvChanSize = numUVs * sizeof( float ) * 3; \
\
/* localUVWChans is bump mapping tangent space vectors, which need to
// be skinned like the normal, as opposed to passed through like
// garden variety UVW coordinates.
// There are no localUVWChans that I know of in production assets (i.e.
// the avatar is not skinned).*/\
if( !localUVWChans ) \
{ \
/* Copy whilst blending */\
for( i = 0; i < count; i++ ) \
{ \
/* Extract data */\
src = inlExtractPoint( src, pt ); \
for( j = 0, weightSum = 0; j < numWeights; j++ ) \
{ \
src = inlExtractFloat( src, weights[ j ] ); \
weightSum += weights[ j ]; \
} \
weights[ j ] = 1 - weightSum; \
\
if( format & plGBufferGroup::kSkinIndices ) \
{ \
src = inlExtractUInt32( src, indices ); \
} \
else \
{ \
indices = 1 << 8; \
} \
src = inlExtractPoint( src, vec ); \
src = inlExtractUInt32( src, color ); \
src = inlExtractUInt32( src, specColor ); \
\
/* Blend */\
destPt->Set(0.f, 0.f, 0.f); \
destPt_buf[3] = 1.f; \
destNorm->Set(0.f, 0.f, 0.f); \
for( j = 0; j < numWeights + 1; j++ ) \
{ \
if( weights[ j ] ) \
{
/*
MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD(destPt, pt);
MATRIXMULTVECTORADD(destNorm, vec);
*/
// Middle fragment of the vertex-blend function body; must follow
// BLENDVERTSTART plus the position/normal multiply macros.
//
// First half: closes the `if( weights[ j ] )` block and the per-weight loop
// opened by BLENDVERTSTART (shifting `indices` down 8 bits per weight to
// select the next matrix), writes the blended position, normal and both
// colors to `dest`, copies the UVW channels through unchanged, and closes the
// non-localUVWChans branch.
//
// Second half: opens the `else` branch for localUVWChans != 0. The high and
// low bytes of localUVWChans pick two UVW channels that are zeroed and then
// blended like the normal. It again leaves the per-vertex loop, the
// per-weight loop and the `if( weights[ j ] )` block open; the multiply
// macros for position, normal and the two UVW channels go next, followed by
// BLENDVERTEND. The trailing block comment is part of this macro's
// definition and shows the intended sequence.
#define BLENDVERTMID \
} \
\
indices >>= 8; \
} \
/* Probably don't really need to renormalize this. The errors are
// going to be subtle and "smooth".*/\
/* hsFastMath::NormalizeAppr(destNorm);*/ \
\
/* Slam data into position now */\
dest = inlStuffPoint( dest, destPt ); \
dest = inlStuffPoint( dest, destNorm ); \
dest = inlStuffUInt32( dest, color ); \
dest = inlStuffUInt32( dest, specColor ); \
memcpy( dest, src, uvChanSize ); \
src += uvChanSize; \
dest += uvChanSize; \
} \
} \
else \
{ \
uint8_t hiChan = localUVWChans >> 8; \
uint8_t loChan = localUVWChans & 0xff; \
/* Copy whilst blending */\
for( i = 0; i < count; i++ ) \
{ \
hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \
\
/* Extract data */\
src = inlExtractPoint( src, pt ); \
for( j = 0, weightSum = 0; j < numWeights; j++ ) \
{ \
src = inlExtractFloat( src, weights[ j ] ); \
weightSum += weights[ j ]; \
} \
weights[ j ] = 1 - weightSum; \
\
if( format & plGBufferGroup::kSkinIndices ) \
{ \
src = inlExtractUInt32( src, indices ); \
} \
else \
{ \
indices = 1 << 8; \
} \
\
src = inlExtractPoint( src, vec ); \
src = inlExtractUInt32( src, color ); \
src = inlExtractUInt32( src, specColor ); \
\
uint8_t k; \
for( k = 0; k < numUVs; k++ ) \
{ \
src = inlExtractPoint( src, &srcUVWs[k] ); \
} \
memcpy( dstUVWs, srcUVWs, uvChanSize); \
dstUVWs[loChan].Set(0,0,0); \
dstUVWs[hiChan].Set(0,0,0); \
\
/* Blend */\
destPt->Set(0.f, 0.f, 0.f); \
destPt_buf[3] = 1.f; \
destNorm->Set(0.f, 0.f, 0.f); \
for( j = 0; j < numWeights + 1; j++ ) \
{ \
if( weights[ j ] ) \
{ \
/*
MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD(destPt, pt);
MATRIXMULTVECTORADD(destNorm, vec);
MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]);
*/
// Closing fragment of the vertex-blend function body; must follow
// BLENDVERTMID plus the multiply macros for the localUVWChans branch.
// Closes the `if( weights[ j ] )` block and the per-weight loop (shifting
// `indices` down 8 bits per weight), writes the blended position, normal and
// both colors to `dest`, then writes the locally blended UVW array (dstUVWs)
// instead of copying the source UVWs, and closes the per-vertex loop and the
// `else` branch.
#define BLENDVERTEND \
} \
\
indices >>= 8; \
} \
/* Probably don't really need to renormalize this. The errors are
// going to be subtle and "smooth". */\
/* hsFastMath::NormalizeAppr(destNorm); */\
/* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\
/* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\
\
/* Slam data into position now */\
dest = inlStuffPoint( dest, destPt ); \
dest = inlStuffPoint( dest, destNorm ); \
dest = inlStuffUInt32( dest, color ); \
dest = inlStuffUInt32( dest, specColor ); \
memcpy( dest, dstUVWs, uvChanSize ); \
dest += uvChanSize; \
} \
}
// Scalar (FPU) implementation of the skinned-vertex blend.
// The whole body is assembled from macros: BLENDVERTSTART / BLENDVERTMID /
// BLENDVERTEND supply the extraction loops and output code for the two
// branches (localUVWChans == 0 and != 0), and the MATRIXMULT*_FPU macros
// between them supply the per-weight transform that runs inside the
// `if( weights[ j ] )` block those fragments leave open.
// span, numMatrices, srcStride and destStride are not referenced by any of
// the macros expanded here.
void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
hsMatrix44* matrixPalette, int numMatrices,
const uint8_t *src, uint8_t format, uint32_t srcStride,
uint8_t *dest, uint32_t destStride, uint32_t count,
uint16_t localUVWChans )
{
BLENDVERTSTART
// Branch without local UVW channels: blend position and normal only.
MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
BLENDVERTMID
// Branch with local UVW channels: also blend the two selected UVW channels
// as vectors (no translation).
MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
BLENDVERTEND
}
void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
hsMatrix44* matrixPalette, int numMatrices,
const uint8_t *src, uint8_t format, uint32_t srcStride,
uint8_t *dest, uint32_t destStride, uint32_t count,
uint16_t localUVWChans )
__m128 hbuf1 = _mm_hadd_ps(_x, _y);
__m128 hbuf2 = _mm_hadd_ps(_z, _z);
hbuf1 = _mm_hadd_ps(hbuf1, hbuf2);
__m128 _dst = _mm_load_ps(dst);
_dst = _mm_add_ps(_dst, hbuf1);
_mm_store_ps(dst, _dst);
}
#endif // HS_SSE3
static inline void ISkinVertexSSE3(const hsMatrix44& xfm, float wgt,
const float* pt_src, float* pt_dst,
const float* vec_src, float* vec_dst)
{
#ifdef HS_SSE3
BLENDVERTSTART
MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
BLENDVERTMID
MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
BLENDVERTEND
__m128 mc0 = _mm_load_ps(xfm.fMap[0]);
__m128 mc1 = _mm_load_ps(xfm.fMap[1]);
__m128 mc2 = _mm_load_ps(xfm.fMap[2]);
__m128 mwt = _mm_set_ps1(wgt);
ISkinDpSSE3(pt_src, pt_dst, mc0, mc1, mc2, mwt);
ISkinDpSSE3(vec_src, vec_dst, mc0, mc1, mc2, mwt);
#endif // HS_SSE3
}
typedef void(*skin_vert_ptr)(const hsMatrix44&, float, const float*, float*, const float*, float*);
// Blends `count` skinned vertices from `src` into `dest`, stripping the
// weights and matrix indices from the output.
//
// The template argument T is the per-influence kernel (scalar or SSE3). For
// every non-zero weight it is called once and accumulates the weighted,
// transformed position and normal into the output scratch buffers.
//
// Per vertex, the input is read as: position, N weights (N comes from the
// skin-weight bits of `format`; the final weight is 1 - sum of the others),
// optional packed 8-bit matrix indices, normal, color, specular color, then
// the UVW channels. The output is: position, normal, color, specular color,
// then the UVW channels copied through unchanged.
//
// span, numMatrices, srcStride and destStride are not used here.
// localUVWChans must be 0; skinned UVW channels are no longer supported.
template<skin_vert_ptr T>
static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMatrices,
                             const uint8_t* src, uint8_t format, uint32_t srcStride,
                             uint8_t* dest, uint32_t destStride, uint32_t count,
                             uint16_t localUVWChans)
{
    // Scratch storage is padded to four floats and 16-byte aligned so the
    // kernels may use aligned 4-wide loads/stores on it.
    ALIGN(16) float srcPosBuf[] = { 0.f, 0.f, 0.f, 1.f };
    ALIGN(16) float srcNrmBuf[] = { 0.f, 0.f, 0.f, 0.f };
    ALIGN(16) float outPosBuf[4], outNrmBuf[4];

    // Typed views over the scratch storage for the extract/stuff helpers.
    hsPoint3*  srcPos = reinterpret_cast<hsPoint3*>(srcPosBuf);
    hsPoint3*  outPos = reinterpret_cast<hsPoint3*>(outPosBuf);
    hsVector3* srcNrm = reinterpret_cast<hsVector3*>(srcNrmBuf);
    hsVector3* outNrm = reinterpret_cast<hsVector3*>(outNrmBuf);

    uint32_t boneIndices, diffuse, specular;
    float boneWeights[4];

    const uint8_t uvCount = plGBufferGroup::CalcNumUVs(format);
    const uint32_t uvBytes = uvCount * sizeof(float) * 3;
    const uint8_t storedWeights = (format & plGBufferGroup::kSkinWeightMask) >> 4;
    // One more influence than stored weights: the last weight is implicit.
    const uint32_t influenceCount = storedWeights + 1;

    // Dropped support for localUVWChans at templatization of code
    hsAssert(localUVWChans == 0, "support for skinned UVWs dropped. reimplement me?");

    for (uint32_t remaining = count; remaining > 0; --remaining) {
        // --- Extract one source vertex ---
        src = inlExtractPoint(src, srcPos);

        float accumulated = 0.f;
        for (uint32_t w = 0; w < storedWeights; ++w) {
            src = inlExtractFloat(src, boneWeights[w]);
            accumulated += boneWeights[w];
        }
        boneWeights[storedWeights] = 1.f - accumulated;

        if (format & plGBufferGroup::kSkinIndices) {
            src = inlExtractUInt32(src, boneIndices);
        } else {
            boneIndices = 1 << 8;
        }

        src = inlExtractPoint(src, srcNrm);
        src = inlExtractUInt32(src, diffuse);
        src = inlExtractUInt32(src, specular);

        // --- Blend: reset accumulators, then sum every non-zero influence ---
        outPos->Set(0.f, 0.f, 0.f);
        outPosBuf[3] = 1.f;
        outNrm->Set(0.f, 0.f, 0.f);
        outNrmBuf[3] = 0.f;

        // Each influence's matrix index is the low byte of boneIndices.
        for (uint32_t w = 0; w < influenceCount; ++w, boneIndices >>= 8) {
            if (boneWeights[w] != 0.f) {
                T(matrixPalette[boneIndices & 0xFF], boneWeights[w],
                  srcPosBuf, outPosBuf, srcNrmBuf, outNrmBuf);
            }
        }

        // Renormalizing the normal is probably unnecessary: the errors are
        // going to be subtle and "smooth".
        /* hsFastMath::NormalizeAppr(outNrm); */

        // --- Write the blended vertex, passing the UVWs straight through ---
        dest = inlStuffPoint(dest, outPos);
        dest = inlStuffPoint(dest, outNrm);
        dest = inlStuffUInt32(dest, diffuse);
        dest = inlStuffUInt32(dest, specular);
        memcpy(dest, src, uvBytes);
        src += uvBytes;
        dest += uvBytes;
    }
}
// CPU-optimized functions requiring dispatch
// Definition of the static dispatcher for vertex blending. It is constructed
// with the scalar instantiation (IBlendVertBuffer<ISkinVertexFPU>) first and
// the SSE3 instantiation (IBlendVertBuffer<ISkinVertexSSE3>) last.
// NOTE(review): the two `0` arguments are presumably empty slots for other
// instruction-set variants - confirm against hsFunctionDispatcher's constructor.
hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(
IBlendVertBuffer<ISkinVertexFPU>, 0, 0, IBlendVertBuffer<ISkinVertexSSE3>);
// ISetPipeConsts //////////////////////////////////////////////////////////////////
// A shader can request that the pipeline fill in certain constants that are indeterminate
// until the pipeline is about to render the object the shader is applied to. For example,

2
Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h

@ -804,8 +804,6 @@ public:
// CPU-optimized functions
protected:
typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
};

Loading…
Cancel
Save