diff --git a/Sources/Plasma/CoreLib/CMakeLists.txt b/Sources/Plasma/CoreLib/CMakeLists.txt index 3c23afba..cc169b75 100644 --- a/Sources/Plasma/CoreLib/CMakeLists.txt +++ b/Sources/Plasma/CoreLib/CMakeLists.txt @@ -74,6 +74,7 @@ endif(UNIX) set(CoreLib_HEADERS HeadSpin.h + hsAlignedAllocator.hpp hsBiExpander.h hsBitVector.h hsBounds.h diff --git a/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp new file mode 100644 index 00000000..b3c2fcea --- /dev/null +++ b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp @@ -0,0 +1,129 @@ +/*==LICENSE==* + +CyanWorlds.com Engine - MMOG client, server and tools +Copyright (C) 2011 Cyan Worlds, Inc. + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. + +Additional permissions under GNU GPL version 3 section 7 + +If you modify this Program, or any covered work, by linking or +combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK, +NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent +JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK +(or a modified version of those libraries), +containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA, +PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG +JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the +licensors of this Program grant you additional +permission to convey the resulting work. 
Corresponding Source for a +non-source form of such a combination shall include the source code for +the parts of OpenSSL and IJG JPEG Library used as well as that of the covered +work. + +You can contact Cyan Worlds, Inc. by email legal@cyan.com + or by snail mail at: + Cyan Worlds, Inc. + 14617 N Newport Hwy + Mead, WA 99021 + +*==LICENSE==*/ + +#ifndef _HS_ALIGNED_ALLOCATOR_H +#define _HS_ALIGNED_ALLOCATOR_H + +#include "HeadSpin.h" + +template + +/** + * An aligned allocator for storing SIMD ready values in STL containers + * \remarks Based on https://gist.github.com/donny-dont/1471329 + */ +class hsAlignedAllocator +{ + hsAlignedAllocator& operator=(const hsAlignedAllocator&) { } + +public: + template + struct rebind + { + typedef hsAlignedAllocator other; + }; + + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + + hsAlignedAllocator() { } + hsAlignedAllocator(const hsAlignedAllocator&) { } + template hsAlignedAllocator(const hsAlignedAllocator&) { } + ~hsAlignedAllocator() { } + + pointer address(reference r) const { return &r; } + const_pointer address(const_reference r) const { return &r; } + + pointer allocate(size_type size, const_pointer hint=nullptr) + { + if (size == 0) + return nullptr; + if (size > max_size()) + throw std::length_error("integer overflow"); + +#ifdef HS_BUILD_FOR_WIN32 + void* ptr = _aligned_malloc(size * sizeof(value_type), ALIGNMENT); +#else + void* ptr = nullptr; + posix_memalign(&ptr, ALIGNMENT, size * sizeof(value_type)); +#endif // HS_BUILD_FOR_WIN32 + + if (!ptr) + throw std::bad_alloc(); + return static_cast(ptr); + } + + void construct(T* const p, const_reference t) const + { + void * const pv = static_cast(p); + new (pv) value_type(t); + } + + + void deallocate(pointer ptr, size_type size) + { +#ifdef HS_BUILD_FOR_WIN32 + _aligned_free(ptr); +#else + free(ptr); +#endif // 
HS_BUILD_FOR_WIN32 + } + + void destroy(T* const p) const + { + p->~T(); + } + + size_type max_size() const + { + return static_cast(-1) / sizeof(value_type); + } + + bool operator==(const hsAlignedAllocator& other) const { return true; } +}; + +#endif // _HS_ALIGNED_ALLOCATOR_H diff --git a/Sources/Plasma/CoreLib/hsMatrix44.h b/Sources/Plasma/CoreLib/hsMatrix44.h index 34ee618a..346a3ed7 100644 --- a/Sources/Plasma/CoreLib/hsMatrix44.h +++ b/Sources/Plasma/CoreLib/hsMatrix44.h @@ -61,7 +61,11 @@ struct hsMatrix44 { kView }; float fMap[4][4]; - uint32_t fFlags; + union + { + uint8_t alignment[16]; + uint32_t fFlags; + }; hsMatrix44() : fFlags(0) {} hsMatrix44(const hsScalarTriple &translate, const hsQuat &rotate); diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp index 603f793b..c8301139 100644 --- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp +++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp @@ -1045,10 +1045,10 @@ void plDrawableSpans::Read( hsStream* s, hsResMgr* mgr ) /// Read in the matrix palette (if any) count = s->ReadLE32(); - fLocalToWorlds.SetCount(count); - fWorldToLocals.SetCount(count); - fLocalToBones.SetCount(count); - fBoneToLocals.SetCount(count); + fLocalToWorlds.resize(count); + fWorldToLocals.resize(count); + fLocalToBones.resize(count); + fBoneToLocals.resize(count); for( i = 0; i < count; i++ ) { fLocalToWorlds[i].Read(s); @@ -2209,16 +2209,12 @@ uint32_t plDrawableSpans::AppendDIMatrixSpans(int n) if( fNeedCleanup ) IRemoveGarbage(); - uint32_t baseIdx = fLocalToWorlds.GetCount(); - fLocalToWorlds.Expand(baseIdx + n); - fLocalToWorlds.SetCount(baseIdx + n); - fWorldToLocals.Expand(baseIdx + n); - fWorldToLocals.SetCount(baseIdx + n); + uint32_t baseIdx = fLocalToWorlds.size(); + fLocalToWorlds.resize(baseIdx + n); + fWorldToLocals.resize(baseIdx + n); - fLocalToBones.Expand(baseIdx + n); - fLocalToBones.SetCount(baseIdx + n); - 
fBoneToLocals.Expand(baseIdx + n); - fBoneToLocals.SetCount(baseIdx + n); + fLocalToBones.resize(baseIdx + n); + fBoneToLocals.resize(baseIdx + n); int i; for( i = baseIdx; i < baseIdx + n; i++ ) @@ -2267,7 +2263,7 @@ uint32_t plDrawableSpans::FindBoneBaseMatrix(const hsTArray& initL2B // runtime, a sharable bone pallete won't be found by scanning fSpans. // We have to do a larger search through all bone matrices. int i; - for( i = 0; i + initL2B.GetCount() < fLocalToBones.GetCount(); i++ ) + for( i = 0; i + initL2B.GetCount() < fLocalToBones.size(); i++ ) { int j; for( j = 0; j < initL2B.GetCount(); j++ ) @@ -2894,7 +2890,7 @@ void plDrawableSpans::ICleanupMatrices() } } - for( j = 0; j < fLocalToWorlds.GetCount(); j++ ) + for( j = 0; j < fLocalToWorlds.size(); j++ ) { if( !usedMatrices.IsBitSet(j) ) { @@ -2910,7 +2906,7 @@ void plDrawableSpans::ICleanupMatrices() } } } - for( i = j+1; i < fLocalToWorlds.GetCount(); i++ ) + for( i = j+1; i < fLocalToWorlds.size(); i++ ) { fLocalToWorlds[i] = fLocalToWorlds[i-1]; fWorldToLocals[i] = fWorldToLocals[i-1]; diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h index 58c054b2..796496c5 100644 --- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h +++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h @@ -63,13 +63,14 @@ You can contact Cyan Worlds, Inc. 
by email legal@cyan.com #ifndef _plDrawableSpans_h #define _plDrawableSpans_h - +#include "hsAlignedAllocator.hpp" #include "hsBitVector.h" #include "hsTemplates.h" #include "plDrawable.h" #include "hsBounds.h" #include "hsMatrix44.h" #include "plSpanTypes.h" +#include class plPipeline; class plMessage; @@ -131,11 +132,11 @@ class plDrawableSpans : public plDrawable hsMatrix44 fLocalToWorld; hsMatrix44 fWorldToLocal; - hsTArray fLocalToWorlds; - hsTArray fWorldToLocals; + std::vector> fLocalToWorlds; // used in SIMD skinning + std::vector fWorldToLocals; - hsTArray fLocalToBones; - hsTArray fBoneToLocals; + std::vector fLocalToBones; + std::vector fBoneToLocals; hsTArray fMaterials; @@ -283,7 +284,7 @@ class plDrawableSpans : public plDrawable virtual uint32_t GetNumSpans( void ) const { return fSpans.GetCount(); } virtual const hsTArray &GetSpanArray( void ) const { return fSpans; } - hsMatrix44* GetMatrixPalette(int baseMatrix) const { return &fLocalToWorlds[baseMatrix]; } + hsMatrix44* GetMatrixPalette(int baseMatrix) const { return const_cast(&fLocalToWorlds[baseMatrix]); } const hsMatrix44& GetPaletteMatrix(int i) const { return fLocalToWorlds[i]; } void SetInitialBone(int i, const hsMatrix44& l2b, const hsMatrix44& b2l); diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp index 27078fb9..0abb0d8f 100644 --- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp +++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp @@ -163,7 +163,7 @@ void plDrawableSpans::Write( hsStream* s, hsResMgr* mgr ) fSourceSpans[ i ]->Write( s ); } - count = fLocalToWorlds.GetCount(); + count = fLocalToWorlds.size(); s->WriteLE32(count); for( i = 0; i < count; i++ ) { diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp index ca51e86d..d4ad28d4 100644 --- 
a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp @@ -203,10 +203,10 @@ void plReleaseObject(IUnknown* x) //// Local Static Stuff /////////////////////////////////////////////////////// /// Macros for getting/setting data in a D3D vertex buffer -inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple& point ) +inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple* point ) { register float* dst = (float*)ptr; - register const float* src = (float*)&point.fX; + register const float* src = (float*)&point->fX; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; @@ -217,10 +217,10 @@ inline uint8_t* inlStuffUInt32( uint8_t* ptr, const uint32_t uint ) *(uint32_t*)ptr = uint; return ptr + sizeof(uint); } -inline uint8_t* inlExtractPoint( const uint8_t* ptr, const hsScalarTriple& pt ) +inline uint8_t* inlExtractPoint( const uint8_t* ptr, hsScalarTriple* pt ) { register const float* src = (float*)ptr; - register float* dst = (float*)&pt.fX; + register float* dst = (float*)&pt->fX; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; @@ -10617,14 +10617,13 @@ inline void inlTESTPOINT(const hsPoint3& destP, // SSE3 version #ifdef HS_SSE3 #define MATRIXMULTBEGIN_SSE3(xfm, wgt) \ - __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \ - ALIGN(16) float hack[4]; \ - mc0 = _mm_loadu_ps(xfm.fMap[0]); \ - mc1 = _mm_loadu_ps(xfm.fMap[1]); \ - mc2 = _mm_loadu_ps(xfm.fMap[2]); \ + __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \ + mc0 = _mm_load_ps(xfm.fMap[0]); \ + mc1 = _mm_load_ps(xfm.fMap[1]); \ + mc2 = _mm_load_ps(xfm.fMap[2]); \ mwt = _mm_set_ps1(wgt); -#define MATRIXMULTPOINTADD_SSE3(dst, src) \ - msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \ +#define MATRIXMULTBUFADD_SSE3(dst, src) \ + msr = _mm_load_ps(src); \ _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ @@ -10632,10 +10631,9 
@@ inline void inlTESTPOINT(const hsPoint3& destP, hbuf1 = _mm_hadd_ps(_x, _y); \ hbuf2 = _mm_hadd_ps(_z, _z); \ hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ - _mm_store_ps(hack, hbuf1); \ - dst.fX += hack[0]; \ - dst.fY += hack[1]; \ - dst.fZ += hack[2]; + _dst = _mm_load_ps(dst); \ + _dst = _mm_add_ps(_dst, hbuf1); \ + _mm_store_ps(dst, _dst); #define MATRIXMULTVECTORADD_SSE3(dst, src) \ msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \ _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ @@ -10645,10 +10643,13 @@ inline void inlTESTPOINT(const hsPoint3& destP, hbuf1 = _mm_hadd_ps(_x, _y); \ hbuf2 = _mm_hadd_ps(_z, _z); \ hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ - _mm_store_ps(hack, hbuf1); \ - dst.fX += hack[0]; \ - dst.fY += hack[1]; \ - dst.fZ += hack[2]; + { \ + ALIGN(16) float hack[4]; \ + _mm_store_ps(hack, hbuf1); \ + dst.fX += hack[0]; \ + dst.fY += hack[1]; \ + dst.fZ += hack[2]; \ + } #endif // CPU-optimized functions requiring dispatch @@ -10656,11 +10657,17 @@ hsFunctionDispatcher plDXPipeline::blend_ve // Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication #define BLENDVERTSTART \ - uint8_t numUVs, numWeights; \ - uint32_t i, j, indices, color, specColor, uvChanSize; \ - float weights[ 4 ], weightSum; \ - hsPoint3 pt, tempPt, destPt; \ - hsVector3 vec, tempNorm, destNorm; \ + ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \ + ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \ + ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \ + hsPoint3* pt = reinterpret_cast(pt_buf); \ + hsPoint3* destPt = reinterpret_cast(destPt_buf); \ + hsVector3* vec = reinterpret_cast(vec_buf); \ + hsVector3* destNorm = reinterpret_cast(destNorm_buf); \ + \ + uint8_t numUVs, numWeights; \ + uint32_t i, j, indices, color, specColor, uvChanSize; \ + float weights[ 4 ], weightSum; \ \ /* Get some counts */\ switch( format & plGBufferGroup::kSkinWeightMask ) \ @@ -10706,8 +10713,9 @@ hsFunctionDispatcher plDXPipeline::blend_ve src = inlExtractUInt32( src, specColor ); \ \ 
/* Blend */\ - destPt.Set( 0, 0, 0 ); \ - destNorm.Set( 0, 0, 0 ); \ + destPt->Set(0.f, 0.f, 0.f); \ + destPt_buf[3] = 1.f; \ + destNorm->Set(0.f, 0.f, 0.f); \ for( j = 0; j < numWeights + 1; j++ ) \ { \ if( weights[ j ] ) \ @@ -10772,15 +10780,16 @@ hsFunctionDispatcher plDXPipeline::blend_ve uint8_t k; \ for( k = 0; k < numUVs; k++ ) \ { \ - src = inlExtractPoint( src, srcUVWs[k] ); \ + src = inlExtractPoint( src, &srcUVWs[k] ); \ } \ memcpy( dstUVWs, srcUVWs, uvChanSize); \ dstUVWs[loChan].Set(0,0,0); \ dstUVWs[hiChan].Set(0,0,0); \ \ /* Blend */\ - destPt.Set( 0, 0, 0 ); \ - destNorm.Set( 0, 0, 0 ); \ + destPt->Set(0.f, 0.f, 0.f); \ + destPt_buf[3] = 1.f; \ + destNorm->Set(0.f, 0.f, 0.f); \ for( j = 0; j < numWeights + 1; j++ ) \ { \ if( weights[ j ] ) \ @@ -10823,13 +10832,13 @@ void plDXPipeline::blend_vert_buffer_fpu( plSpan* span, BLENDVERTSTART MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); - MATRIXMULTPOINTADD_FPU(destPt, pt); - MATRIXMULTVECTORADD_FPU(destNorm, vec); + MATRIXMULTPOINTADD_FPU((*destPt), (*pt)); + MATRIXMULTVECTORADD_FPU((*destNorm), (*vec)); BLENDVERTMID MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); - MATRIXMULTPOINTADD_FPU(destPt, pt); - MATRIXMULTVECTORADD_FPU(destNorm, vec); + MATRIXMULTPOINTADD_FPU((*destPt), (*pt)); + MATRIXMULTVECTORADD_FPU((*destNorm), (*vec)); MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]); MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]); @@ -10846,13 +10855,13 @@ void plDXPipeline::blend_vert_buffer_sse3( plSpan* span, BLENDVERTSTART MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); - MATRIXMULTPOINTADD_SSE3(destPt, pt); - MATRIXMULTVECTORADD_SSE3(destNorm, vec); + MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf); + MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf); BLENDVERTMID MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); - MATRIXMULTPOINTADD_SSE3(destPt, pt); - MATRIXMULTVECTORADD_SSE3(destNorm, vec); + 
MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf); + MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf); MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]); MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]); BLENDVERTEND