diff --git a/Sources/Plasma/CoreLib/CMakeLists.txt b/Sources/Plasma/CoreLib/CMakeLists.txt
index 3c23afba..cc169b75 100644
--- a/Sources/Plasma/CoreLib/CMakeLists.txt
+++ b/Sources/Plasma/CoreLib/CMakeLists.txt
@@ -74,6 +74,7 @@ endif(UNIX)
set(CoreLib_HEADERS
HeadSpin.h
+ hsAlignedAllocator.hpp
hsBiExpander.h
hsBitVector.h
hsBounds.h
diff --git a/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp
new file mode 100644
index 00000000..b3c2fcea
--- /dev/null
+++ b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp
@@ -0,0 +1,129 @@
+/*==LICENSE==*
+
+CyanWorlds.com Engine - MMOG client, server and tools
+Copyright (C) 2011 Cyan Worlds, Inc.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permissions under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
+NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
+JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
+(or a modified version of those libraries),
+containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
+PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
+JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
+licensors of this Program grant you additional
+permission to convey the resulting work. Corresponding Source for a
+non-source form of such a combination shall include the source code for
+the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
+work.
+
+You can contact Cyan Worlds, Inc. by email legal@cyan.com
+ or by snail mail at:
+ Cyan Worlds, Inc.
+ 14617 N Newport Hwy
+ Mead, WA 99021
+
+*==LICENSE==*/
+
+#ifndef _HS_ALIGNED_ALLOCATOR_H
+#define _HS_ALIGNED_ALLOCATOR_H
+
+#include "HeadSpin.h"
+
+/**
+ * An aligned allocator for storing SIMD ready values in STL containers.
+ * Stateless, so every instance compares equal. allocate() throws
+ * std::length_error above max_size() and std::bad_alloc on failure.
+ * \remarks Based on https://gist.github.com/donny-dont/1471329
+ */
+template<typename T, size_t ALIGNMENT = 16>
+class hsAlignedAllocator
+{
+    hsAlignedAllocator& operator=(const hsAlignedAllocator&) { return *this; } // kept private
+
+public:
+    template<typename U>
+    struct rebind
+    {
+        typedef hsAlignedAllocator<U, ALIGNMENT> other;
+    };
+
+    typedef T* pointer;
+    typedef const T* const_pointer;
+    typedef T& reference;
+    typedef const T& const_reference;
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef ptrdiff_t difference_type;
+
+    hsAlignedAllocator() { }
+    hsAlignedAllocator(const hsAlignedAllocator&) { }
+    template<typename U> hsAlignedAllocator(const hsAlignedAllocator<U, ALIGNMENT>&) { }
+    ~hsAlignedAllocator() { }
+
+    pointer address(reference r) const { return &r; }
+    const_pointer address(const_reference r) const { return &r; }
+
+    /** Returns raw storage for \a size objects aligned to ALIGNMENT bytes (nullptr for 0); \a hint is unused. */
+    pointer allocate(size_type size, const_pointer hint=nullptr)
+    {
+        if (size == 0)
+            return nullptr;
+        if (size > max_size())
+            throw std::length_error("integer overflow");
+
+#ifdef HS_BUILD_FOR_WIN32
+        void* ptr = _aligned_malloc(size * sizeof(value_type), ALIGNMENT);
+#else
+        void* ptr = nullptr;
+        if (posix_memalign(&ptr, ALIGNMENT, size * sizeof(value_type)) != 0)
+            ptr = nullptr; // failure is signalled by the return code, so never trust ptr here
+#endif // HS_BUILD_FOR_WIN32
+
+        if (!ptr)
+            throw std::bad_alloc();
+        return static_cast<pointer>(ptr);
+    }
+
+    /** Copy-constructs a value_type from \a t in the raw storage at \a p. */
+    void construct(T* const p, const_reference t) const
+    {
+        void * const pv = static_cast<void*>(p);
+        new (pv) value_type(t);
+    }
+
+    /** Releases storage previously returned by allocate(); \a size is ignored. */
+    void deallocate(pointer ptr, size_type size)
+    {
+#ifdef HS_BUILD_FOR_WIN32
+        _aligned_free(ptr);
+#else
+        free(ptr);
+#endif // HS_BUILD_FOR_WIN32
+    }
+
+    void destroy(T* const p) const { p->~T(); } // runs the destructor only; storage is untouched
+
+    /** Largest element count whose byte size still fits in size_type. */
+    size_type max_size() const { return static_cast<size_type>(-1) / sizeof(value_type); }
+
+    bool operator==(const hsAlignedAllocator& other) const { return true; }
+    bool operator!=(const hsAlignedAllocator& other) const { return false; }
+};
+
+#endif // _HS_ALIGNED_ALLOCATOR_H
diff --git a/Sources/Plasma/CoreLib/hsMatrix44.h b/Sources/Plasma/CoreLib/hsMatrix44.h
index 34ee618a..346a3ed7 100644
--- a/Sources/Plasma/CoreLib/hsMatrix44.h
+++ b/Sources/Plasma/CoreLib/hsMatrix44.h
@@ -61,7 +61,11 @@ struct hsMatrix44 {
kView
};
float fMap[4][4];
- uint32_t fFlags;
+ union
+ {
+ uint8_t alignment[16];
+ uint32_t fFlags;
+ };
hsMatrix44() : fFlags(0) {}
hsMatrix44(const hsScalarTriple &translate, const hsQuat &rotate);
diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
index 603f793b..c8301139 100644
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
@@ -1045,10 +1045,10 @@ void plDrawableSpans::Read( hsStream* s, hsResMgr* mgr )
/// Read in the matrix palette (if any)
count = s->ReadLE32();
- fLocalToWorlds.SetCount(count);
- fWorldToLocals.SetCount(count);
- fLocalToBones.SetCount(count);
- fBoneToLocals.SetCount(count);
+ fLocalToWorlds.resize(count);
+ fWorldToLocals.resize(count);
+ fLocalToBones.resize(count);
+ fBoneToLocals.resize(count);
for( i = 0; i < count; i++ )
{
fLocalToWorlds[i].Read(s);
@@ -2209,16 +2209,12 @@ uint32_t plDrawableSpans::AppendDIMatrixSpans(int n)
if( fNeedCleanup )
IRemoveGarbage();
- uint32_t baseIdx = fLocalToWorlds.GetCount();
- fLocalToWorlds.Expand(baseIdx + n);
- fLocalToWorlds.SetCount(baseIdx + n);
- fWorldToLocals.Expand(baseIdx + n);
- fWorldToLocals.SetCount(baseIdx + n);
+ uint32_t baseIdx = fLocalToWorlds.size();
+ fLocalToWorlds.resize(baseIdx + n);
+ fWorldToLocals.resize(baseIdx + n);
- fLocalToBones.Expand(baseIdx + n);
- fLocalToBones.SetCount(baseIdx + n);
- fBoneToLocals.Expand(baseIdx + n);
- fBoneToLocals.SetCount(baseIdx + n);
+ fLocalToBones.resize(baseIdx + n);
+ fBoneToLocals.resize(baseIdx + n);
int i;
for( i = baseIdx; i < baseIdx + n; i++ )
@@ -2267,7 +2263,7 @@ uint32_t plDrawableSpans::FindBoneBaseMatrix(const hsTArray& initL2B
// runtime, a sharable bone pallete won't be found by scanning fSpans.
// We have to do a larger search through all bone matrices.
int i;
- for( i = 0; i + initL2B.GetCount() < fLocalToBones.GetCount(); i++ )
+ for( i = 0; i + initL2B.GetCount() < fLocalToBones.size(); i++ )
{
int j;
for( j = 0; j < initL2B.GetCount(); j++ )
@@ -2894,7 +2890,7 @@ void plDrawableSpans::ICleanupMatrices()
}
}
- for( j = 0; j < fLocalToWorlds.GetCount(); j++ )
+ for( j = 0; j < fLocalToWorlds.size(); j++ )
{
if( !usedMatrices.IsBitSet(j) )
{
@@ -2910,7 +2906,7 @@ void plDrawableSpans::ICleanupMatrices()
}
}
}
- for( i = j+1; i < fLocalToWorlds.GetCount(); i++ )
+ for( i = j+1; i < fLocalToWorlds.size(); i++ )
{
fLocalToWorlds[i] = fLocalToWorlds[i-1];
fWorldToLocals[i] = fWorldToLocals[i-1];
diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
index 58c054b2..796496c5 100644
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
@@ -63,13 +63,14 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#ifndef _plDrawableSpans_h
#define _plDrawableSpans_h
-
+#include "hsAlignedAllocator.hpp"
#include "hsBitVector.h"
#include "hsTemplates.h"
#include "plDrawable.h"
#include "hsBounds.h"
#include "hsMatrix44.h"
#include "plSpanTypes.h"
+#include
class plPipeline;
class plMessage;
@@ -131,11 +132,11 @@ class plDrawableSpans : public plDrawable
hsMatrix44 fLocalToWorld;
hsMatrix44 fWorldToLocal;
- hsTArray fLocalToWorlds;
- hsTArray fWorldToLocals;
+ std::vector> fLocalToWorlds; // used in SIMD skinning
+ std::vector fWorldToLocals;
- hsTArray fLocalToBones;
- hsTArray fBoneToLocals;
+ std::vector fLocalToBones;
+ std::vector fBoneToLocals;
hsTArray fMaterials;
@@ -283,7 +284,7 @@ class plDrawableSpans : public plDrawable
virtual uint32_t GetNumSpans( void ) const { return fSpans.GetCount(); }
virtual const hsTArray &GetSpanArray( void ) const { return fSpans; }
- hsMatrix44* GetMatrixPalette(int baseMatrix) const { return &fLocalToWorlds[baseMatrix]; }
+ hsMatrix44* GetMatrixPalette(int baseMatrix) const { return const_cast<hsMatrix44*>(&fLocalToWorlds[baseMatrix]); } // const accessor still hands out a mutable pointer into fLocalToWorlds
const hsMatrix44& GetPaletteMatrix(int i) const { return fLocalToWorlds[i]; }
void SetInitialBone(int i, const hsMatrix44& l2b, const hsMatrix44& b2l);
diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp
index 27078fb9..0abb0d8f 100644
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp
@@ -163,7 +163,7 @@ void plDrawableSpans::Write( hsStream* s, hsResMgr* mgr )
fSourceSpans[ i ]->Write( s );
}
- count = fLocalToWorlds.GetCount();
+ count = fLocalToWorlds.size();
s->WriteLE32(count);
for( i = 0; i < count; i++ )
{
diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
index ca51e86d..d4ad28d4 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@@ -203,10 +203,10 @@ void plReleaseObject(IUnknown* x)
//// Local Static Stuff ///////////////////////////////////////////////////////
/// Macros for getting/setting data in a D3D vertex buffer
-inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple& point )
+inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple* point )
{
register float* dst = (float*)ptr;
- register const float* src = (float*)&point.fX;
+ register const float* src = (float*)&point->fX;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
@@ -217,10 +217,10 @@ inline uint8_t* inlStuffUInt32( uint8_t* ptr, const uint32_t uint )
*(uint32_t*)ptr = uint;
return ptr + sizeof(uint);
}
-inline uint8_t* inlExtractPoint( const uint8_t* ptr, const hsScalarTriple& pt )
+inline uint8_t* inlExtractPoint( const uint8_t* ptr, hsScalarTriple* pt )
{
register const float* src = (float*)ptr;
- register float* dst = (float*)&pt.fX;
+ register float* dst = (float*)&pt->fX;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
@@ -10617,14 +10617,13 @@ inline void inlTESTPOINT(const hsPoint3& destP,
// SSE3 version
#ifdef HS_SSE3
#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
- __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
- ALIGN(16) float hack[4]; \
- mc0 = _mm_loadu_ps(xfm.fMap[0]); \
- mc1 = _mm_loadu_ps(xfm.fMap[1]); \
- mc2 = _mm_loadu_ps(xfm.fMap[2]); \
+ __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; /* temporaries consumed by the MATRIXMULT*ADD_SSE3 macros */ \
+ mc0 = _mm_load_ps(xfm.fMap[0]); /* aligned load (was _mm_loadu_ps): xfm.fMap must be 16-byte aligned */ \
+ mc1 = _mm_load_ps(xfm.fMap[1]); /* each row is 4 floats, so rows stay aligned when fMap[0] is */ \
+ mc2 = _mm_load_ps(xfm.fMap[2]); /* only rows 0-2 are loaded here; fMap[3] is not read */ \
mwt = _mm_set_ps1(wgt);
-#define MATRIXMULTPOINTADD_SSE3(dst, src) \
- msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
+#define MATRIXMULTBUFADD_SSE3(dst, src) \
+ msr = _mm_load_ps(src); \
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
_y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
@@ -10632,10 +10631,9 @@ inline void inlTESTPOINT(const hsPoint3& destP,
hbuf1 = _mm_hadd_ps(_x, _y); \
hbuf2 = _mm_hadd_ps(_z, _z); \
hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
- _mm_store_ps(hack, hbuf1); \
- dst.fX += hack[0]; \
- dst.fY += hack[1]; \
- dst.fZ += hack[2];
+ _dst = _mm_load_ps(dst); \
+ _dst = _mm_add_ps(_dst, hbuf1); \
+ _mm_store_ps(dst, _dst);
#define MATRIXMULTVECTORADD_SSE3(dst, src) \
msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
@@ -10645,10 +10643,13 @@ inline void inlTESTPOINT(const hsPoint3& destP,
hbuf1 = _mm_hadd_ps(_x, _y); \
hbuf2 = _mm_hadd_ps(_z, _z); \
hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
- _mm_store_ps(hack, hbuf1); \
- dst.fX += hack[0]; \
- dst.fY += hack[1]; \
- dst.fZ += hack[2];
+ { \
+ ALIGN(16) float hack[4]; \
+ _mm_store_ps(hack, hbuf1); \
+ dst.fX += hack[0]; \
+ dst.fY += hack[1]; \
+ dst.fZ += hack[2]; \
+ }
#endif
// CPU-optimized functions requiring dispatch
@@ -10656,11 +10657,17 @@ hsFunctionDispatcher plDXPipeline::blend_ve
// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
#define BLENDVERTSTART \
- uint8_t numUVs, numWeights; \
- uint32_t i, j, indices, color, specColor, uvChanSize; \
- float weights[ 4 ], weightSum; \
- hsPoint3 pt, tempPt, destPt; \
- hsVector3 vec, tempNorm, destNorm; \
+ ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \
+ ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \
+ ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \
+ hsPoint3* pt = reinterpret_cast(pt_buf); \
+ hsPoint3* destPt = reinterpret_cast(destPt_buf); \
+ hsVector3* vec = reinterpret_cast(vec_buf); \
+ hsVector3* destNorm = reinterpret_cast(destNorm_buf); \
+ \
+ uint8_t numUVs, numWeights; \
+ uint32_t i, j, indices, color, specColor, uvChanSize; \
+ float weights[ 4 ], weightSum; \
\
/* Get some counts */\
switch( format & plGBufferGroup::kSkinWeightMask ) \
@@ -10706,8 +10713,9 @@ hsFunctionDispatcher plDXPipeline::blend_ve
src = inlExtractUInt32( src, specColor ); \
\
/* Blend */\
- destPt.Set( 0, 0, 0 ); \
- destNorm.Set( 0, 0, 0 ); \
+ destPt->Set(0.f, 0.f, 0.f); \
+ destPt_buf[3] = 1.f; \
+ destNorm->Set(0.f, 0.f, 0.f); \
for( j = 0; j < numWeights + 1; j++ ) \
{ \
if( weights[ j ] ) \
@@ -10772,15 +10780,16 @@ hsFunctionDispatcher plDXPipeline::blend_ve
uint8_t k; \
for( k = 0; k < numUVs; k++ ) \
{ \
- src = inlExtractPoint( src, srcUVWs[k] ); \
+ src = inlExtractPoint( src, &srcUVWs[k] ); \
} \
memcpy( dstUVWs, srcUVWs, uvChanSize); \
dstUVWs[loChan].Set(0,0,0); \
dstUVWs[hiChan].Set(0,0,0); \
\
/* Blend */\
- destPt.Set( 0, 0, 0 ); \
- destNorm.Set( 0, 0, 0 ); \
+ destPt->Set(0.f, 0.f, 0.f); \
+ destPt_buf[3] = 1.f; \
+ destNorm->Set(0.f, 0.f, 0.f); \
for( j = 0; j < numWeights + 1; j++ ) \
{ \
if( weights[ j ] ) \
@@ -10823,13 +10832,13 @@ void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
BLENDVERTSTART
MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
- MATRIXMULTPOINTADD_FPU(destPt, pt);
- MATRIXMULTVECTORADD_FPU(destNorm, vec);
+ MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
+ MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
BLENDVERTMID
MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
- MATRIXMULTPOINTADD_FPU(destPt, pt);
- MATRIXMULTVECTORADD_FPU(destNorm, vec);
+ MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
+ MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
@@ -10846,13 +10855,13 @@ void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
BLENDVERTSTART
MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
- MATRIXMULTPOINTADD_SSE3(destPt, pt);
- MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+ MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
+ MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
BLENDVERTMID
MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
- MATRIXMULTPOINTADD_SSE3(destPt, pt);
- MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+ MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
+ MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
BLENDVERTEND