diff --git a/Sources/Plasma/CoreLib/CMakeLists.txt b/Sources/Plasma/CoreLib/CMakeLists.txt
index 3c23afba..cc169b75 100644
--- a/Sources/Plasma/CoreLib/CMakeLists.txt
+++ b/Sources/Plasma/CoreLib/CMakeLists.txt
@@ -74,6 +74,7 @@ endif(UNIX)
set(CoreLib_HEADERS
HeadSpin.h
+ hsAlignedAllocator.hpp
hsBiExpander.h
hsBitVector.h
hsBounds.h
diff --git a/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp
new file mode 100644
index 00000000..b3c2fcea
--- /dev/null
+++ b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp
@@ -0,0 +1,129 @@
+/*==LICENSE==*
+
+CyanWorlds.com Engine - MMOG client, server and tools
+Copyright (C) 2011 Cyan Worlds, Inc.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permissions under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
+NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
+JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
+(or a modified version of those libraries),
+containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
+PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
+JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
+licensors of this Program grant you additional
+permission to convey the resulting work. Corresponding Source for a
+non-source form of such a combination shall include the source code for
+the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
+work.
+
+You can contact Cyan Worlds, Inc. by email legal@cyan.com
+ or by snail mail at:
+ Cyan Worlds, Inc.
+ 14617 N Newport Hwy
+ Mead, WA 99021
+
+*==LICENSE==*/
+
+#ifndef _HS_ALIGNED_ALLOCATOR_H
+#define _HS_ALIGNED_ALLOCATOR_H
+
+#include "HeadSpin.h"
+
+/**
+ * An aligned allocator for storing SIMD ready values in STL containers.
+ * \tparam T          element type the allocator hands out storage for
+ * \tparam ALIGNMENT  byte alignment of every block; posix_memalign needs a power
+ *                    of two multiple of sizeof(void*). Defaults to 16 for SSE.
+ * \remarks Based on https://gist.github.com/donny-dont/1471329
+ */
+template<typename T, size_t ALIGNMENT = 16>
+class hsAlignedAllocator
+{
+public:
+    template<typename U>
+    struct rebind
+    {
+        typedef hsAlignedAllocator<U, ALIGNMENT> other;
+    };
+
+    typedef T* pointer;
+    typedef const T* const_pointer;
+    typedef T& reference;
+    typedef const T& const_reference;
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef ptrdiff_t difference_type;
+
+    hsAlignedAllocator() { }
+    hsAlignedAllocator(const hsAlignedAllocator&) { }
+    template<typename U> hsAlignedAllocator(const hsAlignedAllocator<U, ALIGNMENT>&) { }
+    ~hsAlignedAllocator() { }
+    // Not assignable; deleted rather than given a body that returns nothing.
+    hsAlignedAllocator& operator=(const hsAlignedAllocator&) = delete;
+
+    pointer address(reference r) const { return &r; }
+    const_pointer address(const_reference r) const { return &r; }
+
+    /**
+     * Allocates uninitialized storage for \a size objects, aligned to ALIGNMENT.
+     * \return nullptr when \a size is 0, otherwise the aligned block
+     * \throws std::length_error if \a size exceeds max_size()
+     * \throws std::bad_alloc if the platform allocator fails
+     */
+    pointer allocate(size_type size, const_pointer /*hint*/ = nullptr)
+    {
+        if (size == 0)
+            return nullptr;
+        if (size > max_size())
+            throw std::length_error("integer overflow");
+#ifdef HS_BUILD_FOR_WIN32
+        void* ptr = _aligned_malloc(size * sizeof(value_type), ALIGNMENT);
+#else
+        void* ptr = nullptr;
+        if (posix_memalign(&ptr, ALIGNMENT, size * sizeof(value_type)) != 0) // failure is an error code
+            ptr = nullptr;
+#endif // HS_BUILD_FOR_WIN32
+        if (!ptr)
+            throw std::bad_alloc();
+        return static_cast<pointer>(ptr);
+    }
+
+    /** Releases a block obtained from allocate(); the element count is unused. */
+    void deallocate(pointer ptr, size_type /*size*/)
+    {
+#ifdef HS_BUILD_FOR_WIN32
+        _aligned_free(ptr);
+#else
+        free(ptr);
+#endif // HS_BUILD_FOR_WIN32
+    }
+
+    /** Copy-constructs \a t into the raw storage at \a p. */
+    void construct(T* const p, const_reference t) const { new (static_cast<void*>(p)) value_type(t); }
+    /** Runs the destructor of the object at \a p without freeing its storage. */
+    void destroy(T* const p) const { p->~T(); }
+    /** Largest element count whose total byte size still fits in size_type. */
+    size_type max_size() const { return static_cast<size_type>(-1) / sizeof(value_type); }
+
+    // The allocator is stateless, so any two instances are interchangeable.
+    bool operator==(const hsAlignedAllocator&) const { return true; }
+    bool operator!=(const hsAlignedAllocator&) const { return false; }
+};
+
+#endif // _HS_ALIGNED_ALLOCATOR_H
diff --git a/Sources/Plasma/CoreLib/hsMatrix44.h b/Sources/Plasma/CoreLib/hsMatrix44.h
index 34ee618a..346a3ed7 100644
--- a/Sources/Plasma/CoreLib/hsMatrix44.h
+++ b/Sources/Plasma/CoreLib/hsMatrix44.h
@@ -61,7 +61,11 @@ struct hsMatrix44 {
kView
};
float fMap[4][4];
- uint32_t fFlags;
+ union
+ {
+ uint8_t alignment[16];
+ uint32_t fFlags;
+ };
hsMatrix44() : fFlags(0) {}
hsMatrix44(const hsScalarTriple &translate, const hsQuat &rotate);
diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
index 3c1d12d7..796496c5 100644
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
@@ -63,7 +63,7 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#ifndef _plDrawableSpans_h
#define _plDrawableSpans_h
-
+#include "hsAlignedAllocator.hpp"
#include "hsBitVector.h"
#include "hsTemplates.h"
#include "plDrawable.h"
@@ -132,7 +132,7 @@ class plDrawableSpans : public plDrawable
hsMatrix44 fLocalToWorld;
hsMatrix44 fWorldToLocal;
- std::vector fLocalToWorlds;
+ std::vector> fLocalToWorlds; // used in SIMD skinning
std::vector fWorldToLocals;
std::vector fLocalToBones;
diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
index ca51e86d..db5a082a 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@@ -10619,9 +10619,9 @@ inline void inlTESTPOINT(const hsPoint3& destP,
#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
__m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
ALIGN(16) float hack[4]; \
- mc0 = _mm_loadu_ps(xfm.fMap[0]); \
- mc1 = _mm_loadu_ps(xfm.fMap[1]); \
- mc2 = _mm_loadu_ps(xfm.fMap[2]); \
+ mc0 = _mm_load_ps(xfm.fMap[0]); /* aligned loads: each xfm.fMap row must be 16-byte aligned */ \
+ mc1 = _mm_load_ps(xfm.fMap[1]); \
+ mc2 = _mm_load_ps(xfm.fMap[2]); \
mwt = _mm_set_ps1(wgt);
#define MATRIXMULTPOINTADD_SSE3(dst, src) \
msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \