diff --git a/Sources/Plasma/CoreLib/CMakeLists.txt b/Sources/Plasma/CoreLib/CMakeLists.txt
index 3c23afba..cc169b75 100644
--- a/Sources/Plasma/CoreLib/CMakeLists.txt
+++ b/Sources/Plasma/CoreLib/CMakeLists.txt
@@ -74,6 +74,7 @@ endif(UNIX)
 
 set(CoreLib_HEADERS
     HeadSpin.h
+    hsAlignedAllocator.hpp
     hsBiExpander.h
     hsBitVector.h
     hsBounds.h
diff --git a/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp
new file mode 100644
index 00000000..b3c2fcea
--- /dev/null
+++ b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp
@@ -0,0 +1,129 @@
+/*==LICENSE==*
+
+CyanWorlds.com Engine - MMOG client, server and tools
+Copyright (C) 2011  Cyan Worlds, Inc.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permissions under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
+NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
+JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
+(or a modified version of those libraries),
+containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
+PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
+JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
+licensors of this Program grant you additional
+permission to convey the resulting work. Corresponding Source for a
+non-source form of such a combination shall include the source code for
+the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
+work.
+
+You can contact Cyan Worlds, Inc. by email legal@cyan.com
+ or by snail mail at:
+      Cyan Worlds, Inc.
+      14617 N Newport Hwy
+      Mead, WA   99021
+
+*==LICENSE==*/
+
+#ifndef _HS_ALIGNED_ALLOCATOR_H
+#define _HS_ALIGNED_ALLOCATOR_H
+
+#include "HeadSpin.h"
+
+/**
+ * An aligned allocator for storing SIMD ready values in STL containers.
+ * Stateless: storage comes from _aligned_malloc() when HS_BUILD_FOR_WIN32 is
+ * defined and from posix_memalign() otherwise, aligned to ALIGNMENT bytes.
+ * \remarks Based on https://gist.github.com/donny-dont/1471329
+ */
+template<class T, size_t ALIGNMENT=16>
+class hsAlignedAllocator
+{
+    // Assignment stays private; it returns *this so the body is well-formed.
+    hsAlignedAllocator& operator=(const hsAlignedAllocator&) { return *this; }
+
+public:
+    // Reuses the enclosing ALIGNMENT; redeclaring it here would shadow it.
+    template <typename U>
+    struct rebind { typedef hsAlignedAllocator<U, ALIGNMENT> other; };
+
+    typedef T* pointer;
+    typedef const T* const_pointer;
+    typedef T& reference;
+    typedef const T& const_reference;
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef ptrdiff_t difference_type;
+
+    hsAlignedAllocator() { }
+    hsAlignedAllocator(const hsAlignedAllocator&) { }
+    template <typename U> hsAlignedAllocator(const hsAlignedAllocator<U, ALIGNMENT>&) { }
+    ~hsAlignedAllocator() { }
+
+    pointer address(reference r) const { return &r; }
+    const_pointer address(const_reference r) const { return &r; }
+
+    /** Allocates storage for \a size elements; returns nullptr when size is 0.
+     *  \throws std::length_error if \a size exceeds max_size()
+     *  \throws std::bad_alloc if the underlying allocation fails */
+    pointer allocate(size_type size, const_pointer hint=nullptr)
+    {
+        if (size == 0)
+            return nullptr;
+        if (size > max_size())
+            throw std::length_error("integer overflow");
+
+#ifdef HS_BUILD_FOR_WIN32
+        void* ptr = _aligned_malloc(size * sizeof(value_type), ALIGNMENT);
+#else
+        void* ptr = nullptr;
+        // Failure is reported via the return code, not via ptr, so check it.
+        if (posix_memalign(&ptr, ALIGNMENT, size * sizeof(value_type)) != 0)
+            ptr = nullptr;
+#endif // HS_BUILD_FOR_WIN32
+
+        if (!ptr)
+            throw std::bad_alloc();
+        return static_cast<pointer>(ptr);
+    }
+
+    /** Copy-constructs \a t into the raw storage at \a p. */
+    void construct(T* const p, const_reference t) const
+    {
+        new (static_cast<void *>(p)) value_type(t);
+    }
+
+    /** Releases storage obtained from allocate(); \a size is unused. */
+    void deallocate(pointer ptr, size_type size)
+    {
+#ifdef HS_BUILD_FOR_WIN32
+        _aligned_free(ptr);
+#else
+        free(ptr);
+#endif // HS_BUILD_FOR_WIN32
+    }
+
+    void destroy(T* const p) const { p->~T(); }
+    size_type max_size() const { return static_cast<size_t>(-1) / sizeof(value_type); }
+
+    // Stateless, so every instance compares equal to every other.
+    bool operator==(const hsAlignedAllocator&) const { return true; }
+    bool operator!=(const hsAlignedAllocator&) const { return false; }
+};
+
+#endif // _HS_ALIGNED_ALLOCATOR_H
diff --git a/Sources/Plasma/CoreLib/hsMatrix44.h b/Sources/Plasma/CoreLib/hsMatrix44.h
index 34ee618a..346a3ed7 100644
--- a/Sources/Plasma/CoreLib/hsMatrix44.h
+++ b/Sources/Plasma/CoreLib/hsMatrix44.h
@@ -61,7 +61,11 @@ struct hsMatrix44 {
         kView
     };
     float            fMap[4][4];
-    uint32_t         fFlags;
+    union
+    {
+        uint8_t      alignment[16];
+        uint32_t     fFlags;
+    };
 
     hsMatrix44() : fFlags(0) {}
     hsMatrix44(const hsScalarTriple &translate, const hsQuat &rotate);
diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
index 603f793b..c8301139 100644
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
@@ -1045,10 +1045,10 @@ void    plDrawableSpans::Read( hsStream* s, hsResMgr* mgr )
 
     /// Read in the matrix palette (if any)
     count = s->ReadLE32();
-    fLocalToWorlds.SetCount(count);
-    fWorldToLocals.SetCount(count);
-    fLocalToBones.SetCount(count);
-    fBoneToLocals.SetCount(count);
+    fLocalToWorlds.resize(count);
+    fWorldToLocals.resize(count);
+    fLocalToBones.resize(count);
+    fBoneToLocals.resize(count);
     for( i = 0; i < count; i++ )
     {
         fLocalToWorlds[i].Read(s);
@@ -2209,16 +2209,12 @@ uint32_t  plDrawableSpans::AppendDIMatrixSpans(int n)
     if( fNeedCleanup )
         IRemoveGarbage();
 
-    uint32_t baseIdx = fLocalToWorlds.GetCount();
-    fLocalToWorlds.Expand(baseIdx + n);
-    fLocalToWorlds.SetCount(baseIdx + n);
-    fWorldToLocals.Expand(baseIdx + n);
-    fWorldToLocals.SetCount(baseIdx + n);
+    uint32_t baseIdx = fLocalToWorlds.size();
+    fLocalToWorlds.resize(baseIdx + n);
+    fWorldToLocals.resize(baseIdx + n);
 
-    fLocalToBones.Expand(baseIdx + n);
-    fLocalToBones.SetCount(baseIdx + n);
-    fBoneToLocals.Expand(baseIdx + n);
-    fBoneToLocals.SetCount(baseIdx + n);
+    fLocalToBones.resize(baseIdx + n);
+    fBoneToLocals.resize(baseIdx + n);
 
     int i;
     for( i = baseIdx; i < baseIdx + n; i++ )
@@ -2267,7 +2263,7 @@ uint32_t plDrawableSpans::FindBoneBaseMatrix(const hsTArray<hsMatrix44>& initL2B
         // runtime, a sharable bone pallete won't be found by scanning fSpans.
         // We have to do a larger search through all bone matrices.
         int i;
-        for( i = 0; i + initL2B.GetCount() < fLocalToBones.GetCount(); i++ )
+        for( i = 0; i + initL2B.GetCount() < fLocalToBones.size(); i++ )
         {
             int j;
             for( j = 0; j < initL2B.GetCount(); j++ )
@@ -2894,7 +2890,7 @@ void    plDrawableSpans::ICleanupMatrices()
         }
     }
 
-    for( j = 0; j < fLocalToWorlds.GetCount(); j++ )
+    for( j = 0; j < fLocalToWorlds.size(); j++ )
     {
         if( !usedMatrices.IsBitSet(j) )
         {
@@ -2910,7 +2906,7 @@ void    plDrawableSpans::ICleanupMatrices()
                     }
                 }
             }
-            for( i = j+1; i < fLocalToWorlds.GetCount(); i++ )
+            for( i = j+1; i < fLocalToWorlds.size(); i++ )
             {
                 fLocalToWorlds[i] = fLocalToWorlds[i-1];
                 fWorldToLocals[i] = fWorldToLocals[i-1];
diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
index 58c054b2..796496c5 100644
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
@@ -63,13 +63,14 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #ifndef _plDrawableSpans_h
 #define _plDrawableSpans_h
 
-
+#include "hsAlignedAllocator.hpp"
 #include "hsBitVector.h"
 #include "hsTemplates.h"
 #include "plDrawable.h"
 #include "hsBounds.h"
 #include "hsMatrix44.h"
 #include "plSpanTypes.h"
+#include <vector>
 
 class plPipeline;
 class plMessage;
@@ -131,11 +132,11 @@ class plDrawableSpans : public plDrawable
         hsMatrix44          fLocalToWorld;
         hsMatrix44          fWorldToLocal;
 
-        hsTArray<hsMatrix44>    fLocalToWorlds;
-        hsTArray<hsMatrix44>    fWorldToLocals;
+        std::vector<hsMatrix44, hsAlignedAllocator<hsMatrix44>> fLocalToWorlds; // used in SIMD skinning
+        std::vector<hsMatrix44> fWorldToLocals;
 
-        hsTArray<hsMatrix44>    fLocalToBones;
-        hsTArray<hsMatrix44>    fBoneToLocals;
+        std::vector<hsMatrix44> fLocalToBones;
+        std::vector<hsMatrix44> fBoneToLocals;
 
         hsTArray<hsGMaterial *> fMaterials;
 
@@ -283,7 +284,7 @@ class plDrawableSpans : public plDrawable
         virtual uint32_t                     GetNumSpans( void ) const { return fSpans.GetCount(); }
         virtual const hsTArray<plSpan *>    &GetSpanArray( void ) const { return fSpans; }
 
-        hsMatrix44* GetMatrixPalette(int baseMatrix) const { return &fLocalToWorlds[baseMatrix]; }
+        hsMatrix44* GetMatrixPalette(int baseMatrix) const { return const_cast<hsMatrix44*>(&fLocalToWorlds[baseMatrix]); }
         const hsMatrix44& GetPaletteMatrix(int i) const { return fLocalToWorlds[i]; }
         void SetInitialBone(int i, const hsMatrix44& l2b, const hsMatrix44& b2l);
 
diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp
index 27078fb9..0abb0d8f 100644
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpansExport.cpp
@@ -163,7 +163,7 @@ void    plDrawableSpans::Write( hsStream* s, hsResMgr* mgr )
             fSourceSpans[ i ]->Write( s );
     }
 
-    count = fLocalToWorlds.GetCount();
+    count = fLocalToWorlds.size();
     s->WriteLE32(count);
     for( i = 0; i < count; i++ )
     {
diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
index ca51e86d..d4ad28d4 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@@ -203,10 +203,10 @@ void plReleaseObject(IUnknown* x)
 //// Local Static Stuff ///////////////////////////////////////////////////////
 
 /// Macros for getting/setting data in a D3D vertex buffer
-inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple& point )
+inline uint8_t* inlStuffPoint( uint8_t* ptr, const hsScalarTriple* point )
 {
     register float* dst = (float*)ptr;
-    register const float* src = (float*)&point.fX;
+    register const float* src = (float*)&point->fX;
     *dst++ = *src++;
     *dst++ = *src++;
     *dst++ = *src++;
@@ -217,10 +217,10 @@ inline uint8_t* inlStuffUInt32( uint8_t* ptr, const uint32_t uint )
     *(uint32_t*)ptr = uint;
     return ptr + sizeof(uint);
 }
-inline uint8_t* inlExtractPoint( const uint8_t* ptr, const hsScalarTriple& pt )
+inline uint8_t* inlExtractPoint( const uint8_t* ptr, hsScalarTriple* pt )
 {
     register const float* src = (float*)ptr;
-    register float* dst = (float*)&pt.fX;
+    register float* dst = (float*)&pt->fX;
     *dst++ = *src++;
     *dst++ = *src++;
     *dst++ = *src++;
@@ -10617,14 +10617,13 @@ inline void inlTESTPOINT(const hsPoint3& destP,
 // SSE3 version
 #ifdef HS_SSE3
 #define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
-        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
-        ALIGN(16) float hack[4]; \
-        mc0 = _mm_loadu_ps(xfm.fMap[0]); \
-        mc1 = _mm_loadu_ps(xfm.fMap[1]); \
-        mc2 = _mm_loadu_ps(xfm.fMap[2]); \
+        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2, _dst; \
+        mc0 = _mm_load_ps(xfm.fMap[0]); \
+        mc1 = _mm_load_ps(xfm.fMap[1]); \
+        mc2 = _mm_load_ps(xfm.fMap[2]); \
         mwt = _mm_set_ps1(wgt);
-#define MATRIXMULTPOINTADD_SSE3(dst, src) \
-        msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
+#define MATRIXMULTBUFADD_SSE3(dst, src) \
+        msr = _mm_load_ps(src); \
         _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
         _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
         _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
@@ -10632,10 +10631,9 @@ inline void inlTESTPOINT(const hsPoint3& destP,
         hbuf1 = _mm_hadd_ps(_x, _y); \
         hbuf2 = _mm_hadd_ps(_z, _z); \
         hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-        _mm_store_ps(hack, hbuf1); \
-        dst.fX += hack[0]; \
-        dst.fY += hack[1]; \
-        dst.fZ += hack[2];
+        _dst = _mm_load_ps(dst); \
+        _dst = _mm_add_ps(_dst, hbuf1); \
+        _mm_store_ps(dst, _dst);
 #define MATRIXMULTVECTORADD_SSE3(dst, src) \
         msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
         _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
@@ -10645,10 +10643,13 @@ inline void inlTESTPOINT(const hsPoint3& destP,
         hbuf1 = _mm_hadd_ps(_x, _y); \
         hbuf2 = _mm_hadd_ps(_z, _z); \
         hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-        _mm_store_ps(hack, hbuf1); \
-        dst.fX += hack[0]; \
-        dst.fY += hack[1]; \
-        dst.fZ += hack[2];
+        { \
+            ALIGN(16) float hack[4]; \
+            _mm_store_ps(hack, hbuf1); \
+            dst.fX += hack[0]; \
+            dst.fY += hack[1]; \
+            dst.fZ += hack[2]; \
+        }
 #endif
 
 // CPU-optimized functions requiring dispatch
@@ -10656,11 +10657,17 @@ hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_ve
 
 // Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
 #define BLENDVERTSTART \
-    uint8_t     numUVs, numWeights; \
-    uint32_t    i, j, indices, color, specColor, uvChanSize; \
-    float       weights[ 4 ], weightSum; \
-    hsPoint3    pt, tempPt, destPt; \
-    hsVector3   vec, tempNorm, destNorm; \
+    ALIGN(16) float pt_buf[] = { 0.f, 0.f, 0.f, 1.f }; \
+    ALIGN(16) float vec_buf[] = { 0.f, 0.f, 0.f, 0.f }; \
+    ALIGN(16) float destPt_buf[4], destNorm_buf[4]; \
+    hsPoint3*       pt = reinterpret_cast<hsPoint3*>(pt_buf); \
+    hsPoint3*       destPt = reinterpret_cast<hsPoint3*>(destPt_buf); \
+    hsVector3*      vec = reinterpret_cast<hsVector3*>(vec_buf); \
+    hsVector3*      destNorm = reinterpret_cast<hsVector3*>(destNorm_buf); \
+    \
+    uint8_t         numUVs, numWeights; \
+    uint32_t        i, j, indices, color, specColor, uvChanSize; \
+    float           weights[ 4 ], weightSum; \
     \
     /* Get some counts */\
     switch( format & plGBufferGroup::kSkinWeightMask ) \
@@ -10706,8 +10713,9 @@ hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_ve
             src = inlExtractUInt32( src, specColor ); \
             \
             /* Blend */\
-            destPt.Set( 0, 0, 0 ); \
-            destNorm.Set( 0, 0, 0 ); \
+            destPt->Set(0.f, 0.f, 0.f); \
+            destPt_buf[3] = 1.f; \
+            destNorm->Set(0.f, 0.f, 0.f); \
             for( j = 0; j < numWeights + 1; j++ ) \
             { \
                 if( weights[ j ] ) \
@@ -10772,15 +10780,16 @@ hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_ve
             uint8_t k; \
             for( k = 0; k < numUVs; k++ ) \
             { \
-                src = inlExtractPoint( src, srcUVWs[k] ); \
+                src = inlExtractPoint( src, &srcUVWs[k] ); \
             } \
             memcpy( dstUVWs, srcUVWs, uvChanSize); \
             dstUVWs[loChan].Set(0,0,0); \
             dstUVWs[hiChan].Set(0,0,0); \
             \
             /* Blend */\
-            destPt.Set( 0, 0, 0 ); \
-            destNorm.Set( 0, 0, 0 ); \
+            destPt->Set(0.f, 0.f, 0.f); \
+            destPt_buf[3] = 1.f; \
+            destNorm->Set(0.f, 0.f, 0.f); \
             for( j = 0; j < numWeights + 1; j++ ) \
             { \
                 if( weights[ j ] ) \
@@ -10823,13 +10832,13 @@ void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
     BLENDVERTSTART
                     MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
 
-                    MATRIXMULTPOINTADD_FPU(destPt, pt);
-                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
+                    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
+                    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
     BLENDVERTMID
                     MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
 
-                    MATRIXMULTPOINTADD_FPU(destPt, pt);
-                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
+                    MATRIXMULTPOINTADD_FPU((*destPt), (*pt));
+                    MATRIXMULTVECTORADD_FPU((*destNorm), (*vec));
                     MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
                     MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
 
@@ -10846,13 +10855,13 @@ void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
     BLENDVERTSTART
                     MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
 
-                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
-                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+                    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
+                    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
     BLENDVERTMID
                     MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
 
-                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
-                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+                    MATRIXMULTBUFADD_SSE3(destPt_buf, pt_buf);
+                    MATRIXMULTBUFADD_SSE3(destNorm_buf, vec_buf);
                     MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
                     MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
     BLENDVERTEND