diff --git a/Sources/Plasma/CoreLib/CMakeLists.txt b/Sources/Plasma/CoreLib/CMakeLists.txt index 3c23afba..cc169b75 100644 --- a/Sources/Plasma/CoreLib/CMakeLists.txt +++ b/Sources/Plasma/CoreLib/CMakeLists.txt @@ -74,6 +74,7 @@ endif(UNIX) set(CoreLib_HEADERS HeadSpin.h + hsAlignedAllocator.hpp hsBiExpander.h hsBitVector.h hsBounds.h diff --git a/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp new file mode 100644 index 00000000..b3c2fcea --- /dev/null +++ b/Sources/Plasma/CoreLib/hsAlignedAllocator.hpp @@ -0,0 +1,129 @@ +/*==LICENSE==* + +CyanWorlds.com Engine - MMOG client, server and tools +Copyright (C) 2011 Cyan Worlds, Inc. + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. + +Additional permissions under GNU GPL version 3 section 7 + +If you modify this Program, or any covered work, by linking or +combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK, +NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent +JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK +(or a modified version of those libraries), +containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA, +PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG +JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the +licensors of this Program grant you additional +permission to convey the resulting work. 
/**
 * An aligned allocator for storing SIMD ready values in STL containers.
 *
 * Storage comes from _aligned_malloc when HS_BUILD_FOR_WIN32 is defined and
 * from posix_memalign otherwise; ALIGNMENT is forwarded unchanged to that
 * call. The class has no data members, so every instance compares equal and
 * a block obtained from one instance may be released through another.
 *
 * \tparam T          element type handed out by the allocator
 * \tparam ALIGNMENT  requested alignment in bytes. It must satisfy the
 *                    constraints of the platform call (posix_memalign wants
 *                    a power of two that is a multiple of sizeof(void*));
 *                    an unsupported value makes allocate() throw.
 * \remarks Based on https://gist.github.com/donny-dont/1471329
 */
template <typename T, size_t ALIGNMENT>
class hsAlignedAllocator
{
    // Assignment is deliberately kept private: the allocator is stateless, so
    // nothing outside the class needs it. The body returns *this so that the
    // function is well defined instead of flowing off the end.
    hsAlignedAllocator& operator=(const hsAlignedAllocator&) { return *this; }

public:
    /** Yields the same allocator (same ALIGNMENT) for another element type. */
    template <typename U>
    struct rebind
    {
        typedef hsAlignedAllocator<U, ALIGNMENT> other;
    };

    typedef T* pointer;
    typedef const T* const_pointer;
    typedef T& reference;
    typedef const T& const_reference;
    typedef T value_type;
    typedef size_t size_type;
    typedef ptrdiff_t difference_type;

    hsAlignedAllocator() { }
    hsAlignedAllocator(const hsAlignedAllocator&) { }
    /** Converting constructor used by containers that rebind the allocator. */
    template <typename U>
    hsAlignedAllocator(const hsAlignedAllocator<U, ALIGNMENT>&) { }
    ~hsAlignedAllocator() { }

    pointer address(reference r) const { return &r; }
    const_pointer address(const_reference r) const { return &r; }

    /**
     * Allocates uninitialised storage for \a size objects of type T.
     *
     * \param size  number of elements (not bytes) to make room for
     * \param hint  accepted for interface compatibility; not used
     * \return pointer to the aligned storage, or nullptr when \a size is 0
     * \throws std::length_error if \a size exceeds max_size()
     * \throws std::bad_alloc    if the platform allocator fails
     */
    pointer allocate(size_type size, const_pointer hint=nullptr)
    {
        (void)hint;

        if (size == 0)
            return nullptr;
        // Guard the size * sizeof(value_type) multiplication below.
        if (size > max_size())
            throw std::length_error("integer overflow");

#ifdef HS_BUILD_FOR_WIN32
        void* ptr = _aligned_malloc(size * sizeof(value_type), ALIGNMENT);
#else
        void* ptr = nullptr;
        // posix_memalign reports failure through its return value, and the
        // out-pointer is not guaranteed to be usable in that case, so reset it.
        if (posix_memalign(&ptr, ALIGNMENT, size * sizeof(value_type)) != 0)
            ptr = nullptr;
#endif // HS_BUILD_FOR_WIN32

        if (!ptr)
            throw std::bad_alloc();
        return static_cast<pointer>(ptr);
    }

    /** Copy-constructs a T from \a t in the raw storage at \a p. */
    void construct(T* const p, const_reference t) const
    {
        void * const pv = static_cast<void*>(p);
        new (pv) value_type(t);
    }

    /**
     * Releases storage previously returned by allocate().
     *
     * \param ptr   block to release
     * \param size  element count of the block; not used
     */
    void deallocate(pointer ptr, size_type size)
    {
        (void)size;
#ifdef HS_BUILD_FOR_WIN32
        _aligned_free(ptr);
#else
        free(ptr);
#endif // HS_BUILD_FOR_WIN32
    }

    /** Runs the destructor of the object at \a p without freeing its storage. */
    void destroy(T* const p) const
    {
        p->~T();
    }

    /** Largest element count whose byte size still fits in size_type. */
    size_type max_size() const
    {
        return static_cast<size_type>(-1) / sizeof(value_type);
    }

    /** Always true: the allocator carries no state. */
    bool operator==(const hsAlignedAllocator& other) const { return true; }
    /** Always false; provided as the counterpart of operator==. */
    bool operator!=(const hsAlignedAllocator& other) const { return !(*this == other); }
};
by email legal@cyan.com #ifndef _plDrawableSpans_h #define _plDrawableSpans_h - +#include "hsAlignedAllocator.hpp" #include "hsBitVector.h" #include "hsTemplates.h" #include "plDrawable.h" @@ -132,7 +132,7 @@ class plDrawableSpans : public plDrawable hsMatrix44 fLocalToWorld; hsMatrix44 fWorldToLocal; - std::vector fLocalToWorlds; + std::vector> fLocalToWorlds; // used in SIMD skinning std::vector fWorldToLocals; std::vector fLocalToBones; diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp index ca51e86d..db5a082a 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp @@ -10619,9 +10619,9 @@ inline void inlTESTPOINT(const hsPoint3& destP, #define MATRIXMULTBEGIN_SSE3(xfm, wgt) \ __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \ ALIGN(16) float hack[4]; \ - mc0 = _mm_loadu_ps(xfm.fMap[0]); \ - mc1 = _mm_loadu_ps(xfm.fMap[1]); \ - mc2 = _mm_loadu_ps(xfm.fMap[2]); \ + mc0 = _mm_load_ps(xfm.fMap[0]); \ + mc1 = _mm_load_ps(xfm.fMap[1]); \ + mc2 = _mm_load_ps(xfm.fMap[2]); \ mwt = _mm_set_ps1(wgt); #define MATRIXMULTPOINTADD_SSE3(dst, src) \ msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \