Browse Source

Aligned matrix loading

Adam Johnson 12 years ago
parent
commit
48f232c3a0
  1. 1
      Sources/Plasma/CoreLib/CMakeLists.txt
  2. 129
      Sources/Plasma/CoreLib/hsAlignedAllocator.hpp
  3. 6
      Sources/Plasma/CoreLib/hsMatrix44.h
  4. 4
      Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h
  5. 6
      Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

1
Sources/Plasma/CoreLib/CMakeLists.txt

@ -74,6 +74,7 @@ endif(UNIX)
set(CoreLib_HEADERS set(CoreLib_HEADERS
HeadSpin.h HeadSpin.h
hsAlignedAllocator.hpp
hsBiExpander.h hsBiExpander.h
hsBitVector.h hsBitVector.h
hsBounds.h hsBounds.h

129
Sources/Plasma/CoreLib/hsAlignedAllocator.hpp

@ -0,0 +1,129 @@
/*==LICENSE==*
CyanWorlds.com Engine - MMOG client, server and tools
Copyright (C) 2011 Cyan Worlds, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Additional permissions under GNU GPL version 3 section 7
If you modify this Program, or any covered work, by linking or
combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
(or a modified version of those libraries),
containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
licensors of this Program grant you additional
permission to convey the resulting work. Corresponding Source for a
non-source form of such a combination shall include the source code for
the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
work.
You can contact Cyan Worlds, Inc. by email legal@cyan.com
or by snail mail at:
Cyan Worlds, Inc.
14617 N Newport Hwy
Mead, WA 99021
*==LICENSE==*/
#ifndef _HS_ALIGNED_ALLOCATOR_H
#define _HS_ALIGNED_ALLOCATOR_H
#include "HeadSpin.h"
/**
 * An aligned allocator for storing SIMD ready values in STL containers.
 *
 * Storage comes from _aligned_malloc when HS_BUILD_FOR_WIN32 is defined and
 * from posix_memalign otherwise; every block handed out is aligned to
 * ALIGNMENT bytes. The allocator carries no state, so all instances compare
 * equal.
 *
 * \tparam T         Element type stored by the container.
 * \tparam ALIGNMENT Requested byte alignment; must be a power of two.
 * \remarks Based on https://gist.github.com/donny-dont/1471329
 */
template<class T, size_t ALIGNMENT=16>
class hsAlignedAllocator
{
    static_assert(ALIGNMENT != 0 && (ALIGNMENT & (ALIGNMENT - 1)) == 0,
                  "hsAlignedAllocator: ALIGNMENT must be a power of two");

    // There is no state to copy, so assignment is intentionally unavailable.
    hsAlignedAllocator& operator=(const hsAlignedAllocator&) = delete;

public:
    /**
     * Rebinds this allocator to another element type while keeping the same
     * alignment. Only the element type is a parameter here: re-declaring
     * ALIGNMENT would shadow the enclosing template parameter.
     */
    template <typename U>
    struct rebind
    {
        typedef hsAlignedAllocator<U, ALIGNMENT> other;
    };

    typedef T* pointer;
    typedef const T* const_pointer;
    typedef T& reference;
    typedef const T& const_reference;
    typedef T value_type;
    typedef size_t size_type;
    typedef ptrdiff_t difference_type;

    hsAlignedAllocator() { }
    hsAlignedAllocator(const hsAlignedAllocator&) { }
    template <typename U> hsAlignedAllocator(const hsAlignedAllocator<U, ALIGNMENT>&) { }
    ~hsAlignedAllocator() { }

    /** Returns the address of \a r. */
    pointer address(reference r) const { return &r; }
    const_pointer address(const_reference r) const { return &r; }

    /**
     * Allocates uninitialised, ALIGNMENT-aligned storage for \a size elements.
     *
     * \param size Number of elements (not bytes) to reserve.
     * \return Pointer to the storage, or nullptr when \a size is zero.
     * \throws std::length_error if \a size exceeds max_size().
     * \throws std::bad_alloc if the underlying aligned allocation fails.
     */
    pointer allocate(size_type size, const_pointer /*hint*/ = nullptr)
    {
        if (size == 0)
            return nullptr;

        // Guards the multiplication below against wrapping around.
        if (size > max_size())
            throw std::length_error("integer overflow");

        const size_type bytes = size * sizeof(value_type);

#ifdef HS_BUILD_FOR_WIN32
        void* ptr = _aligned_malloc(bytes, ALIGNMENT);
#else
        void* ptr = nullptr;
        // posix_memalign reports failure through its return value, so the
        // out-parameter is only trusted when the call succeeded.
        if (posix_memalign(&ptr, ALIGNMENT, bytes) != 0)
            ptr = nullptr;
#endif // HS_BUILD_FOR_WIN32

        if (!ptr)
            throw std::bad_alloc();

        return static_cast<pointer>(ptr);
    }

    /** Copy-constructs a value_type from \a t in the raw storage at \a p. */
    void construct(T* const p, const_reference t) const
    {
        void * const pv = static_cast<void *>(p);
        new (pv) value_type(t);
    }

    /**
     * Releases storage previously returned by allocate().
     * The element count is not needed by either platform's free routine.
     */
    void deallocate(pointer ptr, size_type /*size*/)
    {
#ifdef HS_BUILD_FOR_WIN32
        _aligned_free(ptr);
#else
        free(ptr);
#endif // HS_BUILD_FOR_WIN32
    }

    /** Runs the destructor of the object at \a p without freeing its storage. */
    void destroy(T* const p) const
    {
        p->~T();
    }

    /** Largest element count whose size in bytes still fits in size_t. */
    size_type max_size() const
    {
        return static_cast<size_t>(-1) / sizeof(value_type);
    }

    /** Instances are interchangeable, so they always compare equal. */
    bool operator==(const hsAlignedAllocator&) const { return true; }
    bool operator!=(const hsAlignedAllocator&) const { return false; }
};
#endif // _HS_ALIGNED_ALLOCATOR_H

6
Sources/Plasma/CoreLib/hsMatrix44.h

@ -61,7 +61,11 @@ struct hsMatrix44 {
kView kView
}; };
float fMap[4][4]; float fMap[4][4];
uint32_t fFlags; union
{
uint8_t alignment[16];
uint32_t fFlags;
};
hsMatrix44() : fFlags(0) {} hsMatrix44() : fFlags(0) {}
hsMatrix44(const hsScalarTriple &translate, const hsQuat &rotate); hsMatrix44(const hsScalarTriple &translate, const hsQuat &rotate);

4
Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.h

@ -63,7 +63,7 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#ifndef _plDrawableSpans_h #ifndef _plDrawableSpans_h
#define _plDrawableSpans_h #define _plDrawableSpans_h
#include "hsAlignedAllocator.hpp"
#include "hsBitVector.h" #include "hsBitVector.h"
#include "hsTemplates.h" #include "hsTemplates.h"
#include "plDrawable.h" #include "plDrawable.h"
@ -132,7 +132,7 @@ class plDrawableSpans : public plDrawable
hsMatrix44 fLocalToWorld; hsMatrix44 fLocalToWorld;
hsMatrix44 fWorldToLocal; hsMatrix44 fWorldToLocal;
std::vector<hsMatrix44> fLocalToWorlds; std::vector<hsMatrix44, hsAlignedAllocator<hsMatrix44>> fLocalToWorlds; // used in SIMD skinning
std::vector<hsMatrix44> fWorldToLocals; std::vector<hsMatrix44> fWorldToLocals;
std::vector<hsMatrix44> fLocalToBones; std::vector<hsMatrix44> fLocalToBones;

6
Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

@ -10619,9 +10619,9 @@ inline void inlTESTPOINT(const hsPoint3& destP,
#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \ #define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
__m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \ __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
ALIGN(16) float hack[4]; \ ALIGN(16) float hack[4]; \
mc0 = _mm_loadu_ps(xfm.fMap[0]); \ mc0 = _mm_load_ps(xfm.fMap[0]); \
mc1 = _mm_loadu_ps(xfm.fMap[1]); \ mc1 = _mm_load_ps(xfm.fMap[1]); \
mc2 = _mm_loadu_ps(xfm.fMap[2]); \ mc2 = _mm_load_ps(xfm.fMap[2]); \
mwt = _mm_set_ps1(wgt); mwt = _mm_set_ps1(wgt);
#define MATRIXMULTPOINTADD_SSE3(dst, src) \ #define MATRIXMULTPOINTADD_SSE3(dst, src) \
msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \ msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \

Loading…
Cancel
Save