Browse Source

Merge pull request #445 from zrax/avx2

Support AVX2
Adam Johnson 10 years ago
parent
commit
f83828d2f1
  1. 85
      Sources/Plasma/CoreLib/hsCpuID.cpp
  2. 32
      Sources/Plasma/CoreLib/hsCpuID.h
  3. 29
      Sources/Plasma/CoreLib/hsMatrix44.cpp
  4. 16
      Sources/Plasma/CoreLib/hsMatrix44.h
  5. 11
      Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.cpp
  6. 6
      Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.h

85
Sources/Plasma/CoreLib/hsCpuID.cpp

@ -59,16 +59,31 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include "hsCpuID.h" #include "hsCpuID.h"
hsCpuId::hsCpuId() { hsCpuId::hsCpuId() {
const unsigned int sse1_flag = 1<<25; enum : unsigned int {
const unsigned int sse2_flag = 1<<26; // EAX=1; EDX=:
const unsigned int sse3_flag = 1<<0; sse1_flag = 1U<<25,
const unsigned int ssse3_flag = 1<<9; sse2_flag = 1U<<26,
const unsigned int sse41_flag = 1<<19;
const unsigned int sse42_flag = 1<<20; // EAX=1; ECX=:
const unsigned int avx_flag = 1<<28; sse3_flag = 1U<<0,
ssse3_flag = 1U<<9,
unsigned int ax = 0, bx = 0, cx = 0, dx = 0; sse41_flag = 1U<<19,
sse42_flag = 1U<<20,
avx_flag = 1U<<28,
// EAX=7; ECX=0; EBX=:
avx2_flag = 1U<<5
};
union RegSet {
struct {
unsigned int eax, ebx, ecx, edx;
};
int array[4];
};
RegSet CPUInfo_Features = { 0, 0, 0, 0 };
RegSet CPUInfo_Ext = { 0, 0, 0, 0 };
/** /**
* Portable implementation of CPUID, successfully tested with: * Portable implementation of CPUID, successfully tested with:
@ -80,33 +95,33 @@ hsCpuId::hsCpuId() {
* *
* Ref: http://primesieve.googlecode.com/svn-history/r388/trunk/soe/cpuid.h * Ref: http://primesieve.googlecode.com/svn-history/r388/trunk/soe/cpuid.h
*/ */
#if defined(MSC_COMPATIBLE) #if defined(MSC_COMPATIBLE)
int CPUInfo[4] = {ax, bx, cx, dx}; __cpuid(CPUInfo_Features.array, 0);
__cpuid(CPUInfo, 0);
// check if the CPU supports the cpuid instruction.
// check if the CPU supports the cpuid instruction. if (CPUInfo_Features.eax != 0) {
if (CPUInfo[0] != 0) { __cpuid(CPUInfo_Features.array, 1);
__cpuid(CPUInfo, 1); __cpuid(CPUInfo_Ext.array, 7);
ax = CPUInfo[0]; }
bx = CPUInfo[1]; #elif defined(GCC_COMPATIBLE)
cx = CPUInfo[2]; __get_cpuid(1, &CPUInfo_Features.eax, &CPUInfo_Features.ebx,
dx = CPUInfo[3]; &CPUInfo_Features.ecx, &CPUInfo_Features.edx);
} __get_cpuid(7, &CPUInfo_Ext.eax, &CPUInfo_Ext.ebx,
#elif defined(GCC_COMPATIBLE) &CPUInfo_Ext.ecx, &CPUInfo_Ext.edx);
__get_cpuid(1, &ax, &bx, &cx, &dx); #endif
#endif
has_sse1 = (CPUInfo_Features.edx & sse1_flag) || false;
has_sse1 = (dx & sse1_flag) || false; has_sse2 = (CPUInfo_Features.edx & sse2_flag) || false;
has_sse2 = (dx & sse2_flag) || false; has_sse3 = (CPUInfo_Features.ecx & sse3_flag) || false;
has_sse3 = (cx & sse3_flag) || false; has_ssse3 = (CPUInfo_Features.ecx & ssse3_flag) || false;
has_ssse3 = (cx & ssse3_flag) || false; has_sse41 = (CPUInfo_Features.ecx & sse41_flag) || false;
has_sse41 = (cx & sse41_flag) || false; has_sse42 = (CPUInfo_Features.ecx & sse42_flag) || false;
has_sse42 = (cx & sse42_flag) || false; has_avx = (CPUInfo_Features.ecx & avx_flag) || false;
has_avx = (cx & avx_flag) || false; has_avx2 = (CPUInfo_Ext.ebx & avx2_flag) || false;
} }
const hsCpuId& hsCpuId::instance() const hsCpuId& hsCpuId::Instance()
{ {
static hsCpuId self; static hsCpuId self;
return self; return self;

32
Sources/Plasma/CoreLib/hsCpuID.h

@ -65,7 +65,8 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
// typedef float(*func_ptr)(); // typedef float(*func_ptr)();
// static hsFunctionDispatcher<func_ptr> my_func; // static hsFunctionDispatcher<func_ptr> my_func;
// //
// hsFunctionDispatcher<float::func_ptr> float::my_func(float::my_func_fpu, 0, 0, 0, 0, 0, 0, float::my_func_avx); // hsFunctionDispatcher<func_ptr> my_func(my_func_fpu, 0, 0, 0, 0, 0, 0, my_func_avx);
// my_func();
// //
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
@ -74,6 +75,12 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#ifndef hsCpuID_inc #ifndef hsCpuID_inc
#define hsCpuID_inc #define hsCpuID_inc
#if defined __AVX2__ || _MSC_VER >= 1600
#define HS_AVX2
#ifndef HS_SIMD_INCLUDE
# define HS_SIMD_INCLUDE "immintrin.h"
#endif
#endif
#if defined __AVX__ || _MSC_VER >= 1600 #if defined __AVX__ || _MSC_VER >= 1600
#define HS_AVX #define HS_AVX
#ifndef HS_SIMD_INCLUDE #ifndef HS_SIMD_INCLUDE
@ -126,16 +133,31 @@ struct hsCpuId {
bool has_sse41; bool has_sse41;
bool has_sse42; bool has_sse42;
bool has_avx; bool has_avx;
bool has_avx2;
hsCpuId(); hsCpuId();
static const hsCpuId& instance(); static const hsCpuId& Instance();
}; };
template <typename func_ptr> template <typename func_ptr>
struct hsFunctionDispatcher { struct hsCpuFunctionDispatcher {
hsFunctionDispatcher(func_ptr fpu, func_ptr sse1=0, func_ptr sse2=0, func_ptr sse3=0, func_ptr ssse3=0, func_ptr sse41=0, func_ptr sse42=0, func_ptr avx=0) { hsCpuFunctionDispatcher(func_ptr fpu,
func_ptr sse1 = nullptr,
func_ptr sse2 = nullptr,
func_ptr sse3 = nullptr,
func_ptr ssse3 = nullptr,
func_ptr sse41 = nullptr,
func_ptr sse42 = nullptr,
func_ptr avx = nullptr,
func_ptr avx2 = nullptr)
{
hsAssert(fpu, "FPU fallback function required."); hsAssert(fpu, "FPU fallback function required.");
const hsCpuId& cpu = hsCpuId::instance(); const hsCpuId& cpu = hsCpuId::Instance();
#ifdef HS_AVX2
if (cpu.has_avx2 && avx2) {
call = avx2;
} else
#endif
#ifdef HS_AVX #ifdef HS_AVX
if (cpu.has_avx && avx) { if (cpu.has_avx && avx) {
call = avx; call = avx;

29
Sources/Plasma/CoreLib/hsMatrix44.cpp

@ -57,9 +57,6 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
static hsMatrix44 myIdent = hsMatrix44().Reset(); static hsMatrix44 myIdent = hsMatrix44().Reset();
const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; } const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; }
// CPU-optimized functions requiring dispatch
hsFunctionDispatcher<hsMatrix44::mat_mult_ptr> hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3);
/* /*
For the rotation: For the rotation:
¦ 2 2 ¦ ¦ 2 2 ¦
@ -102,7 +99,7 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
rotate.QuatFromMatrix44(*this); rotate.QuatFromMatrix44(*this);
} }
hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b) static hsMatrix44 mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
{ {
hsMatrix44 c; hsMatrix44 c;
@ -153,7 +150,7 @@ hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
_mm_storeu_ps(c.fMap[i], xmm[1]); _mm_storeu_ps(c.fMap[i], xmm[1]);
#endif // HS_SSE3 #endif // HS_SSE3
hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b) static hsMatrix44 mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
{ {
hsMatrix44 c; hsMatrix44 c;
#ifdef HS_SSE3 #ifdef HS_SSE3
@ -201,8 +198,28 @@ hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
return c; return c;
} }
// CPU-optimized functions requiring dispatch
hsCpuFunctionDispatcher<hsMatrix44::mat_mult_ptr> hsMatrix44::mat_mult {
&mat_mult_fpu,
nullptr, // SSE1
nullptr, // SSE2
&mat_mult_sse3
};
hsPoint3 hsMatrix44::operator*(const hsPoint3& p) const
{
if (fFlags & hsMatrix44::kIsIdent)
return p;
hsPoint3 rVal;
rVal.fX = (p.fX * fMap[0][0]) + (p.fY * fMap[0][1]) + (p.fZ * fMap[0][2]) + fMap[0][3];
rVal.fY = (p.fX * fMap[1][0]) + (p.fY * fMap[1][1]) + (p.fZ * fMap[1][2]) + fMap[1][3];
rVal.fZ = (p.fX * fMap[2][0]) + (p.fY * fMap[2][1]) + (p.fZ * fMap[2][2]) + fMap[2][3];
return rVal;
}
hsVector3 hsMatrix44::operator*(const hsVector3& p) const hsVector3 hsMatrix44::operator*(const hsVector3& p) const
{ {
if( fFlags & hsMatrix44::kIsIdent ) if( fFlags & hsMatrix44::kIsIdent )
return p; return p;

16
Sources/Plasma/CoreLib/hsMatrix44.h

@ -133,17 +133,7 @@ struct hsMatrix44 {
void MakeZRotation(float radians); void MakeZRotation(float radians);
hsPoint3 operator*(const hsPoint3& p) const hsPoint3 operator*(const hsPoint3& p) const;
{
if( fFlags & hsMatrix44::kIsIdent )
return p;
hsPoint3 rVal;
rVal.fX = (p.fX * fMap[0][0]) + (p.fY * fMap[0][1]) + (p.fZ * fMap[0][2]) + fMap[0][3];
rVal.fY = (p.fX * fMap[1][0]) + (p.fY * fMap[1][1]) + (p.fZ * fMap[1][2]) + fMap[1][3];
rVal.fZ = (p.fX * fMap[2][0]) + (p.fY * fMap[2][1]) + (p.fZ * fMap[2][2]) + fMap[2][3];
return rVal;
}
hsVector3 operator*(const hsVector3& p) const; hsVector3 operator*(const hsVector3& p) const;
hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); } hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); }
@ -160,9 +150,7 @@ struct hsMatrix44 {
// CPU-optimized functions // CPU-optimized functions
typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&); typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&);
static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&); static hsCpuFunctionDispatcher<mat_mult_ptr> mat_mult;
static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&);
static hsFunctionDispatcher<mat_mult_ptr> mat_mult;
}; };
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////

11
Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.cpp

@ -10714,9 +10714,14 @@ static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMat
} }
// CPU-optimized functions requiring dispatch // CPU-optimized functions requiring dispatch
hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer( hsCpuFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer {
IBlendVertBuffer<ISkinVertexFPU>, 0, 0, IBlendVertBuffer<ISkinVertexSSE3>, 0, &IBlendVertBuffer<ISkinVertexFPU>,
IBlendVertBuffer<ISkinVertexSSE41>); nullptr, // SSE1
nullptr, // SSE2
&IBlendVertBuffer<ISkinVertexSSE3>,
nullptr, // SSSE3
&IBlendVertBuffer<ISkinVertexSSE41>
};
// ISetPipeConsts ////////////////////////////////////////////////////////////////// // ISetPipeConsts //////////////////////////////////////////////////////////////////
// A shader can request that the pipeline fill in certain constants that are indeterminate // A shader can request that the pipeline fill in certain constants that are indeterminate

6
Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.h

@ -798,8 +798,10 @@ public:
// CPU-optimized functions // CPU-optimized functions
protected: protected:
typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *,
static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer; uint8_t , uint32_t, uint8_t *, uint32_t,
uint32_t, uint16_t);
static hsCpuFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
}; };

Loading…
Cancel
Save