Browse Source

Merge pull request #445 from zrax/avx2

Support AVX2
Adam Johnson 10 years ago
parent
commit
f83828d2f1
  1. 85
      Sources/Plasma/CoreLib/hsCpuID.cpp
  2. 32
      Sources/Plasma/CoreLib/hsCpuID.h
  3. 29
      Sources/Plasma/CoreLib/hsMatrix44.cpp
  4. 16
      Sources/Plasma/CoreLib/hsMatrix44.h
  5. 11
      Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.cpp
  6. 6
      Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.h

85
Sources/Plasma/CoreLib/hsCpuID.cpp

@ -59,16 +59,31 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include "hsCpuID.h"
hsCpuId::hsCpuId() {
const unsigned int sse1_flag = 1<<25;
const unsigned int sse2_flag = 1<<26;
const unsigned int sse3_flag = 1<<0;
const unsigned int ssse3_flag = 1<<9;
const unsigned int sse41_flag = 1<<19;
const unsigned int sse42_flag = 1<<20;
const unsigned int avx_flag = 1<<28;
unsigned int ax = 0, bx = 0, cx = 0, dx = 0;
enum : unsigned int {
// EAX=1; EDX=:
sse1_flag = 1U<<25,
sse2_flag = 1U<<26,
// EAX=1; ECX=:
sse3_flag = 1U<<0,
ssse3_flag = 1U<<9,
sse41_flag = 1U<<19,
sse42_flag = 1U<<20,
avx_flag = 1U<<28,
// EAX=7; ECX=0; EBX=:
avx2_flag = 1U<<5
};
union RegSet {
struct {
unsigned int eax, ebx, ecx, edx;
};
int array[4];
};
RegSet CPUInfo_Features = { 0, 0, 0, 0 };
RegSet CPUInfo_Ext = { 0, 0, 0, 0 };
/**
* Portable implementation of CPUID, successfully tested with:
@ -80,33 +95,33 @@ hsCpuId::hsCpuId() {
*
* Ref: http://primesieve.googlecode.com/svn-history/r388/trunk/soe/cpuid.h
*/
#if defined(MSC_COMPATIBLE)
int CPUInfo[4] = {ax, bx, cx, dx};
__cpuid(CPUInfo, 0);
// check if the CPU supports the cpuid instruction.
if (CPUInfo[0] != 0) {
__cpuid(CPUInfo, 1);
ax = CPUInfo[0];
bx = CPUInfo[1];
cx = CPUInfo[2];
dx = CPUInfo[3];
}
#elif defined(GCC_COMPATIBLE)
__get_cpuid(1, &ax, &bx, &cx, &dx);
#endif
has_sse1 = (dx & sse1_flag) || false;
has_sse2 = (dx & sse2_flag) || false;
has_sse3 = (cx & sse3_flag) || false;
has_ssse3 = (cx & ssse3_flag) || false;
has_sse41 = (cx & sse41_flag) || false;
has_sse42 = (cx & sse42_flag) || false;
has_avx = (cx & avx_flag) || false;
#if defined(MSC_COMPATIBLE)
__cpuid(CPUInfo_Features.array, 0);
// check if the CPU supports the cpuid instruction.
if (CPUInfo_Features.eax != 0) {
__cpuid(CPUInfo_Features.array, 1);
__cpuid(CPUInfo_Ext.array, 7);
}
#elif defined(GCC_COMPATIBLE)
__get_cpuid(1, &CPUInfo_Features.eax, &CPUInfo_Features.ebx,
&CPUInfo_Features.ecx, &CPUInfo_Features.edx);
__get_cpuid(7, &CPUInfo_Ext.eax, &CPUInfo_Ext.ebx,
&CPUInfo_Ext.ecx, &CPUInfo_Ext.edx);
#endif
has_sse1 = (CPUInfo_Features.edx & sse1_flag) || false;
has_sse2 = (CPUInfo_Features.edx & sse2_flag) || false;
has_sse3 = (CPUInfo_Features.ecx & sse3_flag) || false;
has_ssse3 = (CPUInfo_Features.ecx & ssse3_flag) || false;
has_sse41 = (CPUInfo_Features.ecx & sse41_flag) || false;
has_sse42 = (CPUInfo_Features.ecx & sse42_flag) || false;
has_avx = (CPUInfo_Features.ecx & avx_flag) || false;
has_avx2 = (CPUInfo_Ext.ebx & avx2_flag) || false;
}
const hsCpuId& hsCpuId::instance()
const hsCpuId& hsCpuId::Instance()
{
static hsCpuId self;
return self;

32
Sources/Plasma/CoreLib/hsCpuID.h

@ -65,7 +65,8 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
// typedef float(*func_ptr)();
// static hsFunctionDispatcher<func_ptr> my_func;
//
// hsFunctionDispatcher<float::func_ptr> float::my_func(float::my_func_fpu, 0, 0, 0, 0, 0, 0, float::my_func_avx);
// hsFunctionDispatcher<func_ptr> my_func(my_func_fpu, 0, 0, 0, 0, 0, 0, my_func_avx);
// my_func();
//
//////////////////////////////////////////////////////////////////////
@ -74,6 +75,12 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#ifndef hsCpuID_inc
#define hsCpuID_inc
#if defined __AVX2__ || _MSC_VER >= 1600
#define HS_AVX2
#ifndef HS_SIMD_INCLUDE
# define HS_SIMD_INCLUDE "immintrin.h"
#endif
#endif
#if defined __AVX__ || _MSC_VER >= 1600
#define HS_AVX
#ifndef HS_SIMD_INCLUDE
@ -126,16 +133,31 @@ struct hsCpuId {
bool has_sse41;
bool has_sse42;
bool has_avx;
bool has_avx2;
hsCpuId();
static const hsCpuId& instance();
static const hsCpuId& Instance();
};
template <typename func_ptr>
struct hsFunctionDispatcher {
hsFunctionDispatcher(func_ptr fpu, func_ptr sse1=0, func_ptr sse2=0, func_ptr sse3=0, func_ptr ssse3=0, func_ptr sse41=0, func_ptr sse42=0, func_ptr avx=0) {
struct hsCpuFunctionDispatcher {
hsCpuFunctionDispatcher(func_ptr fpu,
func_ptr sse1 = nullptr,
func_ptr sse2 = nullptr,
func_ptr sse3 = nullptr,
func_ptr ssse3 = nullptr,
func_ptr sse41 = nullptr,
func_ptr sse42 = nullptr,
func_ptr avx = nullptr,
func_ptr avx2 = nullptr)
{
hsAssert(fpu, "FPU fallback function required.");
const hsCpuId& cpu = hsCpuId::instance();
const hsCpuId& cpu = hsCpuId::Instance();
#ifdef HS_AVX2
if (cpu.has_avx2 && avx2) {
call = avx2;
} else
#endif
#ifdef HS_AVX
if (cpu.has_avx && avx) {
call = avx;

29
Sources/Plasma/CoreLib/hsMatrix44.cpp

@ -57,9 +57,6 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
static hsMatrix44 myIdent = hsMatrix44().Reset();
const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; }
// CPU-optimized functions requiring dispatch
hsFunctionDispatcher<hsMatrix44::mat_mult_ptr> hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3);
/*
For the rotation:
¦ 2 2 ¦
@ -102,7 +99,7 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
rotate.QuatFromMatrix44(*this);
}
hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
static hsMatrix44 mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
{
hsMatrix44 c;
@ -153,7 +150,7 @@ hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
_mm_storeu_ps(c.fMap[i], xmm[1]);
#endif // HS_SSE3
hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
static hsMatrix44 mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
{
hsMatrix44 c;
#ifdef HS_SSE3
@ -201,8 +198,28 @@ hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
return c;
}
// CPU-optimized functions requiring dispatch
hsCpuFunctionDispatcher<hsMatrix44::mat_mult_ptr> hsMatrix44::mat_mult {
&mat_mult_fpu,
nullptr, // SSE1
nullptr, // SSE2
&mat_mult_sse3
};
hsPoint3 hsMatrix44::operator*(const hsPoint3& p) const
{
if (fFlags & hsMatrix44::kIsIdent)
return p;
hsPoint3 rVal;
rVal.fX = (p.fX * fMap[0][0]) + (p.fY * fMap[0][1]) + (p.fZ * fMap[0][2]) + fMap[0][3];
rVal.fY = (p.fX * fMap[1][0]) + (p.fY * fMap[1][1]) + (p.fZ * fMap[1][2]) + fMap[1][3];
rVal.fZ = (p.fX * fMap[2][0]) + (p.fY * fMap[2][1]) + (p.fZ * fMap[2][2]) + fMap[2][3];
return rVal;
}
hsVector3 hsMatrix44::operator*(const hsVector3& p) const
{
{
if( fFlags & hsMatrix44::kIsIdent )
return p;

16
Sources/Plasma/CoreLib/hsMatrix44.h

@ -133,17 +133,7 @@ struct hsMatrix44 {
void MakeZRotation(float radians);
hsPoint3 operator*(const hsPoint3& p) const
{
if( fFlags & hsMatrix44::kIsIdent )
return p;
hsPoint3 rVal;
rVal.fX = (p.fX * fMap[0][0]) + (p.fY * fMap[0][1]) + (p.fZ * fMap[0][2]) + fMap[0][3];
rVal.fY = (p.fX * fMap[1][0]) + (p.fY * fMap[1][1]) + (p.fZ * fMap[1][2]) + fMap[1][3];
rVal.fZ = (p.fX * fMap[2][0]) + (p.fY * fMap[2][1]) + (p.fZ * fMap[2][2]) + fMap[2][3];
return rVal;
}
hsPoint3 operator*(const hsPoint3& p) const;
hsVector3 operator*(const hsVector3& p) const;
hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); }
@ -160,9 +150,7 @@ struct hsMatrix44 {
// CPU-optimized functions
typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&);
static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&);
static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&);
static hsFunctionDispatcher<mat_mult_ptr> mat_mult;
static hsCpuFunctionDispatcher<mat_mult_ptr> mat_mult;
};
////////////////////////////////////////////////////////////////////////////

11
Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.cpp

@ -10714,9 +10714,14 @@ static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMat
}
// CPU-optimized functions requiring dispatch
hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(
IBlendVertBuffer<ISkinVertexFPU>, 0, 0, IBlendVertBuffer<ISkinVertexSSE3>, 0,
IBlendVertBuffer<ISkinVertexSSE41>);
hsCpuFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer {
&IBlendVertBuffer<ISkinVertexFPU>,
nullptr, // SSE1
nullptr, // SSE2
&IBlendVertBuffer<ISkinVertexSSE3>,
nullptr, // SSSE3
&IBlendVertBuffer<ISkinVertexSSE41>
};
// ISetPipeConsts //////////////////////////////////////////////////////////////////
// A shader can request that the pipeline fill in certain constants that are indeterminate

6
Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.h

@ -798,8 +798,10 @@ public:
// CPU-optimized functions
protected:
typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *,
uint8_t , uint32_t, uint8_t *, uint32_t,
uint32_t, uint16_t);
static hsCpuFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
};

Loading…
Cancel
Save