From 19bda88893e3713b80270c5aa35becc4b63271e6 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Sat, 24 May 2014 18:48:17 -0700 Subject: [PATCH 1/2] Add AVX2 support to hsCpuID, and cleanup some misc related junk --- Sources/Plasma/CoreLib/hsCpuID.cpp | 85 +++++++++++-------- Sources/Plasma/CoreLib/hsCpuID.h | 29 ++++++- Sources/Plasma/CoreLib/hsMatrix44.cpp | 29 +++++-- Sources/Plasma/CoreLib/hsMatrix44.h | 16 +--- .../PubUtilLib/plPipeline/DX/plDXPipeline.cpp | 11 ++- .../PubUtilLib/plPipeline/DX/plDXPipeline.h | 6 +- 6 files changed, 112 insertions(+), 64 deletions(-) diff --git a/Sources/Plasma/CoreLib/hsCpuID.cpp b/Sources/Plasma/CoreLib/hsCpuID.cpp index 021a76a6..414627b5 100644 --- a/Sources/Plasma/CoreLib/hsCpuID.cpp +++ b/Sources/Plasma/CoreLib/hsCpuID.cpp @@ -59,16 +59,31 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com #include "hsCpuID.h" hsCpuId::hsCpuId() { - const unsigned int sse1_flag = 1<<25; - const unsigned int sse2_flag = 1<<26; - const unsigned int sse3_flag = 1<<0; - const unsigned int ssse3_flag = 1<<9; - const unsigned int sse41_flag = 1<<19; - const unsigned int sse42_flag = 1<<20; - const unsigned int avx_flag = 1<<28; - - unsigned int ax = 0, bx = 0, cx = 0, dx = 0; - + enum : unsigned int { + // EAX=1; EDX=: + sse1_flag = 1U<<25, + sse2_flag = 1U<<26, + + // EAX=1; ECX=: + sse3_flag = 1U<<0, + ssse3_flag = 1U<<9, + sse41_flag = 1U<<19, + sse42_flag = 1U<<20, + avx_flag = 1U<<28, + + // EAX=7; ECX=0; EBX=: + avx2_flag = 1U<<5 + }; + + union RegSet { + struct { + unsigned int eax, ebx, ecx, edx; + }; + int array[4]; + }; + + RegSet CPUInfo_Features = { 0, 0, 0, 0 }; + RegSet CPUInfo_Ext = { 0, 0, 0, 0 }; /** * Portable implementation of CPUID, successfully tested with: @@ -80,33 +95,33 @@ hsCpuId::hsCpuId() { * * Ref: http://primesieve.googlecode.com/svn-history/r388/trunk/soe/cpuid.h */ - #if defined(MSC_COMPATIBLE) - int CPUInfo[4] = {ax, bx, cx, dx}; - __cpuid(CPUInfo, 0); - - // check if the CPU supports the cpuid instruction. - if (CPUInfo[0] != 0) { - __cpuid(CPUInfo, 1); - ax = CPUInfo[0]; - bx = CPUInfo[1]; - cx = CPUInfo[2]; - dx = CPUInfo[3]; - } - #elif defined(GCC_COMPATIBLE) - __get_cpuid(1, &ax, &bx, &cx, &dx); - #endif - - - has_sse1 = (dx & sse1_flag) || false; - has_sse2 = (dx & sse2_flag) || false; - has_sse3 = (cx & sse3_flag) || false; - has_ssse3 = (cx & ssse3_flag) || false; - has_sse41 = (cx & sse41_flag) || false; - has_sse42 = (cx & sse42_flag) || false; - has_avx = (cx & avx_flag) || false; +#if defined(MSC_COMPATIBLE) + __cpuid(CPUInfo_Features.array, 0); + + // check if the CPU supports the cpuid instruction. + if (CPUInfo_Features.eax != 0) { + __cpuid(CPUInfo_Features.array, 1); + __cpuid(CPUInfo_Ext.array, 7); + } +#elif defined(GCC_COMPATIBLE) + __get_cpuid(1, &CPUInfo_Features.eax, &CPUInfo_Features.ebx, + &CPUInfo_Features.ecx, &CPUInfo_Features.edx); + __get_cpuid(7, &CPUInfo_Ext.eax, &CPUInfo_Ext.ebx, + &CPUInfo_Ext.ecx, &CPUInfo_Ext.edx); +#endif + + + has_sse1 = (CPUInfo_Features.edx & sse1_flag) || false; + has_sse2 = (CPUInfo_Features.edx & sse2_flag) || false; + has_sse3 = (CPUInfo_Features.ecx & sse3_flag) || false; + has_ssse3 = (CPUInfo_Features.ecx & ssse3_flag) || false; + has_sse41 = (CPUInfo_Features.ecx & sse41_flag) || false; + has_sse42 = (CPUInfo_Features.ecx & sse42_flag) || false; + has_avx = (CPUInfo_Features.ecx & avx_flag) || false; + has_avx2 = (CPUInfo_Ext.ebx & avx2_flag) || false; } -const hsCpuId& hsCpuId::instance() +const hsCpuId& hsCpuId::Instance() { static hsCpuId self; return self; diff --git a/Sources/Plasma/CoreLib/hsCpuID.h b/Sources/Plasma/CoreLib/hsCpuID.h index 6c5f8385..b6e9ca1c 100644 --- a/Sources/Plasma/CoreLib/hsCpuID.h +++ b/Sources/Plasma/CoreLib/hsCpuID.h @@ -74,6 +74,12 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com #ifndef hsCpuID_inc #define hsCpuID_inc +#if defined __AVX2__ || _MSC_VER >= 1600 +#define HS_AVX2 +#ifndef HS_SIMD_INCLUDE +# define HS_SIMD_INCLUDE "immintrin.h" +#endif +#endif #if defined __AVX__ || _MSC_VER >= 1600 #define HS_AVX #ifndef HS_SIMD_INCLUDE @@ -126,16 +132,31 @@ struct hsCpuId { bool has_sse41; bool has_sse42; bool has_avx; + bool has_avx2; hsCpuId(); - static const hsCpuId& instance(); + static const hsCpuId& Instance(); }; template -struct hsFunctionDispatcher { - hsFunctionDispatcher(func_ptr fpu, func_ptr sse1=0, func_ptr sse2=0, func_ptr sse3=0, func_ptr ssse3=0, func_ptr sse41=0, func_ptr sse42=0, func_ptr avx=0) { +struct hsCpuFunctionDispatcher { + hsCpuFunctionDispatcher(func_ptr fpu, + func_ptr sse1 = nullptr, + func_ptr sse2 = nullptr, + func_ptr sse3 = nullptr, + func_ptr ssse3 = nullptr, + func_ptr sse41 = nullptr, + func_ptr sse42 = nullptr, + func_ptr avx = nullptr, + func_ptr avx2 = nullptr) + { hsAssert(fpu, "FPU fallback function required."); - const hsCpuId& cpu = hsCpuId::instance(); + const hsCpuId& cpu = hsCpuId::Instance(); +#ifdef HS_AVX2 + if (cpu.has_avx2 && avx2) { + call = avx2; + } else +#endif #ifdef HS_AVX if (cpu.has_avx && avx) { call = avx; diff --git a/Sources/Plasma/CoreLib/hsMatrix44.cpp b/Sources/Plasma/CoreLib/hsMatrix44.cpp index 7270aa9e..a5881f5f 100644 --- a/Sources/Plasma/CoreLib/hsMatrix44.cpp +++ b/Sources/Plasma/CoreLib/hsMatrix44.cpp @@ -57,9 +57,6 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com static hsMatrix44 myIdent = hsMatrix44().Reset(); const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; } -// CPU-optimized functions requiring dispatch -hsFunctionDispatcher hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3); - /* For the rotation: ¦ 2 2 ¦ @@ -102,7 +99,7 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const rotate.QuatFromMatrix44(*this); } -hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b) +static hsMatrix44 mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b) { hsMatrix44 c; @@ -153,7 +150,7 @@ hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b) _mm_storeu_ps(c.fMap[i], xmm[1]); #endif // HS_SSE3 -hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b) +static hsMatrix44 mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b) { hsMatrix44 c; #ifdef HS_SSE3 @@ -201,8 +198,28 @@ hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b) return c; } +// CPU-optimized functions requiring dispatch +hsCpuFunctionDispatcher hsMatrix44::mat_mult { + &mat_mult_fpu, + nullptr, // SSE1 + nullptr, // SSE2 + &mat_mult_sse3 +}; + +hsPoint3 hsMatrix44::operator*(const hsPoint3& p) const +{ + if (fFlags & hsMatrix44::kIsIdent) + return p; + + hsPoint3 rVal; + rVal.fX = (p.fX * fMap[0][0]) + (p.fY * fMap[0][1]) + (p.fZ * fMap[0][2]) + fMap[0][3]; + rVal.fY = (p.fX * fMap[1][0]) + (p.fY * fMap[1][1]) + (p.fZ * fMap[1][2]) + fMap[1][3]; + rVal.fZ = (p.fX * fMap[2][0]) + (p.fY * fMap[2][1]) + (p.fZ * fMap[2][2]) + fMap[2][3]; + return rVal; +} + hsVector3 hsMatrix44::operator*(const hsVector3& p) const -{ +{ if( fFlags & hsMatrix44::kIsIdent ) return p; diff --git a/Sources/Plasma/CoreLib/hsMatrix44.h b/Sources/Plasma/CoreLib/hsMatrix44.h index 346a3ed7..d9a588e7 100644 --- a/Sources/Plasma/CoreLib/hsMatrix44.h +++ b/Sources/Plasma/CoreLib/hsMatrix44.h @@ -133,17 +133,7 @@ struct hsMatrix44 { void MakeZRotation(float radians); - hsPoint3 operator*(const hsPoint3& p) const - { - if( fFlags & hsMatrix44::kIsIdent ) - return p; - - hsPoint3 rVal; - rVal.fX = (p.fX * fMap[0][0]) + (p.fY * fMap[0][1]) + (p.fZ * fMap[0][2]) + fMap[0][3]; - rVal.fY = (p.fX * fMap[1][0]) + (p.fY * fMap[1][1]) + (p.fZ * fMap[1][2]) + fMap[1][3]; - rVal.fZ = (p.fX * fMap[2][0]) + (p.fY * fMap[2][1]) + (p.fZ * fMap[2][2]) + fMap[2][3]; - return rVal; - } + hsPoint3 operator*(const hsPoint3& p) const; hsVector3 operator*(const hsVector3& p) const; hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); } @@ -160,9 +150,7 @@ struct hsMatrix44 { // CPU-optimized functions typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&); - static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&); - static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&); - static hsFunctionDispatcher mat_mult; + static hsCpuFunctionDispatcher mat_mult; }; //////////////////////////////////////////////////////////////////////////// diff --git a/Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.cpp index 733297a1..ba58b8e2 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.cpp +++ b/Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.cpp @@ -10714,9 +10714,14 @@ static void IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette, int numMat } // CPU-optimized functions requiring dispatch -hsFunctionDispatcher plDXPipeline::blend_vert_buffer( - IBlendVertBuffer, 0, 0, IBlendVertBuffer, 0, - IBlendVertBuffer); +hsCpuFunctionDispatcher plDXPipeline::blend_vert_buffer { + &IBlendVertBuffer, + nullptr, // SSE1 + nullptr, // SSE2 + &IBlendVertBuffer, + nullptr, // SSSE3 + &IBlendVertBuffer +}; // ISetPipeConsts ////////////////////////////////////////////////////////////////// // A shader can request that the pipeline fill in certain constants that are indeterminate diff --git a/Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.h b/Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.h index 46d40412..0f3c8047 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.h +++ b/Sources/Plasma/PubUtilLib/plPipeline/DX/plDXPipeline.h @@ -798,8 +798,10 @@ public: // CPU-optimized functions protected: - typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); - static hsFunctionDispatcher blend_vert_buffer; + typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, + uint8_t , uint32_t, uint8_t *, uint32_t, + uint32_t, uint16_t); + static hsCpuFunctionDispatcher blend_vert_buffer; }; From 2bbfa9399d5e1a080201a941d746e083d44c5068 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Fri, 8 Aug 2014 21:47:19 -0700 Subject: [PATCH 2/2] Fixup example --- Sources/Plasma/CoreLib/hsCpuID.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Sources/Plasma/CoreLib/hsCpuID.h b/Sources/Plasma/CoreLib/hsCpuID.h index b6e9ca1c..79aa1e82 100644 --- a/Sources/Plasma/CoreLib/hsCpuID.h +++ b/Sources/Plasma/CoreLib/hsCpuID.h @@ -65,7 +65,8 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com // typedef float(*func_ptr)(); // static hsFunctionDispatcher my_func; // -// hsFunctionDispatcher float::my_func(float::my_func_fpu, 0, 0, 0, 0, 0, 0, float::my_func_avx); +// hsFunctionDispatcher my_func(my_func_fpu, 0, 0, 0, 0, 0, 0, my_func_avx); +// my_func(); // //////////////////////////////////////////////////////////////////////