diff --git a/CMakeLists.txt b/CMakeLists.txt index c042e895..11abb4c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,12 +84,6 @@ if(MSVC) add_definitions(-D_SCL_SECURE_NO_WARNINGS) endif(MSVC) -# TODO: Maybe some kind of automated test here? -option(PLASMA_USE_SSE "Enable SSE optimizations?" ON) -if(PLASMA_USE_SSE) - add_definitions(-DHAVE_SSE) -endif(PLASMA_USE_SSE) - #TODO: Make the OpenSSL includes less promiscuous so this isn't needed include_directories(${OPENSSL_INCLUDE_DIR}) diff --git a/Sources/Plasma/Apps/plClient/winmain.cpp b/Sources/Plasma/Apps/plClient/winmain.cpp index 9870afde..985a73f0 100644 --- a/Sources/Plasma/Apps/plClient/winmain.cpp +++ b/Sources/Plasma/Apps/plClient/winmain.cpp @@ -49,10 +49,6 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com #include // Windows Load EXE into memory suff #endif -#ifdef HAVE_SSE -# include -#endif - #include #include "HeadSpin.h" @@ -1388,35 +1384,11 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except } #endif -bool CheckCPU() -{ - const unsigned int sse3_flag = 0x00000001; - // (any other CPU features...) - - int cpu_info[4]; - __cpuid(cpu_info, 1); -#ifdef HAVE_SSE - if((cpu_info[2] & sse3_flag) == 0) - return false; -#endif - // Insert additional feature checks here - - return true; -} - #include "pfConsoleCore/pfConsoleEngine.h" PF_CONSOLE_LINK_ALL() int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow) { - // Check to make sure we have a good CPU before getting started - if (!CheckCPU()) - { - plString msg = plString::Format("Your processor does not support all of the features required to play %S.", ProductLongName()); - hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError); - return PARABLE_NORMAL_EXIT; - } - PF_CONSOLE_INIT_ALL() // Set global handle diff --git a/Sources/Plasma/CoreLib/CMakeLists.txt b/Sources/Plasma/CoreLib/CMakeLists.txt index e4cf8278..d99906de 100644 --- a/Sources/Plasma/CoreLib/CMakeLists.txt +++ b/Sources/Plasma/CoreLib/CMakeLists.txt @@ -15,6 +15,7 @@ set(CoreLib_SOURCES HeadSpin.cpp hsBitVector.cpp hsBounds.cpp + hsCpuID.cpp hsCritSect.cpp hsExceptionStack.cpp hsFastMath.cpp @@ -57,6 +58,7 @@ set(CoreLib_HEADERS hsBitVector.h hsBounds.h hsColorRGBA.h + hsCpuID.h hsCritSect.h hsExceptions.h hsFastMath.h diff --git a/Sources/Plasma/CoreLib/hsCpuID.cpp b/Sources/Plasma/CoreLib/hsCpuID.cpp new file mode 100644 index 00000000..12c99f9b --- /dev/null +++ b/Sources/Plasma/CoreLib/hsCpuID.cpp @@ -0,0 +1,71 @@ +/*==LICENSE==* + +CyanWorlds.com Engine - MMOG client, server and tools +Copyright (C) 2011 Cyan Worlds, Inc. + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +Additional permissions under GNU GPL version 3 section 7 + +If you modify this Program, or any covered work, by linking or +combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK, +NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent +JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK +(or a modified version of those libraries), +containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA, +PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG +JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the +licensors of this Program grant you additional +permission to convey the resulting work. Corresponding Source for a +non-source form of such a combination shall include the source code for +the parts of OpenSSL and IJG JPEG Library used as well as that of the covered +work. + +You can contact Cyan Worlds, Inc. by email legal@cyan.com + or by snail mail at: + Cyan Worlds, Inc. + 14617 N Newport Hwy + Mead, WA 99021 + +*==LICENSE==*/ + +#include + +#include "hsCpuID.h" + +hsCpuId::hsCpuId() { + const unsigned int sse1_flag = 1<<25; + const unsigned int sse2_flag = 1<<26; + const unsigned int sse3_flag = 1<<0; + const unsigned int ssse3_flag = 1<<9; + const unsigned int sse41_flag = 1<<19; + const unsigned int sse42_flag = 1<<20; + const unsigned int avx_flag = 1 << 28; + + unsigned int cpu_info[4]; + __cpuid((int*)cpu_info, 1); + has_sse1 = (cpu_info[3] & sse1_flag) || false; + has_sse2 = (cpu_info[3] & sse2_flag) || false; + has_sse3 = (cpu_info[2] & sse3_flag) || false; + has_ssse3 = (cpu_info[2] & ssse3_flag) || false; + has_sse41 = (cpu_info[2] & sse41_flag) || false; + has_sse42 = (cpu_info[2] & sse42_flag) || false; + has_avx = (cpu_info[2] & avx_flag) || false; +} + +const hsCpuId& hsCpuId::instance() +{ + static hsCpuId self; + return self; +} diff --git a/Sources/Plasma/CoreLib/hsCpuID.h b/Sources/Plasma/CoreLib/hsCpuID.h new file mode 100644 index 00000000..6c5f8385 --- /dev/null +++ b/Sources/Plasma/CoreLib/hsCpuID.h @@ -0,0 +1,182 @@ +/*==LICENSE==* + +CyanWorlds.com Engine - MMOG client, server and tools +Copyright (C) 2011 Cyan Worlds, Inc. + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +Additional permissions under GNU GPL version 3 section 7 + +If you modify this Program, or any covered work, by linking or +combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK, +NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent +JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK +(or a modified version of those libraries), +containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA, +PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG +JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the +licensors of this Program grant you additional +permission to convey the resulting work. Corresponding Source for a +non-source form of such a combination shall include the source code for +the parts of OpenSSL and IJG JPEG Library used as well as that of the covered +work. + +You can contact Cyan Worlds, Inc. by email legal@cyan.com + or by snail mail at: + Cyan Worlds, Inc. + 14617 N Newport Hwy + Mead, WA 99021 + +*==LICENSE==*/ + +////////////////////////////////////////////////////////////////////// +// +// hsCpuID - Processor feature detection and function dispatcher +// +// +// == Example Usage == +// +// #ifdef HS_SIMD_INCLUDE +// # include HS_SIMD_INCLUDE +// #endif +// +// float my_func_fpu() { +// ... +// } +// +// float my_func_avx() { +// #ifdef HS_AVX +// ... +// #endif +// } +// +// +// typedef float(*func_ptr)(); +// static hsFunctionDispatcher my_func; +// +// hsFunctionDispatcher float::my_func(float::my_func_fpu, 0, 0, 0, 0, 0, 0, float::my_func_avx); +// +////////////////////////////////////////////////////////////////////// + + + +#ifndef hsCpuID_inc +#define hsCpuID_inc + +#if defined __AVX__ || _MSC_VER >= 1600 +#define HS_AVX +#ifndef HS_SIMD_INCLUDE +# define HS_SIMD_INCLUDE "immintrin.h" +#endif +#endif +#if defined __SSE4_2__ || _MSC_VER >= 1600 +#define HS_SSE42 +#ifndef HS_SIMD_INCLUDE +# define HS_SIMD_INCLUDE "nmmintrin.h" +#endif +#endif +#if defined __SSE4_1__ || _MSC_VER >= 1600 +#define HS_SSE41 +#ifndef HS_SIMD_INCLUDE +# define HS_SIMD_INCLUDE "smmintrin.h" +#endif +#endif +#if defined __SSSE3__ || _MSC_VER >= 1600 +#define HS_SSSE3 +#ifndef HS_SIMD_INCLUDE +# define HS_SIMD_INCLUDE "tmmintrin.h" +#endif +#endif +#if defined __SSE3__ || _MSC_VER >= 1400 +#define HS_SSE3 +#ifndef HS_SIMD_INCLUDE +# define HS_SIMD_INCLUDE "pmmintrin.h" +#endif +#endif +#if defined __SSE2__ || _MSC_VER >= 1300 +#define HS_SSE2 +#ifndef HS_SIMD_INCLUDE +# define HS_SIMD_INCLUDE "emmintrin.h" +#endif +#endif +#if defined __SSE__ || _MSC_VER >= 1300 +#define HS_SSE1 +#ifndef HS_SIMD_INCLUDE +# define HS_SIMD_INCLUDE "xmmintrin.h" +#endif +#endif + + +struct hsCpuId { + bool has_sse1; + bool has_sse2; + bool has_sse3; + bool has_ssse3; + bool has_sse41; + bool has_sse42; + bool has_avx; + + hsCpuId(); + static const hsCpuId& instance(); +}; + +template +struct hsFunctionDispatcher { + hsFunctionDispatcher(func_ptr fpu, func_ptr sse1=0, func_ptr sse2=0, func_ptr sse3=0, func_ptr ssse3=0, func_ptr sse41=0, func_ptr sse42=0, func_ptr avx=0) { + hsAssert(fpu, "FPU fallback function required."); + const hsCpuId& cpu = hsCpuId::instance(); +#ifdef HS_AVX + if (cpu.has_avx && avx) { + call = avx; + } else +#endif +#ifdef HS_SSE42 + if (cpu.has_sse42 && sse42) { + call = sse42; + } else +#endif +#ifdef HS_SSE41 + if (cpu.has_sse41 && sse41) { + call = sse41; + } else +#endif +#ifdef HS_SSSE3 + if (cpu.has_ssse3 && ssse3) { + call = ssse3; + } else +#endif +#ifdef HS_SSE3 + if (cpu.has_sse3 && sse3) { + call = sse3; + } else +#endif +#ifdef HS_SSE2 + if (cpu.has_sse2 && sse2) { + call = sse2; + } else +#endif +#ifdef HS_SSE1 + if (cpu.has_sse1 && sse1) { + call = sse1; + } else +#endif + { + call = fpu; + } + }; + func_ptr call; +}; + + +#endif // hsCpuID_inc \ No newline at end of file diff --git a/Sources/Plasma/CoreLib/hsMatrix44.cpp b/Sources/Plasma/CoreLib/hsMatrix44.cpp index 9c4053f3..75bb0883 100644 --- a/Sources/Plasma/CoreLib/hsMatrix44.cpp +++ b/Sources/Plasma/CoreLib/hsMatrix44.cpp @@ -47,13 +47,16 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com #include "hsStream.h" #include -#ifdef HAVE_SSE -# include +#ifdef HS_SIMD_INCLUDE +# include HS_SIMD_INCLUDE #endif static hsMatrix44 myIdent = hsMatrix44().Reset(); const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; } +// CPU-optimized functions requiring dispatch +hsFunctionDispatcher hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3); + /* For the rotation: ¦ 2 2 ¦ @@ -96,9 +99,47 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const rotate.QuatFromMatrix44(*this); } -#ifdef HAVE_SSE +hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b) +{ + hsMatrix44 c; + + if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent ) + { + c.Reset(); + return c; + } + + if( a.fFlags & hsMatrix44::kIsIdent ) + return b; + if( b.fFlags & hsMatrix44::kIsIdent ) + return a; + + c.fMap[0][0] = (a.fMap[0][0] * b.fMap[0][0]) + (a.fMap[0][1] * b.fMap[1][0]) + (a.fMap[0][2] * b.fMap[2][0]) + (a.fMap[0][3] * b.fMap[3][0]); + c.fMap[0][1] = (a.fMap[0][0] * b.fMap[0][1]) + (a.fMap[0][1] * b.fMap[1][1]) + (a.fMap[0][2] * b.fMap[2][1]) + (a.fMap[0][3] * b.fMap[3][1]); + c.fMap[0][2] = (a.fMap[0][0] * b.fMap[0][2]) + (a.fMap[0][1] * b.fMap[1][2]) + (a.fMap[0][2] * b.fMap[2][2]) + (a.fMap[0][3] * b.fMap[3][2]); + c.fMap[0][3] = (a.fMap[0][0] * b.fMap[0][3]) + (a.fMap[0][1] * b.fMap[1][3]) + (a.fMap[0][2] * b.fMap[2][3]) + (a.fMap[0][3] * b.fMap[3][3]); + + c.fMap[1][0] = (a.fMap[1][0] * b.fMap[0][0]) + (a.fMap[1][1] * b.fMap[1][0]) + (a.fMap[1][2] * b.fMap[2][0]) + (a.fMap[1][3] * b.fMap[3][0]); + c.fMap[1][1] = (a.fMap[1][0] * b.fMap[0][1]) + (a.fMap[1][1] * b.fMap[1][1]) + (a.fMap[1][2] * b.fMap[2][1]) + (a.fMap[1][3] * b.fMap[3][1]); + c.fMap[1][2] = (a.fMap[1][0] * b.fMap[0][2]) + (a.fMap[1][1] * b.fMap[1][2]) + (a.fMap[1][2] * b.fMap[2][2]) + (a.fMap[1][3] * b.fMap[3][2]); + c.fMap[1][3] = (a.fMap[1][0] * b.fMap[0][3]) + (a.fMap[1][1] * b.fMap[1][3]) + (a.fMap[1][2] * b.fMap[2][3]) + (a.fMap[1][3] * b.fMap[3][3]); + + c.fMap[2][0] = (a.fMap[2][0] * b.fMap[0][0]) + (a.fMap[2][1] * b.fMap[1][0]) + (a.fMap[2][2] * b.fMap[2][0]) + (a.fMap[2][3] * b.fMap[3][0]); + c.fMap[2][1] = (a.fMap[2][0] * b.fMap[0][1]) + (a.fMap[2][1] * b.fMap[1][1]) + (a.fMap[2][2] * b.fMap[2][1]) + (a.fMap[2][3] * b.fMap[3][1]); + c.fMap[2][2] = (a.fMap[2][0] * b.fMap[0][2]) + (a.fMap[2][1] * b.fMap[1][2]) + (a.fMap[2][2] * b.fMap[2][2]) + (a.fMap[2][3] * b.fMap[3][2]); + c.fMap[2][3] = (a.fMap[2][0] * b.fMap[0][3]) + (a.fMap[2][1] * b.fMap[1][3]) + (a.fMap[2][2] * b.fMap[2][3]) + (a.fMap[2][3] * b.fMap[3][3]); + + c.fMap[3][0] = (a.fMap[3][0] * b.fMap[0][0]) + (a.fMap[3][1] * b.fMap[1][0]) + (a.fMap[3][2] * b.fMap[2][0]) + (a.fMap[3][3] * b.fMap[3][0]); + c.fMap[3][1] = (a.fMap[3][0] * b.fMap[0][1]) + (a.fMap[3][1] * b.fMap[1][1]) + (a.fMap[3][2] * b.fMap[2][1]) + (a.fMap[3][3] * b.fMap[3][1]); + c.fMap[3][2] = (a.fMap[3][0] * b.fMap[0][2]) + (a.fMap[3][1] * b.fMap[1][2]) + (a.fMap[3][2] * b.fMap[2][2]) + (a.fMap[3][3] * b.fMap[3][2]); + c.fMap[3][3] = (a.fMap[3][0] * b.fMap[0][3]) + (a.fMap[3][1] * b.fMap[1][3]) + (a.fMap[3][2] * b.fMap[2][3]) + (a.fMap[3][3] * b.fMap[3][3]); + + return c; +} + +#ifdef HS_SSE3 # define MULTBEGIN(i) \ - xmm[0] = _mm_loadu_ps(fMap[i]); + xmm[0] = _mm_loadu_ps(a.fMap[i]); # define MULTCELL(i, j) \ xmm[1] = _mm_set_ps(b.fMap[3][j], b.fMap[2][j], b.fMap[1][j], b.fMap[0][j]); \ xmm[j+2] = _mm_mul_ps(xmm[0], xmm[1]); @@ -107,24 +148,23 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const xmm[7] = _mm_hadd_ps(xmm[4], xmm[5]); \ xmm[1] = _mm_hadd_ps(xmm[6], xmm[7]); \ _mm_storeu_ps(c.fMap[i], xmm[1]); -#endif +#endif // HS_SSE3 -hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const +hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b) { hsMatrix44 c; - - if( fFlags & b.fFlags & hsMatrix44::kIsIdent ) +#ifdef HS_SSE3 + if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent ) { c.Reset(); return c; } - if( fFlags & hsMatrix44::kIsIdent ) + if( a.fFlags & hsMatrix44::kIsIdent ) return b; if( b.fFlags & hsMatrix44::kIsIdent ) - return *this; + return a; -#ifdef HAVE_SSE __m128 xmm[8]; MULTBEGIN(0); @@ -154,28 +194,7 @@ hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const MULTCELL(3, 2); MULTCELL(3, 3); MULTFINISH(3); -#else - c.fMap[0][0] = (fMap[0][0] * b.fMap[0][0]) + (fMap[0][1] * b.fMap[1][0]) + (fMap[0][2] * b.fMap[2][0]) + (fMap[0][3] * b.fMap[3][0]); - c.fMap[0][1] = (fMap[0][0] * b.fMap[0][1]) + (fMap[0][1] * b.fMap[1][1]) + (fMap[0][2] * b.fMap[2][1]) + (fMap[0][3] * b.fMap[3][1]); - c.fMap[0][2] = (fMap[0][0] * b.fMap[0][2]) + (fMap[0][1] * b.fMap[1][2]) + (fMap[0][2] * b.fMap[2][2]) + (fMap[0][3] * b.fMap[3][2]); - c.fMap[0][3] = (fMap[0][0] * b.fMap[0][3]) + (fMap[0][1] * b.fMap[1][3]) + (fMap[0][2] * b.fMap[2][3]) + (fMap[0][3] * b.fMap[3][3]); - - c.fMap[1][0] = (fMap[1][0] * b.fMap[0][0]) + (fMap[1][1] * b.fMap[1][0]) + (fMap[1][2] * b.fMap[2][0]) + (fMap[1][3] * b.fMap[3][0]); - c.fMap[1][1] = (fMap[1][0] * b.fMap[0][1]) + (fMap[1][1] * b.fMap[1][1]) + (fMap[1][2] * b.fMap[2][1]) + (fMap[1][3] * b.fMap[3][1]); - c.fMap[1][2] = (fMap[1][0] * b.fMap[0][2]) + (fMap[1][1] * b.fMap[1][2]) + (fMap[1][2] * b.fMap[2][2]) + (fMap[1][3] * b.fMap[3][2]); - c.fMap[1][3] = (fMap[1][0] * b.fMap[0][3]) + (fMap[1][1] * b.fMap[1][3]) + (fMap[1][2] * b.fMap[2][3]) + (fMap[1][3] * b.fMap[3][3]); - - c.fMap[2][0] = (fMap[2][0] * b.fMap[0][0]) + (fMap[2][1] * b.fMap[1][0]) + (fMap[2][2] * b.fMap[2][0]) + (fMap[2][3] * b.fMap[3][0]); - c.fMap[2][1] = (fMap[2][0] * b.fMap[0][1]) + (fMap[2][1] * b.fMap[1][1]) + (fMap[2][2] * b.fMap[2][1]) + (fMap[2][3] * b.fMap[3][1]); - c.fMap[2][2] = (fMap[2][0] * b.fMap[0][2]) + (fMap[2][1] * b.fMap[1][2]) + (fMap[2][2] * b.fMap[2][2]) + (fMap[2][3] * b.fMap[3][2]); - c.fMap[2][3] = (fMap[2][0] * b.fMap[0][3]) + (fMap[2][1] * b.fMap[1][3]) + (fMap[2][2] * b.fMap[2][3]) + (fMap[2][3] * b.fMap[3][3]); - - c.fMap[3][0] = (fMap[3][0] * b.fMap[0][0]) + (fMap[3][1] * b.fMap[1][0]) + (fMap[3][2] * b.fMap[2][0]) + (fMap[3][3] * b.fMap[3][0]); - c.fMap[3][1] = (fMap[3][0] * b.fMap[0][1]) + (fMap[3][1] * b.fMap[1][1]) + (fMap[3][2] * b.fMap[2][1]) + (fMap[3][3] * b.fMap[3][1]); - c.fMap[3][2] = (fMap[3][0] * b.fMap[0][2]) + (fMap[3][1] * b.fMap[1][2]) + (fMap[3][2] * b.fMap[2][2]) + (fMap[3][3] * b.fMap[3][2]); - c.fMap[3][3] = (fMap[3][0] * b.fMap[0][3]) + (fMap[3][1] * b.fMap[1][3]) + (fMap[3][2] * b.fMap[2][3]) + (fMap[3][3] * b.fMap[3][3]); -#endif - +#endif // HS_SSE3 return c; } diff --git a/Sources/Plasma/CoreLib/hsMatrix44.h b/Sources/Plasma/CoreLib/hsMatrix44.h index 7804631d..c0a5ed21 100644 --- a/Sources/Plasma/CoreLib/hsMatrix44.h +++ b/Sources/Plasma/CoreLib/hsMatrix44.h @@ -44,6 +44,7 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com #include "HeadSpin.h" #include "hsGeometry3.h" +#include "hsCpuID.h" class hsQuat; @@ -104,7 +105,7 @@ struct hsMatrix44 { const hsVector3* up); hsBool GetParity() const; - float GetDeterminant() const; + float GetDeterminant() const; hsMatrix44* GetInverse(hsMatrix44* inverse) const; hsMatrix44* GetTranspose(hsMatrix44* inverse) const; hsMatrix44* GetAdjoint(hsMatrix44* adjoint) const; @@ -140,7 +141,7 @@ struct hsMatrix44 { return rVal; } hsVector3 operator*(const hsVector3& p) const; - hsMatrix44 operator*(const hsMatrix44& b) const; + hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); } hsPoint3* MapPoints(long count, hsPoint3 points[]) const; @@ -152,6 +153,12 @@ struct hsMatrix44 { void Read(hsStream *stream); void Write(hsStream *stream); + + // CPU-optimized functions + typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&); + static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&); + static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&); + static hsFunctionDispatcher mat_mult; }; //////////////////////////////////////////////////////////////////////////// diff --git a/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp b/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp index 1c64314d..878acdc0 100644 --- a/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp +++ b/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp @@ -380,7 +380,6 @@ plProfile_CreateTimer(" CIRecalcT", "Object", CIRecalcT); plProfile_CreateTimer(" CIDirtyT", "Object", CIDirtyT); plProfile_CreateTimer(" CISetT", "Object", CISetT); -#ifndef HAVE_SSE static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs) { hsMatrix44 ret; @@ -441,7 +440,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r return ret; } -#endif // HAVE_SSE void plCoordinateInterface::IRecalcTransforms() { @@ -449,13 +447,8 @@ void plCoordinateInterface::IRecalcTransforms() plProfile_BeginTiming(CIRecalcT); if( fParent ) { -#ifdef HAVE_SSE - fLocalToWorld = fParent->GetLocalToWorld() * fLocalToParent; - fWorldToLocal = fParentToLocal * fParent->GetWorldToLocal(); -#else fLocalToWorld = IMatrixMul34(fParent->GetLocalToWorld(), fLocalToParent); fWorldToLocal = IMatrixMul34(fParentToLocal, fParent->GetWorldToLocal()); -#endif } else { diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp index 3fc19b2a..369dd94e 100644 --- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp +++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp @@ -416,7 +416,6 @@ hsBool plDrawableSpans::IBoundsInvalid(const hsBounds3Ext& bnd) const } //// SetTransform //////////////////////////////////////////////////////////// -#ifndef HAVE_SSE static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs) { hsMatrix44 ret; @@ -477,7 +476,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r return ret; } -#endif #ifdef MF_TEST_UPDATE plProfile_CreateCounter("DSSetTrans", "Update", DSSetTrans); @@ -521,13 +519,9 @@ plDrawable& plDrawableSpans::SetTransform( uint32_t index, const hsMatrix44& l2w #endif // MF_TEST_UPDATE for( i = 0; i < spans->GetCount(); i++ ) { -#ifdef HAVE_SSE - fLocalToWorlds[ (*spans)[ i ] ] = l2w * fLocalToBones[ (*spans)[ i ] ]; - fWorldToLocals[ (*spans)[ i ] ] = fBoneToLocals[ (*spans)[ i ] ] * w2l; -#else fLocalToWorlds[ (*spans)[ i ] ] = IMatrixMul34(l2w, fLocalToBones[ (*spans)[ i ] ]); fWorldToLocals[ (*spans)[ i ] ] = IMatrixMul34(fBoneToLocals[ (*spans)[ i ] ], w2l); -#endif // HAVE_SSE + } #ifdef MF_TEST_UPDATE plProfile_EndTiming(DSMatTransT); diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp index 6c93dbbd..9bc0373f 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp @@ -163,8 +163,8 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com #include -#ifdef HAVE_SSE -# include +#ifdef HS_SIMD_INCLUDE +# include HS_SIMD_INCLUDE #endif //#define MF_TOSSER @@ -10525,48 +10525,35 @@ void plDXPipeline::LoadResources() plNetClientApp::StaticDebugMsg("End Device Reload"); } -// Sorry about this, but it really did speed up the skinning. -// Just some macros for the inner loop of IBlendVertsIntoBuffer. -#ifdef HAVE_SSE -# define MATRIXMULTBEGIN(xfm, wgt) \ - __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \ - ALIGN(16) float hack[4]; \ - mc0 = _mm_loadu_ps(xfm.fMap[0]); \ - mc1 = _mm_loadu_ps(xfm.fMap[1]); \ - mc2 = _mm_loadu_ps(xfm.fMap[2]); \ - mwt = _mm_set_ps1(wgt); -# define MATRIXMULTPOINTADD(dst, src) \ - msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \ - _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ - _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ - _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ - \ - hbuf = _mm_hadd_ps(_x, _y); \ - hbuf = _mm_hadd_ps(hbuf, hbuf); \ - _mm_store_ps(hack, hbuf); \ - dst.fX += hack[0]; \ - dst.fY += hack[1]; \ - hbuf = _mm_hadd_ps(_z, _z); \ - hbuf = _mm_hadd_ps(hbuf, hbuf); \ - _mm_store_ps(hack, hbuf); \ - dst.fZ += hack[0]; -# define MATRIXMULTVECTORADD(dst, src) \ - msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \ - _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ - _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ - _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ - \ - hbuf = _mm_hadd_ps(_x, _y); \ - hbuf = _mm_hadd_ps(hbuf, hbuf); \ - _mm_store_ps(hack, hbuf); \ - dst.fX += hack[0]; \ - dst.fY += hack[1]; \ - hbuf = _mm_hadd_ps(_z, _z); \ - hbuf = _mm_hadd_ps(hbuf, hbuf); \ - _mm_store_ps(hack, hbuf); \ - dst.fZ += hack[0]; -#else -# define MATRIXMULTBEGIN(xfm, wgt) \ +// inlTESTPOINT ///////////////////////////////////////// +// Update mins and maxs if destP is outside. +inline void inlTESTPOINT(const hsPoint3& destP, + float& minX, float& minY, float& minZ, + float& maxX, float& maxY, float& maxZ) +{ + if( destP.fX < minX ) + minX = destP.fX; + else if( destP.fX > maxX ) + maxX = destP.fX; + + if( destP.fY < minY ) + minY = destP.fY; + else if( destP.fY > maxY ) + maxY = destP.fY; + + if( destP.fZ < minZ ) + minZ = destP.fZ; + else if( destP.fZ > maxZ ) + maxZ = destP.fZ; +} + +//// IBlendVertsIntoBuffer //////////////////////////////////////////////////// +// Given a pointer into a buffer of verts that have blending data in the D3D +// format, blends them into the destination buffer given without the blending +// info. + +// FPU version +#define MATRIXMULTBEGIN_FPU(xfm, wgt) \ float m00 = xfm.fMap[0][0]; \ float m01 = xfm.fMap[0][1]; \ float m02 = xfm.fMap[0][2]; \ @@ -10581,7 +10568,7 @@ void plDXPipeline::LoadResources() float m23 = xfm.fMap[2][3]; \ float m_wgt = wgt; \ float srcX, srcY, srcZ; -# define MATRIXMULTPOINTADD(dst, src) \ +#define MATRIXMULTPOINTADD_FPU(dst, src) \ srcX = src.fX; \ srcY = src.fY; \ srcZ = src.fZ; \ @@ -10589,7 +10576,7 @@ void plDXPipeline::LoadResources() dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \ dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \ dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt; -# define MATRIXMULTVECTORADD(dst, src) \ +#define MATRIXMULTVECTORADD_FPU(dst, src) \ srcX = src.fX; \ srcY = src.fY; \ srcZ = src.fZ; \ @@ -10597,218 +10584,250 @@ void plDXPipeline::LoadResources() dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \ dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \ dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt; -#endif // HAVE_SSE - -// inlTESTPOINT ///////////////////////////////////////// -// Update mins and maxs if destP is outside. -inline void inlTESTPOINT(const hsPoint3& destP, - float& minX, float& minY, float& minZ, - float& maxX, float& maxY, float& maxZ) -{ - if( destP.fX < minX ) - minX = destP.fX; - else if( destP.fX > maxX ) - maxX = destP.fX; - - if( destP.fY < minY ) - minY = destP.fY; - else if( destP.fY > maxY ) - maxY = destP.fY; - - if( destP.fZ < minZ ) - minZ = destP.fZ; - else if( destP.fZ > maxZ ) - maxZ = destP.fZ; -} - -//// IBlendVertsIntoBuffer //////////////////////////////////////////////////// -// Given a pointer into a buffer of verts that have blending data in the D3D -// format, blends them into the destination buffer given without the blending -// info. - -void plDXPipeline::IBlendVertsIntoBuffer( plSpan* span, - hsMatrix44* matrixPalette, int numMatrices, - const uint8_t *src, uint8_t format, uint32_t srcStride, - uint8_t *dest, uint32_t destStride, uint32_t count, - uint16_t localUVWChans ) -{ - uint8_t numUVs, numWeights; - uint32_t i, j, indices, color, specColor, uvChanSize; - float weights[ 4 ], weightSum; - hsPoint3 pt, tempPt, destPt; - hsVector3 vec, tempNorm, destNorm; - - - /// Get some counts - switch( format & plGBufferGroup::kSkinWeightMask ) - { - case plGBufferGroup::kSkin1Weight: numWeights = 1; break; - case plGBufferGroup::kSkin2Weights: numWeights = 2; break; - case plGBufferGroup::kSkin3Weights: numWeights = 3; break; - default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); - } - - numUVs = plGBufferGroup::CalcNumUVs( format ); - uvChanSize = numUVs * sizeof( float ) * 3; -//#define MF_RECALC_BOUNDS -#ifdef MF_RECALC_BOUNDS - float minX = 1.e33f; - float minY = 1.e33f; - float minZ = 1.e33f; - - float maxX = -1.e33f; - float maxY = -1.e33f; - float maxZ = -1.e33f; -#endif // MF_RECALC_BOUNDS +// SSE3 version +#ifdef HS_SSE3 +#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \ + __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \ + ALIGN(16) float hack[4]; \ + mc0 = _mm_loadu_ps(xfm.fMap[0]); \ + mc1 = _mm_loadu_ps(xfm.fMap[1]); \ + mc2 = _mm_loadu_ps(xfm.fMap[2]); \ + mwt = _mm_set_ps1(wgt); +#define MATRIXMULTPOINTADD_SSE3(dst, src) \ + msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \ + _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ + _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ + _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ + \ + hbuf1 = _mm_hadd_ps(_x, _y); \ + hbuf2 = _mm_hadd_ps(_z, _z); \ + hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ + _mm_store_ps(hack, hbuf1); \ + dst.fX += hack[0]; \ + dst.fY += hack[1]; \ + dst.fZ += hack[2]; +#define MATRIXMULTVECTORADD_SSE3(dst, src) \ + msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \ + _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ + _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ + _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ + \ + hbuf1 = _mm_hadd_ps(_x, _y); \ + hbuf2 = _mm_hadd_ps(_z, _z); \ + hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \ + _mm_store_ps(hack, hbuf1); \ + dst.fX += hack[0]; \ + dst.fY += hack[1]; \ + dst.fZ += hack[2]; +#endif - // localUVWChans is bump mapping tangent space vectors, which need to +// CPU-optimized functions requiring dispatch +hsFunctionDispatcher plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3); + +// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication +#define BLENDVERTSTART \ + uint8_t numUVs, numWeights; \ + uint32_t i, j, indices, color, specColor, uvChanSize; \ + float weights[ 4 ], weightSum; \ + hsPoint3 pt, tempPt, destPt; \ + hsVector3 vec, tempNorm, destNorm; \ + \ + /* Get some counts */\ + switch( format & plGBufferGroup::kSkinWeightMask ) \ + { \ + case plGBufferGroup::kSkin1Weight: numWeights = 1; break; \ + case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \ + case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \ + default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \ + } \ + \ + numUVs = plGBufferGroup::CalcNumUVs( format ); \ + uvChanSize = numUVs * sizeof( float ) * 3; \ + \ + /* localUVWChans is bump mapping tangent space vectors, which need to // be skinned like the normal, as opposed to passed through like // garden variety UVW coordinates. // There are no localUVWChans that I know of in production assets (i.e. - // the avatar is not skinned). - if( !localUVWChans ) - { - /// Copy whilst blending - for( i = 0; i < count; i++ ) - { - // Extract data - src = inlExtractPoint( src, pt ); - for( j = 0, weightSum = 0; j < numWeights; j++ ) - { - src = inlExtractFloat( src, weights[ j ] ); - weightSum += weights[ j ]; - } - weights[ j ] = 1 - weightSum; - - if( format & plGBufferGroup::kSkinIndices ) - { - src = inlExtractUInt32( src, indices ); - } - else - { - indices = 1 << 8; - } - src = inlExtractPoint( src, vec ); - src = inlExtractUInt32( src, color ); - src = inlExtractUInt32( src, specColor ); - - // Blend - destPt.Set( 0, 0, 0 ); - destNorm.Set( 0, 0, 0 ); - for( j = 0; j < numWeights + 1; j++ ) - { - if( weights[ j ] ) + // the avatar is not skinned).*/\ + if( !localUVWChans ) \ + { \ + /* Copy whilst blending */\ + for( i = 0; i < count; i++ ) \ + { \ + /* Extract data */\ + src = inlExtractPoint( src, pt ); \ + for( j = 0, weightSum = 0; j < numWeights; j++ ) \ + { \ + src = inlExtractFloat( src, weights[ j ] ); \ + weightSum += weights[ j ]; \ + } \ + weights[ j ] = 1 - weightSum; \ + \ + if( format & plGBufferGroup::kSkinIndices ) \ + { \ + src = inlExtractUInt32( src, indices ); \ + } \ + else \ + { \ + indices = 1 << 8; \ + } \ + src = inlExtractPoint( src, vec ); \ + src = inlExtractUInt32( src, color ); \ + src = inlExtractUInt32( src, specColor ); \ + \ + /* Blend */\ + destPt.Set( 0, 0, 0 ); \ + destNorm.Set( 0, 0, 0 ); \ + for( j = 0; j < numWeights + 1; j++ ) \ + { \ + if( weights[ j ] ) \ { + /* MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); MATRIXMULTPOINTADD(destPt, pt); MATRIXMULTVECTORADD(destNorm, vec); - } - - indices >>= 8; - } - // Probably don't really need to renormalize this. There errors are - // going to be subtle and "smooth". -// hsFastMath::NormalizeAppr(destNorm); - -#ifdef MF_RECALC_BOUNDS - inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ); -#endif // MF_RECALC_BOUNDS - - // Slam data into position now - dest = inlStuffPoint( dest, destPt ); - dest = inlStuffPoint( dest, destNorm ); - dest = inlStuffUInt32( dest, color ); - dest = inlStuffUInt32( dest, specColor ); - memcpy( dest, src, uvChanSize ); - src += uvChanSize; - dest += uvChanSize; - } - } - else - { - uint8_t hiChan = localUVWChans >> 8; - uint8_t loChan = localUVWChans & 0xff; - /// Copy whilst blending - for( i = 0; i < count; i++ ) - { - hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; - hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; - - // Extract data - src = inlExtractPoint( src, pt ); - for( j = 0, weightSum = 0; j < numWeights; j++ ) - { - src = inlExtractFloat( src, weights[ j ] ); - weightSum += weights[ j ]; - } - weights[ j ] = 1 - weightSum; - - if( format & plGBufferGroup::kSkinIndices ) - { - src = inlExtractUInt32( src, indices ); - } - else - { - indices = 1 << 8; - } - - src = inlExtractPoint( src, vec ); - src = inlExtractUInt32( src, color ); - src = inlExtractUInt32( src, specColor ); - - uint8_t k; - for( k = 0; k < numUVs; k++ ) - { - src = inlExtractPoint( src, srcUVWs[k] ); - } - memcpy( dstUVWs, srcUVWs, uvChanSize); - dstUVWs[loChan].Set(0,0,0); - dstUVWs[hiChan].Set(0,0,0); - - // Blend - destPt.Set( 0, 0, 0 ); - destNorm.Set( 0, 0, 0 ); - for( j = 0; j < numWeights + 1; j++ ) - { - if( weights[ j ] ) - { + */ +#define BLENDVERTMID \ + } \ + \ + indices >>= 8; \ + } \ + /* Probably don't really need to renormalize this. There errors are + // going to be subtle and "smooth".*/\ + /* hsFastMath::NormalizeAppr(destNorm);*/ \ + \ + /* Slam data into position now */\ + dest = inlStuffPoint( dest, destPt ); \ + dest = inlStuffPoint( dest, destNorm ); \ + dest = inlStuffUInt32( dest, color ); \ + dest = inlStuffUInt32( dest, specColor ); \ + memcpy( dest, src, uvChanSize ); \ + src += uvChanSize; \ + dest += uvChanSize; \ + } \ + } \ + else \ + { \ + uint8_t hiChan = localUVWChans >> 8; \ + uint8_t loChan = localUVWChans & 0xff; \ + /* Copy whilst blending */\ + for( i = 0; i < count; i++ ) \ + { \ + hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \ + hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \ + \ + /* Extract data */\ + src = inlExtractPoint( src, pt ); \ + for( j = 0, weightSum = 0; j < numWeights; j++ ) \ + { \ + src = inlExtractFloat( src, weights[ j ] ); \ + weightSum += weights[ j ]; \ + } \ + weights[ j ] = 1 - weightSum; \ + \ + if( format & plGBufferGroup::kSkinIndices ) \ + { \ + src = inlExtractUInt32( src, indices ); \ + } \ + else \ + { \ + indices = 1 << 8; \ + } \ + \ + src = inlExtractPoint( src, vec ); \ + src = inlExtractUInt32( src, color ); \ + src = inlExtractUInt32( src, specColor ); \ + \ + uint8_t k; \ + for( k = 0; k < numUVs; k++ ) \ + { \ + src = inlExtractPoint( src, srcUVWs[k] ); \ + } \ + memcpy( dstUVWs, srcUVWs, uvChanSize); \ + dstUVWs[loChan].Set(0,0,0); \ + dstUVWs[hiChan].Set(0,0,0); \ + \ + /* Blend */\ + destPt.Set( 0, 0, 0 ); \ + destNorm.Set( 0, 0, 0 ); \ + for( j = 0; j < numWeights + 1; j++ ) \ + { \ + if( weights[ j ] ) \ + { \ + /* MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); MATRIXMULTPOINTADD(destPt, pt); MATRIXMULTVECTORADD(destNorm, vec); MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]); MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]); - } - - indices >>= 8; - } - // Probably don't really need to renormalize this. There errors are - // going to be subtle and "smooth". -// hsFastMath::NormalizeAppr(destNorm); -// hsFastMath::NormalizeAppr(dstUVWs[loChan]); -// hsFastMath::NormalizeAppr(dstUVWs[hiChan]); - -#ifdef MF_RECALC_BOUNDS - inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ); -#endif // MF_RECALC_BOUNDS - - // Slam data into position now - dest = inlStuffPoint( dest, destPt ); - dest = inlStuffPoint( dest, destNorm ); - dest = inlStuffUInt32( dest, color ); - dest = inlStuffUInt32( dest, specColor ); - memcpy( dest, dstUVWs, uvChanSize ); - dest += uvChanSize; - } - } -#ifdef MF_RECALC_BOUNDS - hsBounds3Ext wBnd; - wBnd.Reset(&hsPoint3(minX, minY, minZ)); - wBnd.Union(&hsPoint3(maxX, maxY, maxZ)); - span->fWorldBounds = wBnd; -#endif // MF_RECALC_BOUNDS + */ +#define BLENDVERTEND \ + } \ + \ + indices >>= 8; \ + } \ + /* Probably don't really need to renormalize this. There errors are + // going to be subtle and "smooth". */\ + /* hsFastMath::NormalizeAppr(destNorm); */\ + /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\ + /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\ + \ + /* Slam data into position now */\ + dest = inlStuffPoint( dest, destPt ); \ + dest = inlStuffPoint( dest, destNorm ); \ + dest = inlStuffUInt32( dest, color ); \ + dest = inlStuffUInt32( dest, specColor ); \ + memcpy( dest, dstUVWs, uvChanSize ); \ + dest += uvChanSize; \ + } \ + } + +void plDXPipeline::blend_vert_buffer_fpu( plSpan* span, + hsMatrix44* matrixPalette, int numMatrices, + const uint8_t *src, uint8_t format, uint32_t srcStride, + uint8_t *dest, uint32_t destStride, uint32_t count, + uint16_t localUVWChans ) +{ + BLENDVERTSTART + MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); + + MATRIXMULTPOINTADD_FPU(destPt, pt); + MATRIXMULTVECTORADD_FPU(destNorm, vec); + BLENDVERTMID + MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]); + + MATRIXMULTPOINTADD_FPU(destPt, pt); + MATRIXMULTVECTORADD_FPU(destNorm, vec); + MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]); + MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]); + + BLENDVERTEND +} + +void plDXPipeline::blend_vert_buffer_sse3( plSpan* span, + hsMatrix44* matrixPalette, int numMatrices, + const uint8_t *src, uint8_t format, uint32_t srcStride, + uint8_t *dest, uint32_t destStride, uint32_t count, + uint16_t localUVWChans ) +{ +#ifdef HS_SSE3 + BLENDVERTSTART + MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); + + MATRIXMULTPOINTADD_SSE3(destPt, pt); + MATRIXMULTVECTORADD_SSE3(destNorm, vec); + BLENDVERTMID + MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]); + + MATRIXMULTPOINTADD_SSE3(destPt, pt); + MATRIXMULTVECTORADD_SSE3(destNorm, vec); + MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]); + MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]); + BLENDVERTEND +#endif // HS_SSE3 } // ISetPipeConsts ////////////////////////////////////////////////////////////////// diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h index cdc4e2e4..1d83fce9 100644 --- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h +++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h @@ -465,7 +465,8 @@ protected: void IBlendVertsIntoBuffer( plSpan* span, hsMatrix44* matrixPalette, int numMatrices, const uint8_t *src, uint8_t format, uint32_t srcStride, - uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans ); + uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans ) + { blend_vert_buffer.call(span, matrixPalette, numMatrices, src, format, srcStride, dest, destStride, count, localUVWChans); }; hsBool ISoftwareVertexBlend( plDrawableSpans* drawable, const hsTArray& visList ); @@ -734,7 +735,7 @@ public: virtual void GetDepth(float& hither, float& yon) const; virtual void SetDepth(float hither, float yon); - virtual float GetZBiasScale() const; + virtual float GetZBiasScale() const; virtual void SetZBiasScale(float scale); virtual const hsMatrix44& GetWorldToCamera() const; @@ -798,6 +799,13 @@ public: virtual int GetMaxAnisotropicSamples(); virtual int GetMaxAntiAlias(int Width, int Height, int ColorDepth); + + // CPU-optimized functions +protected: + typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); + static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); + static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t); + static hsFunctionDispatcher blend_vert_buffer; };