Browse Source

Simple SSE3 skinning algorithm

Future direction: do lots of work such that hsMatrix44 and hsScalarTriple
are 16-byte aligned so that we can use faster aligned loads
Adam Johnson 13 years ago
parent
commit
bb47f83cf3
  1. 6
      CMakeLists.txt
  2. 28
      Sources/Plasma/Apps/plClient/winmain.cpp
  3. 6
      Sources/Plasma/CoreLib/hsUtils.h
  4. 80
      Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

6
CMakeLists.txt

@ -84,6 +84,12 @@ if(MSVC)
add_definitions(-D_SCL_SECURE_NO_WARNINGS) add_definitions(-D_SCL_SECURE_NO_WARNINGS)
endif(MSVC) endif(MSVC)
# TODO: Maybe some kind of automated test here?
# PLASMA_USE_SSE is a user-facing build option that defaults to ON. When it is
# enabled, -DHAVE_SSE is added to the compile definitions; the C++ sources test
# that symbol with #ifdef to decide whether to build their SSE code paths.
option(PLASMA_USE_SSE "Enable SSE optimizations?" ON)
if(PLASMA_USE_SSE)
add_definitions(-DHAVE_SSE)
endif(PLASMA_USE_SSE)
#TODO: Make the OpenSSL includes less promiscuous so this isn't needed #TODO: Make the OpenSSL includes less promiscuous so this isn't needed
include_directories(${OPENSSL_INCLUDE_DIR}) include_directories(${OPENSSL_INCLUDE_DIR})

28
Sources/Plasma/Apps/plClient/winmain.cpp

@ -49,6 +49,10 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include <dmdfm.h> // Windows Load EXE into memory suff #include <dmdfm.h> // Windows Load EXE into memory suff
#endif #endif
#ifdef HAVE_SSE
# include <intrin.h>
#endif
#include <curl/curl.h> #include <curl/curl.h>
#include "HeadSpin.h" #include "HeadSpin.h"
@ -1441,11 +1445,35 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except
return EXCEPTION_EXECUTE_HANDLER; return EXCEPTION_EXECUTE_HANDLER;
} }
/**
 * Reports whether the host CPU provides every instruction-set feature that
 * this build was compiled to use.
 *
 * When HAVE_SSE is defined, the SSE3 feature bit returned by CPUID is checked.
 * When it is not defined, no features are required and the function always
 * succeeds.
 *
 * @return true if the CPU is usable, false if a required feature is missing.
 */
bool CheckCPU()
{
#ifdef HAVE_SSE
    // CPUID leaf 1 reports SSE3 support in bit 0 of ECX (cpu_info[2]).
    const unsigned int sse3_flag = 0x00000001;
    // (any other CPU features...)

    // The query lives inside the guard because <intrin.h>, which declares
    // __cpuid, is only included when HAVE_SSE is defined.
    int cpu_info[4];
    __cpuid(cpu_info, 1);

    // The parentheses are required: `==` binds tighter than `&`, so the
    // unparenthesized form evaluates `cpu_info[2] & (sse3_flag == 0)`, which
    // is always zero and would let CPUs without SSE3 pass the check.
    if ((cpu_info[2] & sse3_flag) == 0)
        return false;
#endif

    // Insert additional feature checks here
    return true;
}
#include "pfConsoleCore/pfConsoleEngine.h" #include "pfConsoleCore/pfConsoleEngine.h"
PF_CONSOLE_LINK_ALL() PF_CONSOLE_LINK_ALL()
int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow) int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow)
{ {
// Check to make sure we have a good CPU before getting started
if (!CheckCPU())
{
plString msg = plString::Format("Your processor does not support all of the features required to play %S", ProductLongName());
hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError);
return PARABLE_NORMAL_EXIT;
}
PF_CONSOLE_INIT_ALL() PF_CONSOLE_INIT_ALL()
// Set global handle // Set global handle

6
Sources/Plasma/CoreLib/hsUtils.h

@ -177,6 +177,12 @@ inline float hsRadiansToDegrees(float rad) { return float(rad * (180 / M_PI)); }
#include <new> #include <new>
#define NEWZERO(t) new(calloc(sizeof(t), 1)) t #define NEWZERO(t) new(calloc(sizeof(t), 1)) t
// ALIGN(n): prefix a declaration with this to request n-byte alignment,
// e.g. `ALIGN(16) float buf[4];`. MSVC spells this __declspec(align(n));
// GCC-compatible compilers use __attribute__((aligned(n))) -- note the
// mandatory double parentheses around the attribute list.
#ifdef _MSC_VER
#   define ALIGN(n) __declspec(align(n))
#else
#   define ALIGN(n) __attribute__((aligned(n)))
#endif
///////////////////////////// /////////////////////////////
// Physical memory functions // Physical memory functions
///////////////////////////// /////////////////////////////

80
Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

@ -163,6 +163,10 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include <algorithm> #include <algorithm>
#ifdef HAVE_SSE
# include <smmintrin.h>
#endif
//#define MF_TOSSER //#define MF_TOSSER
int mfCurrentTest = 100; int mfCurrentTest = 100;
@ -10523,23 +10527,61 @@ void plDXPipeline::LoadResources()
// Sorry about this, but it really did speed up the skinning. // Sorry about this, but it really did speed up the skinning.
// Just some macros for the inner loop of IBlendVertsIntoBuffer. // Just some macros for the inner loop of IBlendVertsIntoBuffer.
#define MATRIXMULTBEGIN(xfm, wgt) \ #ifdef HAVE_SSE
register float m00 = xfm.fMap[0][0]; \ # define MATRIXMULTBEGIN(xfm, wgt) \
register float m01 = xfm.fMap[0][1]; \ __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \
register float m02 = xfm.fMap[0][2]; \ ALIGN(16) float hack[4]; \
register float m03 = xfm.fMap[0][3]; \ mc0 = _mm_loadu_ps(xfm.fMap[0]); \
register float m10 = xfm.fMap[1][0]; \ mc1 = _mm_loadu_ps(xfm.fMap[1]); \
register float m11 = xfm.fMap[1][1]; \ mc2 = _mm_loadu_ps(xfm.fMap[2]); \
register float m12 = xfm.fMap[1][2]; \ mwt = _mm_set_ps1(wgt);
register float m13 = xfm.fMap[1][3]; \ # define MATRIXMULTPOINTADD(dst, src) \
register float m20 = xfm.fMap[2][0]; \ msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
register float m21 = xfm.fMap[2][1]; \ _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
register float m22 = xfm.fMap[2][2]; \ _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
register float m23 = xfm.fMap[2][3]; \ _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
register float m_wgt = wgt; \ \
register float srcX, srcY, srcZ; hbuf = _mm_hadd_ps(_x, _y); \
hbuf = _mm_hadd_ps(hbuf, hbuf); \
#define MATRIXMULTPOINTADD(dst, src) \ _mm_store_ps(hack, hbuf); \
dst.fX += hack[0]; \
dst.fY += hack[1]; \
hbuf = _mm_hadd_ps(_z, _z); \
hbuf = _mm_hadd_ps(hbuf, hbuf); \
_mm_store_ps(hack, hbuf); \
dst.fZ += hack[0];
// SSE3 variant: accumulates the weighted product of the upper-left 3x3 of the
// matrix loaded by MATRIXMULTBEGIN with the direction vector `src` into `dst`.
// Relies on the locals MATRIXMULTBEGIN declares (mc0..mc2, mwt, msr, _x, _y,
// _z, hbuf, hack). The fourth lane of msr is 0, so the translation column of
// each matrix row does not contribute.
//
// Reduction: hadd(_x,_y) = (x0+x1, x2+x3, y0+y1, y2+y3) and
// hadd(_z,_z) = (z0+z1, z2+z3, ...); one more hadd of those two yields
// (sumX, sumY, sumZ, sumZ). Each sum is grouped as (a0+a1)+(a2+a3), the same
// association as reducing X/Y and Z separately, so results are bit-identical
// while using three horizontal adds and a single aligned store.
#   define MATRIXMULTVECTORADD(dst, src) \
        msr = _mm_set_ps(0.f, (src).fZ, (src).fY, (src).fX); \
        _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
        _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
        _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
        \
        hbuf = _mm_hadd_ps(_mm_hadd_ps(_x, _y), _mm_hadd_ps(_z, _z)); \
        _mm_store_ps(hack, hbuf); \
        (dst).fX += hack[0]; \
        (dst).fY += hack[1]; \
        (dst).fZ += hack[2];
#else
// Scalar fallback (no HAVE_SSE): copies the first three rows of xfm.fMap and
// the blend weight into locals (m00..m23, m_wgt) and declares the srcX/srcY/
// srcZ scratch variables that the MATRIXMULT*ADD macros use. Must be expanded
// at most once per scope, since it declares variables.
// Both arguments are parenthesized so that expression arguments such as
// `*matrixPtr` or `a + b` expand with the intended precedence.
#   define MATRIXMULTBEGIN(xfm, wgt) \
        float m00 = (xfm).fMap[0][0]; \
        float m01 = (xfm).fMap[0][1]; \
        float m02 = (xfm).fMap[0][2]; \
        float m03 = (xfm).fMap[0][3]; \
        float m10 = (xfm).fMap[1][0]; \
        float m11 = (xfm).fMap[1][1]; \
        float m12 = (xfm).fMap[1][2]; \
        float m13 = (xfm).fMap[1][3]; \
        float m20 = (xfm).fMap[2][0]; \
        float m21 = (xfm).fMap[2][1]; \
        float m22 = (xfm).fMap[2][2]; \
        float m23 = (xfm).fMap[2][3]; \
        float m_wgt = (wgt); \
        float srcX, srcY, srcZ;
# define MATRIXMULTPOINTADD(dst, src) \
srcX = src.fX; \ srcX = src.fX; \
srcY = src.fY; \ srcY = src.fY; \
srcZ = src.fZ; \ srcZ = src.fZ; \
@ -10547,8 +10589,7 @@ void plDXPipeline::LoadResources()
dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \ dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \ dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt; dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
# define MATRIXMULTVECTORADD(dst, src) \
#define MATRIXMULTVECTORADD(dst, src) \
srcX = src.fX; \ srcX = src.fX; \
srcY = src.fY; \ srcY = src.fY; \
srcZ = src.fZ; \ srcZ = src.fZ; \
@ -10556,6 +10597,7 @@ void plDXPipeline::LoadResources()
dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \ dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \ dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt; dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
#endif // HAVE_SSE
// inlTESTPOINT ///////////////////////////////////////// // inlTESTPOINT /////////////////////////////////////////
// Update mins and maxs if destP is outside. // Update mins and maxs if destP is outside.

Loading…
Cancel
Save