Browse Source

Merge pull request #202 from Deledrius/hsCpuID

Add CPU feature detection and function dispatcher
Branan Purvine-Riley 13 years ago
parent
commit
bde67949ac
  1. 6
      CMakeLists.txt
  2. 28
      Sources/Plasma/Apps/plClient/winmain.cpp
  3. 2
      Sources/Plasma/CoreLib/CMakeLists.txt
  4. 71
      Sources/Plasma/CoreLib/hsCpuID.cpp
  5. 182
      Sources/Plasma/CoreLib/hsCpuID.h
  6. 85
      Sources/Plasma/CoreLib/hsMatrix44.cpp
  7. 9
      Sources/Plasma/CoreLib/hsMatrix44.h
  8. 7
      Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
  9. 8
      Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
  10. 495
      Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
  11. 10
      Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h

6
CMakeLists.txt

@ -84,12 +84,6 @@ if(MSVC)
add_definitions(-D_SCL_SECURE_NO_WARNINGS) add_definitions(-D_SCL_SECURE_NO_WARNINGS)
endif(MSVC) endif(MSVC)
# TODO: Maybe some kind of automated test here?
option(PLASMA_USE_SSE "Enable SSE optimizations?" ON)
if(PLASMA_USE_SSE)
add_definitions(-DHAVE_SSE)
endif(PLASMA_USE_SSE)
#TODO: Make the OpenSSL includes less promiscuous so this isn't needed #TODO: Make the OpenSSL includes less promiscuous so this isn't needed
include_directories(${OPENSSL_INCLUDE_DIR}) include_directories(${OPENSSL_INCLUDE_DIR})

28
Sources/Plasma/Apps/plClient/winmain.cpp

@ -49,10 +49,6 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include <dmdfm.h> // Windows Load EXE into memory suff #include <dmdfm.h> // Windows Load EXE into memory suff
#endif #endif
#ifdef HAVE_SSE
# include <intrin.h>
#endif
#include <curl/curl.h> #include <curl/curl.h>
#include "HeadSpin.h" #include "HeadSpin.h"
@ -1388,35 +1384,11 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except
} }
#endif #endif
// Startup check that the host CPU provides every instruction-set extension
// this build was compiled to rely on.
//
// Returns true when the CPU is usable, false when a required feature is
// missing (the caller is expected to report the problem and exit).
bool CheckCPU()
{
#ifdef HAVE_SSE
    // The CPUID query lives inside this guard because <intrin.h> (which
    // declares __cpuid) is only included when HAVE_SSE is defined; calling
    // it unconditionally breaks builds configured without SSE.
    const unsigned int sse3_flag = 0x00000001;
    // (any other CPU features...)

    // __cpuid fills the array with EAX, EBX, ECX, EDX for the requested leaf;
    // the SSE3 bit is tested in element 2 (ECX) of leaf 1.
    int cpu_info[4];
    __cpuid(cpu_info, 1);

    if ((cpu_info[2] & sse3_flag) == 0)
        return false;
#endif

    // Insert additional feature checks here

    return true;
}
#include "pfConsoleCore/pfConsoleEngine.h" #include "pfConsoleCore/pfConsoleEngine.h"
PF_CONSOLE_LINK_ALL() PF_CONSOLE_LINK_ALL()
int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow) int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow)
{ {
// Check to make sure we have a good CPU before getting started
if (!CheckCPU())
{
plString msg = plString::Format("Your processor does not support all of the features required to play %S.", ProductLongName());
hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError);
return PARABLE_NORMAL_EXIT;
}
PF_CONSOLE_INIT_ALL() PF_CONSOLE_INIT_ALL()
// Set global handle // Set global handle

2
Sources/Plasma/CoreLib/CMakeLists.txt

@ -15,6 +15,7 @@ set(CoreLib_SOURCES
HeadSpin.cpp HeadSpin.cpp
hsBitVector.cpp hsBitVector.cpp
hsBounds.cpp hsBounds.cpp
hsCpuID.cpp
hsCritSect.cpp hsCritSect.cpp
hsExceptionStack.cpp hsExceptionStack.cpp
hsFastMath.cpp hsFastMath.cpp
@ -57,6 +58,7 @@ set(CoreLib_HEADERS
hsBitVector.h hsBitVector.h
hsBounds.h hsBounds.h
hsColorRGBA.h hsColorRGBA.h
hsCpuID.h
hsCritSect.h hsCritSect.h
hsExceptions.h hsExceptions.h
hsFastMath.h hsFastMath.h

71
Sources/Plasma/CoreLib/hsCpuID.cpp

@ -0,0 +1,71 @@
/*==LICENSE==*
CyanWorlds.com Engine - MMOG client, server and tools
Copyright (C) 2011 Cyan Worlds, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Additional permissions under GNU GPL version 3 section 7
If you modify this Program, or any covered work, by linking or
combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
(or a modified version of those libraries),
containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
licensors of this Program grant you additional
permission to convey the resulting work. Corresponding Source for a
non-source form of such a combination shall include the source code for
the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
work.
You can contact Cyan Worlds, Inc. by email legal@cyan.com
or by snail mail at:
Cyan Worlds, Inc.
14617 N Newport Hwy
Mead, WA 99021
*==LICENSE==*/
#include <intrin.h>
#if defined(_MSC_VER) && _MSC_VER >= 1600
#   include <immintrin.h>
#endif
#include "hsCpuID.h"
// Queries CPUID leaf 1 and records which SIMD instruction sets may be used
// at run time.
//
// The SSE-family flags mirror the CPUID feature bits directly.  AVX is
// stricter: the CPU bit only says the instructions exist, while the operating
// system must additionally have enabled XSAVE and the XMM/YMM register state,
// otherwise executing an AVX instruction faults.  has_avx is therefore only
// set when all three conditions hold.
hsCpuId::hsCpuId() {
    // Feature bits reported in EDX (regs[3]).
    const unsigned int sse1_flag    = 1u << 25;
    const unsigned int sse2_flag    = 1u << 26;

    // Feature bits reported in ECX (regs[2]).
    const unsigned int sse3_flag    = 1u << 0;
    const unsigned int ssse3_flag   = 1u << 9;
    const unsigned int sse41_flag   = 1u << 19;
    const unsigned int sse42_flag   = 1u << 20;
    const unsigned int osxsave_flag = 1u << 27;
    const unsigned int avx_flag     = 1u << 28;

    // __cpuid writes EAX, EBX, ECX, EDX (in that order) for the given leaf.
    int regs[4];
    __cpuid(regs, 1);
    const unsigned int ecx = static_cast<unsigned int>(regs[2]);
    const unsigned int edx = static_cast<unsigned int>(regs[3]);

    has_sse1  = (edx & sse1_flag)  != 0;
    has_sse2  = (edx & sse2_flag)  != 0;
    has_sse3  = (ecx & sse3_flag)  != 0;
    has_ssse3 = (ecx & ssse3_flag) != 0;
    has_sse41 = (ecx & sse41_flag) != 0;
    has_sse42 = (ecx & sse42_flag) != 0;

    // Stay conservative: AVX is only reported once OS support is confirmed.
    has_avx = false;
    if ((ecx & avx_flag) != 0 && (ecx & osxsave_flag) != 0) {
#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219
        // _xgetbv is available from Visual C++ 2010 SP1 onwards.  XCR0 bits 1
        // and 2 indicate that the OS saves/restores XMM and YMM state.
        const unsigned __int64 xcr0 = _xgetbv(0);
        has_avx = (xcr0 & 0x6) == 0x6;
#endif
    }
}
// Accessor for the shared CPU feature table.  The table is a function-local
// static, so detection runs the first time this is called and the same object
// is returned on every later call.
const hsCpuId& hsCpuId::instance()
{
    static const hsCpuId detected;
    return detected;
}

182
Sources/Plasma/CoreLib/hsCpuID.h

@ -0,0 +1,182 @@
/*==LICENSE==*
CyanWorlds.com Engine - MMOG client, server and tools
Copyright (C) 2011 Cyan Worlds, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Additional permissions under GNU GPL version 3 section 7
If you modify this Program, or any covered work, by linking or
combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
(or a modified version of those libraries),
containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
licensors of this Program grant you additional
permission to convey the resulting work. Corresponding Source for a
non-source form of such a combination shall include the source code for
the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
work.
You can contact Cyan Worlds, Inc. by email legal@cyan.com
or by snail mail at:
Cyan Worlds, Inc.
14617 N Newport Hwy
Mead, WA 99021
*==LICENSE==*/
//////////////////////////////////////////////////////////////////////
//
// hsCpuID - Processor feature detection and function dispatcher
//
//
// == Example Usage ==
//
// #ifdef HS_SIMD_INCLUDE
// # include HS_SIMD_INCLUDE
// #endif
//
// float my_func_fpu() {
// ...
// }
//
// float my_func_avx() {
// #ifdef HS_AVX
// ...
// #endif
// }
//
//
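// // In the declaration of the owning class (called MyClass here):
// typedef float(*func_ptr)();
// static hsFunctionDispatcher<func_ptr> my_func;
//
// // In the implementation file (arguments: fpu, sse1, sse2, sse3, ssse3, sse41, sse42, avx):
// hsFunctionDispatcher<MyClass::func_ptr> MyClass::my_func(MyClass::my_func_fpu, 0, 0, 0, 0, 0, 0, MyClass::my_func_avx);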
//
//////////////////////////////////////////////////////////////////////
#ifndef hsCpuID_inc
#define hsCpuID_inc
// Compile-time SIMD availability.
//
// HS_<LEVEL> is defined when the compiler is able to build code for that
// instruction set: either the GCC/Clang-style feature macro is set, or the
// MSVC version is recent enough (1600 = VS2010, 1400 = VS2005,
// 1300 = VS .NET 2002).  These macros say nothing about the CPU the program
// eventually runs on; that is what hsCpuId is for.
//
// HS_SIMD_INCLUDE names the intrinsics header of the highest level enabled.
// The levels are listed from newest to oldest and each one only sets the
// header if no newer level has already done so.
#if defined(__AVX__) || (defined(_MSC_VER) && _MSC_VER >= 1600)
#   define HS_AVX
#   if !defined(HS_SIMD_INCLUDE)
#       define HS_SIMD_INCLUDE "immintrin.h"
#   endif
#endif

#if defined(__SSE4_2__) || (defined(_MSC_VER) && _MSC_VER >= 1600)
#   define HS_SSE42
#   if !defined(HS_SIMD_INCLUDE)
#       define HS_SIMD_INCLUDE "nmmintrin.h"
#   endif
#endif

#if defined(__SSE4_1__) || (defined(_MSC_VER) && _MSC_VER >= 1600)
#   define HS_SSE41
#   if !defined(HS_SIMD_INCLUDE)
#       define HS_SIMD_INCLUDE "smmintrin.h"
#   endif
#endif

#if defined(__SSSE3__) || (defined(_MSC_VER) && _MSC_VER >= 1600)
#   define HS_SSSE3
#   if !defined(HS_SIMD_INCLUDE)
#       define HS_SIMD_INCLUDE "tmmintrin.h"
#   endif
#endif

#if defined(__SSE3__) || (defined(_MSC_VER) && _MSC_VER >= 1400)
#   define HS_SSE3
#   if !defined(HS_SIMD_INCLUDE)
#       define HS_SIMD_INCLUDE "pmmintrin.h"
#   endif
#endif

#if defined(__SSE2__) || (defined(_MSC_VER) && _MSC_VER >= 1300)
#   define HS_SSE2
#   if !defined(HS_SIMD_INCLUDE)
#       define HS_SIMD_INCLUDE "emmintrin.h"
#   endif
#endif

#if defined(__SSE__) || (defined(_MSC_VER) && _MSC_VER >= 1300)
#   define HS_SSE1
#   if !defined(HS_SIMD_INCLUDE)
#       define HS_SIMD_INCLUDE "xmmintrin.h"
#   endif
#endif
// Run-time CPU feature flags.
// Every flag is filled in by the constructor from CPUID leaf 1 (see
// hsCpuID.cpp).  Code normally reads the shared table through instance()
// rather than constructing its own copy.
struct hsCpuId {
// SSE and SSE2 (tested in the EDX feature word).
bool has_sse1;
bool has_sse2;
// SSE3, SSSE3, SSE4.1 and SSE4.2 (tested in the ECX feature word).
bool has_sse3;
bool has_ssse3;
bool has_sse41;
bool has_sse42;
// AVX (tested in the ECX feature word).
bool has_avx;
// Runs the CPUID query and sets all of the flags above.
hsCpuId();
// Returns a reference to a single function-local static hsCpuId.
static const hsCpuId& instance();
};
// Chooses, once at construction time, the best available implementation of a
// routine for the CPU the program is running on.
//
// func_ptr is a function-pointer type.  The constructor takes one pointer per
// instruction-set level; only the plain FPU version is mandatory, the others
// default to 0 meaning "no specialised version".  A level is eligible when
//   * the build can compile it (the matching HS_* macro is defined),
//   * hsCpuId reports the CPU supports it, and
//   * a non-null pointer was supplied for it.
// The most advanced eligible level wins (AVX > SSE4.2 > SSE4.1 > SSSE3 >
// SSE3 > SSE2 > SSE1 > FPU) and is stored in `call`, which users invoke
// directly, e.g. `dispatcher.call(args...)`.
//
// NOTE(review): `call` is only meaningful after the constructor has run.
// Dispatchers are presumably defined as static objects, so using one from
// another translation unit's static initialiser may observe an unset pointer
// -- confirm against the callers.
template <typename func_ptr>
struct hsFunctionDispatcher {
    hsFunctionDispatcher(func_ptr fpu, func_ptr sse1=0, func_ptr sse2=0, func_ptr sse3=0, func_ptr ssse3=0, func_ptr sse41=0, func_ptr sse42=0, func_ptr avx=0) {
        hsAssert(fpu, "FPU fallback function required.");
        const hsCpuId& features = hsCpuId::instance();

        // Begin with the mandatory fallback, then walk the levels from the
        // oldest to the newest so that the last successful assignment is the
        // most capable implementation available.
        call = fpu;
#ifdef HS_SSE1
        if (features.has_sse1 && sse1)
            call = sse1;
#endif
#ifdef HS_SSE2
        if (features.has_sse2 && sse2)
            call = sse2;
#endif
#ifdef HS_SSE3
        if (features.has_sse3 && sse3)
            call = sse3;
#endif
#ifdef HS_SSSE3
        if (features.has_ssse3 && ssse3)
            call = ssse3;
#endif
#ifdef HS_SSE41
        if (features.has_sse41 && sse41)
            call = sse41;
#endif
#ifdef HS_SSE42
        if (features.has_sse42 && sse42)
            call = sse42;
#endif
#ifdef HS_AVX
        if (features.has_avx && avx)
            call = avx;
#endif
    }

    // The implementation selected by the constructor.
    func_ptr call;
};
#endif // hsCpuID_inc

85
Sources/Plasma/CoreLib/hsMatrix44.cpp

@ -47,13 +47,16 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include "hsStream.h" #include "hsStream.h"
#include <math.h> #include <math.h>
#ifdef HAVE_SSE #ifdef HS_SIMD_INCLUDE
# include <smmintrin.h> # include HS_SIMD_INCLUDE
#endif #endif
static hsMatrix44 myIdent = hsMatrix44().Reset(); static hsMatrix44 myIdent = hsMatrix44().Reset();
const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; } const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; }
// CPU-optimized functions requiring dispatch
hsFunctionDispatcher<hsMatrix44::mat_mult_ptr> hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3);
/* /*
For the rotation: For the rotation:
¦ 2 2 ¦ ¦ 2 2 ¦
@ -96,9 +99,47 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
rotate.QuatFromMatrix44(*this); rotate.QuatFromMatrix44(*this);
} }
#ifdef HAVE_SSE hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
{
hsMatrix44 c;
if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
{
c.Reset();
return c;
}
if( a.fFlags & hsMatrix44::kIsIdent )
return b;
if( b.fFlags & hsMatrix44::kIsIdent )
return a;
c.fMap[0][0] = (a.fMap[0][0] * b.fMap[0][0]) + (a.fMap[0][1] * b.fMap[1][0]) + (a.fMap[0][2] * b.fMap[2][0]) + (a.fMap[0][3] * b.fMap[3][0]);
c.fMap[0][1] = (a.fMap[0][0] * b.fMap[0][1]) + (a.fMap[0][1] * b.fMap[1][1]) + (a.fMap[0][2] * b.fMap[2][1]) + (a.fMap[0][3] * b.fMap[3][1]);
c.fMap[0][2] = (a.fMap[0][0] * b.fMap[0][2]) + (a.fMap[0][1] * b.fMap[1][2]) + (a.fMap[0][2] * b.fMap[2][2]) + (a.fMap[0][3] * b.fMap[3][2]);
c.fMap[0][3] = (a.fMap[0][0] * b.fMap[0][3]) + (a.fMap[0][1] * b.fMap[1][3]) + (a.fMap[0][2] * b.fMap[2][3]) + (a.fMap[0][3] * b.fMap[3][3]);
c.fMap[1][0] = (a.fMap[1][0] * b.fMap[0][0]) + (a.fMap[1][1] * b.fMap[1][0]) + (a.fMap[1][2] * b.fMap[2][0]) + (a.fMap[1][3] * b.fMap[3][0]);
c.fMap[1][1] = (a.fMap[1][0] * b.fMap[0][1]) + (a.fMap[1][1] * b.fMap[1][1]) + (a.fMap[1][2] * b.fMap[2][1]) + (a.fMap[1][3] * b.fMap[3][1]);
c.fMap[1][2] = (a.fMap[1][0] * b.fMap[0][2]) + (a.fMap[1][1] * b.fMap[1][2]) + (a.fMap[1][2] * b.fMap[2][2]) + (a.fMap[1][3] * b.fMap[3][2]);
c.fMap[1][3] = (a.fMap[1][0] * b.fMap[0][3]) + (a.fMap[1][1] * b.fMap[1][3]) + (a.fMap[1][2] * b.fMap[2][3]) + (a.fMap[1][3] * b.fMap[3][3]);
c.fMap[2][0] = (a.fMap[2][0] * b.fMap[0][0]) + (a.fMap[2][1] * b.fMap[1][0]) + (a.fMap[2][2] * b.fMap[2][0]) + (a.fMap[2][3] * b.fMap[3][0]);
c.fMap[2][1] = (a.fMap[2][0] * b.fMap[0][1]) + (a.fMap[2][1] * b.fMap[1][1]) + (a.fMap[2][2] * b.fMap[2][1]) + (a.fMap[2][3] * b.fMap[3][1]);
c.fMap[2][2] = (a.fMap[2][0] * b.fMap[0][2]) + (a.fMap[2][1] * b.fMap[1][2]) + (a.fMap[2][2] * b.fMap[2][2]) + (a.fMap[2][3] * b.fMap[3][2]);
c.fMap[2][3] = (a.fMap[2][0] * b.fMap[0][3]) + (a.fMap[2][1] * b.fMap[1][3]) + (a.fMap[2][2] * b.fMap[2][3]) + (a.fMap[2][3] * b.fMap[3][3]);
c.fMap[3][0] = (a.fMap[3][0] * b.fMap[0][0]) + (a.fMap[3][1] * b.fMap[1][0]) + (a.fMap[3][2] * b.fMap[2][0]) + (a.fMap[3][3] * b.fMap[3][0]);
c.fMap[3][1] = (a.fMap[3][0] * b.fMap[0][1]) + (a.fMap[3][1] * b.fMap[1][1]) + (a.fMap[3][2] * b.fMap[2][1]) + (a.fMap[3][3] * b.fMap[3][1]);
c.fMap[3][2] = (a.fMap[3][0] * b.fMap[0][2]) + (a.fMap[3][1] * b.fMap[1][2]) + (a.fMap[3][2] * b.fMap[2][2]) + (a.fMap[3][3] * b.fMap[3][2]);
c.fMap[3][3] = (a.fMap[3][0] * b.fMap[0][3]) + (a.fMap[3][1] * b.fMap[1][3]) + (a.fMap[3][2] * b.fMap[2][3]) + (a.fMap[3][3] * b.fMap[3][3]);
return c;
}
#ifdef HS_SSE3
# define MULTBEGIN(i) \ # define MULTBEGIN(i) \
xmm[0] = _mm_loadu_ps(fMap[i]); xmm[0] = _mm_loadu_ps(a.fMap[i]);
# define MULTCELL(i, j) \ # define MULTCELL(i, j) \
xmm[1] = _mm_set_ps(b.fMap[3][j], b.fMap[2][j], b.fMap[1][j], b.fMap[0][j]); \ xmm[1] = _mm_set_ps(b.fMap[3][j], b.fMap[2][j], b.fMap[1][j], b.fMap[0][j]); \
xmm[j+2] = _mm_mul_ps(xmm[0], xmm[1]); xmm[j+2] = _mm_mul_ps(xmm[0], xmm[1]);
@ -107,24 +148,23 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
xmm[7] = _mm_hadd_ps(xmm[4], xmm[5]); \ xmm[7] = _mm_hadd_ps(xmm[4], xmm[5]); \
xmm[1] = _mm_hadd_ps(xmm[6], xmm[7]); \ xmm[1] = _mm_hadd_ps(xmm[6], xmm[7]); \
_mm_storeu_ps(c.fMap[i], xmm[1]); _mm_storeu_ps(c.fMap[i], xmm[1]);
#endif #endif // HS_SSE3
hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
{ {
hsMatrix44 c; hsMatrix44 c;
#ifdef HS_SSE3
if( fFlags & b.fFlags & hsMatrix44::kIsIdent ) if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
{ {
c.Reset(); c.Reset();
return c; return c;
} }
if( fFlags & hsMatrix44::kIsIdent ) if( a.fFlags & hsMatrix44::kIsIdent )
return b; return b;
if( b.fFlags & hsMatrix44::kIsIdent ) if( b.fFlags & hsMatrix44::kIsIdent )
return *this; return a;
#ifdef HAVE_SSE
__m128 xmm[8]; __m128 xmm[8];
MULTBEGIN(0); MULTBEGIN(0);
@ -154,28 +194,7 @@ hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
MULTCELL(3, 2); MULTCELL(3, 2);
MULTCELL(3, 3); MULTCELL(3, 3);
MULTFINISH(3); MULTFINISH(3);
#else #endif // HS_SSE3
c.fMap[0][0] = (fMap[0][0] * b.fMap[0][0]) + (fMap[0][1] * b.fMap[1][0]) + (fMap[0][2] * b.fMap[2][0]) + (fMap[0][3] * b.fMap[3][0]);
c.fMap[0][1] = (fMap[0][0] * b.fMap[0][1]) + (fMap[0][1] * b.fMap[1][1]) + (fMap[0][2] * b.fMap[2][1]) + (fMap[0][3] * b.fMap[3][1]);
c.fMap[0][2] = (fMap[0][0] * b.fMap[0][2]) + (fMap[0][1] * b.fMap[1][2]) + (fMap[0][2] * b.fMap[2][2]) + (fMap[0][3] * b.fMap[3][2]);
c.fMap[0][3] = (fMap[0][0] * b.fMap[0][3]) + (fMap[0][1] * b.fMap[1][3]) + (fMap[0][2] * b.fMap[2][3]) + (fMap[0][3] * b.fMap[3][3]);
c.fMap[1][0] = (fMap[1][0] * b.fMap[0][0]) + (fMap[1][1] * b.fMap[1][0]) + (fMap[1][2] * b.fMap[2][0]) + (fMap[1][3] * b.fMap[3][0]);
c.fMap[1][1] = (fMap[1][0] * b.fMap[0][1]) + (fMap[1][1] * b.fMap[1][1]) + (fMap[1][2] * b.fMap[2][1]) + (fMap[1][3] * b.fMap[3][1]);
c.fMap[1][2] = (fMap[1][0] * b.fMap[0][2]) + (fMap[1][1] * b.fMap[1][2]) + (fMap[1][2] * b.fMap[2][2]) + (fMap[1][3] * b.fMap[3][2]);
c.fMap[1][3] = (fMap[1][0] * b.fMap[0][3]) + (fMap[1][1] * b.fMap[1][3]) + (fMap[1][2] * b.fMap[2][3]) + (fMap[1][3] * b.fMap[3][3]);
c.fMap[2][0] = (fMap[2][0] * b.fMap[0][0]) + (fMap[2][1] * b.fMap[1][0]) + (fMap[2][2] * b.fMap[2][0]) + (fMap[2][3] * b.fMap[3][0]);
c.fMap[2][1] = (fMap[2][0] * b.fMap[0][1]) + (fMap[2][1] * b.fMap[1][1]) + (fMap[2][2] * b.fMap[2][1]) + (fMap[2][3] * b.fMap[3][1]);
c.fMap[2][2] = (fMap[2][0] * b.fMap[0][2]) + (fMap[2][1] * b.fMap[1][2]) + (fMap[2][2] * b.fMap[2][2]) + (fMap[2][3] * b.fMap[3][2]);
c.fMap[2][3] = (fMap[2][0] * b.fMap[0][3]) + (fMap[2][1] * b.fMap[1][3]) + (fMap[2][2] * b.fMap[2][3]) + (fMap[2][3] * b.fMap[3][3]);
c.fMap[3][0] = (fMap[3][0] * b.fMap[0][0]) + (fMap[3][1] * b.fMap[1][0]) + (fMap[3][2] * b.fMap[2][0]) + (fMap[3][3] * b.fMap[3][0]);
c.fMap[3][1] = (fMap[3][0] * b.fMap[0][1]) + (fMap[3][1] * b.fMap[1][1]) + (fMap[3][2] * b.fMap[2][1]) + (fMap[3][3] * b.fMap[3][1]);
c.fMap[3][2] = (fMap[3][0] * b.fMap[0][2]) + (fMap[3][1] * b.fMap[1][2]) + (fMap[3][2] * b.fMap[2][2]) + (fMap[3][3] * b.fMap[3][2]);
c.fMap[3][3] = (fMap[3][0] * b.fMap[0][3]) + (fMap[3][1] * b.fMap[1][3]) + (fMap[3][2] * b.fMap[2][3]) + (fMap[3][3] * b.fMap[3][3]);
#endif
return c; return c;
} }

9
Sources/Plasma/CoreLib/hsMatrix44.h

@ -44,6 +44,7 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include "HeadSpin.h" #include "HeadSpin.h"
#include "hsGeometry3.h" #include "hsGeometry3.h"
#include "hsCpuID.h"
class hsQuat; class hsQuat;
@ -140,7 +141,7 @@ struct hsMatrix44 {
return rVal; return rVal;
} }
hsVector3 operator*(const hsVector3& p) const; hsVector3 operator*(const hsVector3& p) const;
hsMatrix44 operator*(const hsMatrix44& b) const; hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); }
hsPoint3* MapPoints(long count, hsPoint3 points[]) const; hsPoint3* MapPoints(long count, hsPoint3 points[]) const;
@ -152,6 +153,12 @@ struct hsMatrix44 {
void Read(hsStream *stream); void Read(hsStream *stream);
void Write(hsStream *stream); void Write(hsStream *stream);
// CPU-optimized functions
typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&);
static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&);
static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&);
static hsFunctionDispatcher<mat_mult_ptr> mat_mult;
}; };
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////

7
Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp

@ -380,7 +380,6 @@ plProfile_CreateTimer(" CIRecalcT", "Object", CIRecalcT);
plProfile_CreateTimer(" CIDirtyT", "Object", CIDirtyT); plProfile_CreateTimer(" CIDirtyT", "Object", CIDirtyT);
plProfile_CreateTimer(" CISetT", "Object", CISetT); plProfile_CreateTimer(" CISetT", "Object", CISetT);
#ifndef HAVE_SSE
static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs) static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
{ {
hsMatrix44 ret; hsMatrix44 ret;
@ -441,7 +440,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
return ret; return ret;
} }
#endif // HAVE_SSE
void plCoordinateInterface::IRecalcTransforms() void plCoordinateInterface::IRecalcTransforms()
{ {
@ -449,13 +447,8 @@ void plCoordinateInterface::IRecalcTransforms()
plProfile_BeginTiming(CIRecalcT); plProfile_BeginTiming(CIRecalcT);
if( fParent ) if( fParent )
{ {
#ifdef HAVE_SSE
fLocalToWorld = fParent->GetLocalToWorld() * fLocalToParent;
fWorldToLocal = fParentToLocal * fParent->GetWorldToLocal();
#else
fLocalToWorld = IMatrixMul34(fParent->GetLocalToWorld(), fLocalToParent); fLocalToWorld = IMatrixMul34(fParent->GetLocalToWorld(), fLocalToParent);
fWorldToLocal = IMatrixMul34(fParentToLocal, fParent->GetWorldToLocal()); fWorldToLocal = IMatrixMul34(fParentToLocal, fParent->GetWorldToLocal());
#endif
} }
else else
{ {

8
Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp

@ -416,7 +416,6 @@ hsBool plDrawableSpans::IBoundsInvalid(const hsBounds3Ext& bnd) const
} }
//// SetTransform //////////////////////////////////////////////////////////// //// SetTransform ////////////////////////////////////////////////////////////
#ifndef HAVE_SSE
static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs) static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
{ {
hsMatrix44 ret; hsMatrix44 ret;
@ -477,7 +476,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
return ret; return ret;
} }
#endif
#ifdef MF_TEST_UPDATE #ifdef MF_TEST_UPDATE
plProfile_CreateCounter("DSSetTrans", "Update", DSSetTrans); plProfile_CreateCounter("DSSetTrans", "Update", DSSetTrans);
@ -521,13 +519,9 @@ plDrawable& plDrawableSpans::SetTransform( uint32_t index, const hsMatrix44& l2w
#endif // MF_TEST_UPDATE #endif // MF_TEST_UPDATE
for( i = 0; i < spans->GetCount(); i++ ) for( i = 0; i < spans->GetCount(); i++ )
{ {
#ifdef HAVE_SSE
fLocalToWorlds[ (*spans)[ i ] ] = l2w * fLocalToBones[ (*spans)[ i ] ];
fWorldToLocals[ (*spans)[ i ] ] = fBoneToLocals[ (*spans)[ i ] ] * w2l;
#else
fLocalToWorlds[ (*spans)[ i ] ] = IMatrixMul34(l2w, fLocalToBones[ (*spans)[ i ] ]); fLocalToWorlds[ (*spans)[ i ] ] = IMatrixMul34(l2w, fLocalToBones[ (*spans)[ i ] ]);
fWorldToLocals[ (*spans)[ i ] ] = IMatrixMul34(fBoneToLocals[ (*spans)[ i ] ], w2l); fWorldToLocals[ (*spans)[ i ] ] = IMatrixMul34(fBoneToLocals[ (*spans)[ i ] ], w2l);
#endif // HAVE_SSE
} }
#ifdef MF_TEST_UPDATE #ifdef MF_TEST_UPDATE
plProfile_EndTiming(DSMatTransT); plProfile_EndTiming(DSMatTransT);

495
Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

@ -163,8 +163,8 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include <algorithm> #include <algorithm>
#ifdef HAVE_SSE #ifdef HS_SIMD_INCLUDE
# include <smmintrin.h> # include HS_SIMD_INCLUDE
#endif #endif
//#define MF_TOSSER //#define MF_TOSSER
@ -10525,48 +10525,35 @@ void plDXPipeline::LoadResources()
plNetClientApp::StaticDebugMsg("End Device Reload"); plNetClientApp::StaticDebugMsg("End Device Reload");
} }
// Sorry about this, but it really did speed up the skinning. // inlTESTPOINT /////////////////////////////////////////
// Just some macros for the inner loop of IBlendVertsIntoBuffer. // Update mins and maxs if destP is outside.
#ifdef HAVE_SSE inline void inlTESTPOINT(const hsPoint3& destP,
# define MATRIXMULTBEGIN(xfm, wgt) \ float& minX, float& minY, float& minZ,
__m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \ float& maxX, float& maxY, float& maxZ)
ALIGN(16) float hack[4]; \ {
mc0 = _mm_loadu_ps(xfm.fMap[0]); \ if( destP.fX < minX )
mc1 = _mm_loadu_ps(xfm.fMap[1]); \ minX = destP.fX;
mc2 = _mm_loadu_ps(xfm.fMap[2]); \ else if( destP.fX > maxX )
mwt = _mm_set_ps1(wgt); maxX = destP.fX;
# define MATRIXMULTPOINTADD(dst, src) \
msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \ if( destP.fY < minY )
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ minY = destP.fY;
_y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ else if( destP.fY > maxY )
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \ maxY = destP.fY;
\
hbuf = _mm_hadd_ps(_x, _y); \ if( destP.fZ < minZ )
hbuf = _mm_hadd_ps(hbuf, hbuf); \ minZ = destP.fZ;
_mm_store_ps(hack, hbuf); \ else if( destP.fZ > maxZ )
dst.fX += hack[0]; \ maxZ = destP.fZ;
dst.fY += hack[1]; \ }
hbuf = _mm_hadd_ps(_z, _z); \
hbuf = _mm_hadd_ps(hbuf, hbuf); \ //// IBlendVertsIntoBuffer ////////////////////////////////////////////////////
_mm_store_ps(hack, hbuf); \ // Given a pointer into a buffer of verts that have blending data in the D3D
dst.fZ += hack[0]; // format, blends them into the destination buffer given without the blending
# define MATRIXMULTVECTORADD(dst, src) \ // info.
msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \ // FPU version
_y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \ #define MATRIXMULTBEGIN_FPU(xfm, wgt) \
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
\
hbuf = _mm_hadd_ps(_x, _y); \
hbuf = _mm_hadd_ps(hbuf, hbuf); \
_mm_store_ps(hack, hbuf); \
dst.fX += hack[0]; \
dst.fY += hack[1]; \
hbuf = _mm_hadd_ps(_z, _z); \
hbuf = _mm_hadd_ps(hbuf, hbuf); \
_mm_store_ps(hack, hbuf); \
dst.fZ += hack[0];
#else
# define MATRIXMULTBEGIN(xfm, wgt) \
float m00 = xfm.fMap[0][0]; \ float m00 = xfm.fMap[0][0]; \
float m01 = xfm.fMap[0][1]; \ float m01 = xfm.fMap[0][1]; \
float m02 = xfm.fMap[0][2]; \ float m02 = xfm.fMap[0][2]; \
@ -10581,7 +10568,7 @@ void plDXPipeline::LoadResources()
float m23 = xfm.fMap[2][3]; \ float m23 = xfm.fMap[2][3]; \
float m_wgt = wgt; \ float m_wgt = wgt; \
float srcX, srcY, srcZ; float srcX, srcY, srcZ;
# define MATRIXMULTPOINTADD(dst, src) \ #define MATRIXMULTPOINTADD_FPU(dst, src) \
srcX = src.fX; \ srcX = src.fX; \
srcY = src.fY; \ srcY = src.fY; \
srcZ = src.fZ; \ srcZ = src.fZ; \
@ -10589,7 +10576,7 @@ void plDXPipeline::LoadResources()
dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \ dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \ dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt; dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
# define MATRIXMULTVECTORADD(dst, src) \ #define MATRIXMULTVECTORADD_FPU(dst, src) \
srcX = src.fX; \ srcX = src.fX; \
srcY = src.fY; \ srcY = src.fY; \
srcZ = src.fZ; \ srcZ = src.fZ; \
@ -10597,218 +10584,250 @@ void plDXPipeline::LoadResources()
dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \ dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \ dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt; dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
#endif // HAVE_SSE
// inlTESTPOINT ///////////////////////////////////////// // SSE3 version
// Update mins and maxs if destP is outside. #ifdef HS_SSE3
inline void inlTESTPOINT(const hsPoint3& destP, #define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
float& minX, float& minY, float& minZ, __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
float& maxX, float& maxY, float& maxZ) ALIGN(16) float hack[4]; \
{ mc0 = _mm_loadu_ps(xfm.fMap[0]); \
if( destP.fX < minX ) mc1 = _mm_loadu_ps(xfm.fMap[1]); \
minX = destP.fX; mc2 = _mm_loadu_ps(xfm.fMap[2]); \
else if( destP.fX > maxX ) mwt = _mm_set_ps1(wgt);
maxX = destP.fX; #define MATRIXMULTPOINTADD_SSE3(dst, src) \
msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
if( destP.fY < minY ) _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
minY = destP.fY; _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
else if( destP.fY > maxY ) _z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
maxY = destP.fY; \
hbuf1 = _mm_hadd_ps(_x, _y); \
if( destP.fZ < minZ ) hbuf2 = _mm_hadd_ps(_z, _z); \
minZ = destP.fZ; hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
else if( destP.fZ > maxZ ) _mm_store_ps(hack, hbuf1); \
maxZ = destP.fZ; dst.fX += hack[0]; \
} dst.fY += hack[1]; \
dst.fZ += hack[2];
//// IBlendVertsIntoBuffer //////////////////////////////////////////////////// #define MATRIXMULTVECTORADD_SSE3(dst, src) \
// Given a pointer into a buffer of verts that have blending data in the D3D msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
// format, blends them into the destination buffer given without the blending _x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
// info. _y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
void plDXPipeline::IBlendVertsIntoBuffer( plSpan* span, \
hsMatrix44* matrixPalette, int numMatrices, hbuf1 = _mm_hadd_ps(_x, _y); \
const uint8_t *src, uint8_t format, uint32_t srcStride, hbuf2 = _mm_hadd_ps(_z, _z); \
uint8_t *dest, uint32_t destStride, uint32_t count, hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
uint16_t localUVWChans ) _mm_store_ps(hack, hbuf1); \
{ dst.fX += hack[0]; \
uint8_t numUVs, numWeights; dst.fY += hack[1]; \
uint32_t i, j, indices, color, specColor, uvChanSize; dst.fZ += hack[2];
float weights[ 4 ], weightSum; #endif
hsPoint3 pt, tempPt, destPt;
hsVector3 vec, tempNorm, destNorm;
/// Get some counts
switch( format & plGBufferGroup::kSkinWeightMask )
{
case plGBufferGroup::kSkin1Weight: numWeights = 1; break;
case plGBufferGroup::kSkin2Weights: numWeights = 2; break;
case plGBufferGroup::kSkin3Weights: numWeights = 3; break;
default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" );
}
numUVs = plGBufferGroup::CalcNumUVs( format );
uvChanSize = numUVs * sizeof( float ) * 3;
//#define MF_RECALC_BOUNDS
#ifdef MF_RECALC_BOUNDS
float minX = 1.e33f;
float minY = 1.e33f;
float minZ = 1.e33f;
float maxX = -1.e33f; // CPU-optimized functions requiring dispatch
float maxY = -1.e33f; hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3);
float maxZ = -1.e33f;
#endif // MF_RECALC_BOUNDS
// localUVWChans is bump mapping tangent space vectors, which need to // Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
#define BLENDVERTSTART \
uint8_t numUVs, numWeights; \
uint32_t i, j, indices, color, specColor, uvChanSize; \
float weights[ 4 ], weightSum; \
hsPoint3 pt, tempPt, destPt; \
hsVector3 vec, tempNorm, destNorm; \
\
/* Get some counts */\
switch( format & plGBufferGroup::kSkinWeightMask ) \
{ \
case plGBufferGroup::kSkin1Weight: numWeights = 1; break; \
case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \
case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \
default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \
} \
\
numUVs = plGBufferGroup::CalcNumUVs( format ); \
uvChanSize = numUVs * sizeof( float ) * 3; \
\
/* localUVWChans is bump mapping tangent space vectors, which need to
// be skinned like the normal, as opposed to passed through like // be skinned like the normal, as opposed to passed through like
// garden variety UVW coordinates. // garden variety UVW coordinates.
// There are no localUVWChans that I know of in production assets (i.e. // There are no localUVWChans that I know of in production assets (i.e.
// the avatar is not skinned). // the avatar is not skinned).*/\
if( !localUVWChans ) if( !localUVWChans ) \
{ { \
/// Copy whilst blending /* Copy whilst blending */\
for( i = 0; i < count; i++ ) for( i = 0; i < count; i++ ) \
{ { \
// Extract data /* Extract data */\
src = inlExtractPoint( src, pt ); src = inlExtractPoint( src, pt ); \
for( j = 0, weightSum = 0; j < numWeights; j++ ) for( j = 0, weightSum = 0; j < numWeights; j++ ) \
{ { \
src = inlExtractFloat( src, weights[ j ] ); src = inlExtractFloat( src, weights[ j ] ); \
weightSum += weights[ j ]; weightSum += weights[ j ]; \
} } \
weights[ j ] = 1 - weightSum; weights[ j ] = 1 - weightSum; \
\
if( format & plGBufferGroup::kSkinIndices ) if( format & plGBufferGroup::kSkinIndices ) \
{ { \
src = inlExtractUInt32( src, indices ); src = inlExtractUInt32( src, indices ); \
} } \
else else \
{ { \
indices = 1 << 8; indices = 1 << 8; \
} } \
src = inlExtractPoint( src, vec ); src = inlExtractPoint( src, vec ); \
src = inlExtractUInt32( src, color ); src = inlExtractUInt32( src, color ); \
src = inlExtractUInt32( src, specColor ); src = inlExtractUInt32( src, specColor ); \
\
// Blend /* Blend */\
destPt.Set( 0, 0, 0 ); destPt.Set( 0, 0, 0 ); \
destNorm.Set( 0, 0, 0 ); destNorm.Set( 0, 0, 0 ); \
for( j = 0; j < numWeights + 1; j++ ) for( j = 0; j < numWeights + 1; j++ ) \
{ { \
if( weights[ j ] ) if( weights[ j ] ) \
{ {
/*
MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD(destPt, pt); MATRIXMULTPOINTADD(destPt, pt);
MATRIXMULTVECTORADD(destNorm, vec); MATRIXMULTVECTORADD(destNorm, vec);
} */
#define BLENDVERTMID \
indices >>= 8; } \
} \
// Probably don't really need to renormalize this. There errors are indices >>= 8; \
// going to be subtle and "smooth". } \
// hsFastMath::NormalizeAppr(destNorm); /* Probably don't really need to renormalize this. There errors are
// going to be subtle and "smooth".*/\
#ifdef MF_RECALC_BOUNDS /* hsFastMath::NormalizeAppr(destNorm);*/ \
inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ); \
#endif // MF_RECALC_BOUNDS /* Slam data into position now */\
dest = inlStuffPoint( dest, destPt ); \
// Slam data into position now dest = inlStuffPoint( dest, destNorm ); \
dest = inlStuffPoint( dest, destPt ); dest = inlStuffUInt32( dest, color ); \
dest = inlStuffPoint( dest, destNorm ); dest = inlStuffUInt32( dest, specColor ); \
dest = inlStuffUInt32( dest, color ); memcpy( dest, src, uvChanSize ); \
dest = inlStuffUInt32( dest, specColor ); src += uvChanSize; \
memcpy( dest, src, uvChanSize ); dest += uvChanSize; \
src += uvChanSize; } \
dest += uvChanSize; } \
} else \
} { \
else uint8_t hiChan = localUVWChans >> 8; \
{ uint8_t loChan = localUVWChans & 0xff; \
uint8_t hiChan = localUVWChans >> 8; /* Copy whilst blending */\
uint8_t loChan = localUVWChans & 0xff; for( i = 0; i < count; i++ ) \
/// Copy whilst blending { \
for( i = 0; i < count; i++ ) hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
{ hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \
hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; /* Extract data */\
src = inlExtractPoint( src, pt ); \
// Extract data for( j = 0, weightSum = 0; j < numWeights; j++ ) \
src = inlExtractPoint( src, pt ); { \
for( j = 0, weightSum = 0; j < numWeights; j++ ) src = inlExtractFloat( src, weights[ j ] ); \
{ weightSum += weights[ j ]; \
src = inlExtractFloat( src, weights[ j ] ); } \
weightSum += weights[ j ]; weights[ j ] = 1 - weightSum; \
} \
weights[ j ] = 1 - weightSum; if( format & plGBufferGroup::kSkinIndices ) \
{ \
if( format & plGBufferGroup::kSkinIndices ) src = inlExtractUInt32( src, indices ); \
{ } \
src = inlExtractUInt32( src, indices ); else \
} { \
else indices = 1 << 8; \
{ } \
indices = 1 << 8; \
} src = inlExtractPoint( src, vec ); \
src = inlExtractUInt32( src, color ); \
src = inlExtractPoint( src, vec ); src = inlExtractUInt32( src, specColor ); \
src = inlExtractUInt32( src, color ); \
src = inlExtractUInt32( src, specColor ); uint8_t k; \
for( k = 0; k < numUVs; k++ ) \
uint8_t k; { \
for( k = 0; k < numUVs; k++ ) src = inlExtractPoint( src, srcUVWs[k] ); \
{ } \
src = inlExtractPoint( src, srcUVWs[k] ); memcpy( dstUVWs, srcUVWs, uvChanSize); \
} dstUVWs[loChan].Set(0,0,0); \
memcpy( dstUVWs, srcUVWs, uvChanSize); dstUVWs[hiChan].Set(0,0,0); \
dstUVWs[loChan].Set(0,0,0); \
dstUVWs[hiChan].Set(0,0,0); /* Blend */\
destPt.Set( 0, 0, 0 ); \
// Blend destNorm.Set( 0, 0, 0 ); \
destPt.Set( 0, 0, 0 ); for( j = 0; j < numWeights + 1; j++ ) \
destNorm.Set( 0, 0, 0 ); { \
for( j = 0; j < numWeights + 1; j++ ) if( weights[ j ] ) \
{ { \
if( weights[ j ] ) /*
{
MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]); MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD(destPt, pt); MATRIXMULTPOINTADD(destPt, pt);
MATRIXMULTVECTORADD(destNorm, vec); MATRIXMULTVECTORADD(destNorm, vec);
MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]); MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]); MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]);
*/
// BLENDVERTEND: closing fragment of the vertex-blend routine for the branch
// that skins the localUVWChans channels. It is expanded right after the
// per-CPU MATRIXMULT* macro invocations in blend_vert_buffer_fpu() and
// blend_vert_buffer_sse3(). The fragment:
//   - closes the `if( weights[ j ] )` block,
//   - shifts `indices` right by 8 bits (the blend code reads the palette
//     index as `indices & 0xff`) and closes the per-weight loop,
//   - writes destPt, destNorm, color and specColor to `dest`, then copies
//     the dstUVWs array (uvChanSize bytes) and advances `dest` past it,
//   - closes the per-vertex loop.
#define BLENDVERTEND \
} \
\
indices >>= 8; \
} \
/* Probably don't really need to renormalize this. The errors are
// going to be subtle and "smooth". */\
/* hsFastMath::NormalizeAppr(destNorm); */\
/* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\
/* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\
\
/* Slam data into position now */\
dest = inlStuffPoint( dest, destPt ); \
dest = inlStuffPoint( dest, destNorm ); \
dest = inlStuffUInt32( dest, color ); \
dest = inlStuffUInt32( dest, specColor ); \
memcpy( dest, dstUVWs, uvChanSize ); \
dest += uvChanSize; \
} \
} }
indices >>= 8; void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
} hsMatrix44* matrixPalette, int numMatrices,
// Probably don't really need to renormalize this. There errors are const uint8_t *src, uint8_t format, uint32_t srcStride,
// going to be subtle and "smooth". uint8_t *dest, uint32_t destStride, uint32_t count,
// hsFastMath::NormalizeAppr(destNorm); uint16_t localUVWChans )
// hsFastMath::NormalizeAppr(dstUVWs[loChan]); {
// hsFastMath::NormalizeAppr(dstUVWs[hiChan]); BLENDVERTSTART
MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
#ifdef MF_RECALC_BOUNDS MATRIXMULTPOINTADD_FPU(destPt, pt);
inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ); MATRIXMULTVECTORADD_FPU(destNorm, vec);
#endif // MF_RECALC_BOUNDS BLENDVERTMID
MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
// Slam data into position now MATRIXMULTPOINTADD_FPU(destPt, pt);
dest = inlStuffPoint( dest, destPt ); MATRIXMULTVECTORADD_FPU(destNorm, vec);
dest = inlStuffPoint( dest, destNorm ); MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
dest = inlStuffUInt32( dest, color ); MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
dest = inlStuffUInt32( dest, specColor );
memcpy( dest, dstUVWs, uvChanSize ); BLENDVERTEND
dest += uvChanSize; }
}
} void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
#ifdef MF_RECALC_BOUNDS hsMatrix44* matrixPalette, int numMatrices,
hsBounds3Ext wBnd; const uint8_t *src, uint8_t format, uint32_t srcStride,
wBnd.Reset(&hsPoint3(minX, minY, minZ)); uint8_t *dest, uint32_t destStride, uint32_t count,
wBnd.Union(&hsPoint3(maxX, maxY, maxZ)); uint16_t localUVWChans )
span->fWorldBounds = wBnd; {
#endif // MF_RECALC_BOUNDS #ifdef HS_SSE3
BLENDVERTSTART
MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD_SSE3(destPt, pt);
MATRIXMULTVECTORADD_SSE3(destNorm, vec);
BLENDVERTMID
MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD_SSE3(destPt, pt);
MATRIXMULTVECTORADD_SSE3(destNorm, vec);
MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
BLENDVERTEND
#endif // HS_SSE3
} }
// ISetPipeConsts ////////////////////////////////////////////////////////////////// // ISetPipeConsts //////////////////////////////////////////////////////////////////

10
Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h

@ -465,7 +465,8 @@ protected:
void IBlendVertsIntoBuffer( plSpan* span, void IBlendVertsIntoBuffer( plSpan* span,
hsMatrix44* matrixPalette, int numMatrices, hsMatrix44* matrixPalette, int numMatrices,
const uint8_t *src, uint8_t format, uint32_t srcStride, const uint8_t *src, uint8_t format, uint32_t srcStride,
uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans ); uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans )
{ blend_vert_buffer.call(span, matrixPalette, numMatrices, src, format, srcStride, dest, destStride, count, localUVWChans); };
hsBool ISoftwareVertexBlend( plDrawableSpans* drawable, const hsTArray<int16_t>& visList ); hsBool ISoftwareVertexBlend( plDrawableSpans* drawable, const hsTArray<int16_t>& visList );
@ -798,6 +799,13 @@ public:
virtual int GetMaxAnisotropicSamples(); virtual int GetMaxAnisotropicSamples();
virtual int GetMaxAntiAlias(int Width, int Height, int ColorDepth); virtual int GetMaxAntiAlias(int Width, int Height, int ColorDepth);
// CPU-optimized functions
// IBlendVertsIntoBuffer() forwards all of its arguments to
// blend_vert_buffer.call(), so the dispatcher declared below determines
// which of the implementations actually runs.
protected:
// Signature shared by every vertex-blend implementation. Arguments, in order:
// span, matrixPalette, numMatrices, src, format, srcStride, dest, destStride,
// count, localUVWChans.
typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
// Implementation assembled from the MATRIXMULT*_FPU macros.
static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
// Implementation assembled from the MATRIXMULT*_SSE3 macros. Its body is only
// compiled when HS_SSE3 is defined; otherwise the function is empty.
static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
// Dispatcher invoked through .call() by IBlendVertsIntoBuffer().
// NOTE(review): how it chooses between the FPU and SSE3 versions is decided
// where this static member is defined, which is not visible here - confirm.
static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
}; };

Loading…
Cancel
Save