Browse Source

Merge pull request #202 from Deledrius/hsCpuID

Add CPU feature detection and function dispatcher
Branan Purvine-Riley 13 years ago
parent
commit
bde67949ac
  1. 6
      CMakeLists.txt
  2. 28
      Sources/Plasma/Apps/plClient/winmain.cpp
  3. 2
      Sources/Plasma/CoreLib/CMakeLists.txt
  4. 71
      Sources/Plasma/CoreLib/hsCpuID.cpp
  5. 182
      Sources/Plasma/CoreLib/hsCpuID.h
  6. 85
      Sources/Plasma/CoreLib/hsMatrix44.cpp
  7. 9
      Sources/Plasma/CoreLib/hsMatrix44.h
  8. 7
      Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
  9. 8
      Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
  10. 491
      Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
  11. 10
      Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h

6
CMakeLists.txt

@ -84,12 +84,6 @@ if(MSVC)
add_definitions(-D_SCL_SECURE_NO_WARNINGS)
endif(MSVC)
# TODO: Maybe some kind of automated test here?
option(PLASMA_USE_SSE "Enable SSE optimizations?" ON)
if(PLASMA_USE_SSE)
add_definitions(-DHAVE_SSE)
endif(PLASMA_USE_SSE)
#TODO: Make the OpenSSL includes less promiscuous so this isn't needed
include_directories(${OPENSSL_INCLUDE_DIR})

28
Sources/Plasma/Apps/plClient/winmain.cpp

@ -49,10 +49,6 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include <dmdfm.h> // Windows Load EXE into memory suff
#endif
#ifdef HAVE_SSE
# include <intrin.h>
#endif
#include <curl/curl.h>
#include "HeadSpin.h"
@ -1388,35 +1384,11 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except
}
#endif
bool CheckCPU()
{
const unsigned int sse3_flag = 0x00000001;
// (any other CPU features...)
int cpu_info[4];
__cpuid(cpu_info, 1);
#ifdef HAVE_SSE
if((cpu_info[2] & sse3_flag) == 0)
return false;
#endif
// Insert additional feature checks here
return true;
}
#include "pfConsoleCore/pfConsoleEngine.h"
PF_CONSOLE_LINK_ALL()
int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow)
{
// Check to make sure we have a good CPU before getting started
if (!CheckCPU())
{
plString msg = plString::Format("Your processor does not support all of the features required to play %S.", ProductLongName());
hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError);
return PARABLE_NORMAL_EXIT;
}
PF_CONSOLE_INIT_ALL()
// Set global handle

2
Sources/Plasma/CoreLib/CMakeLists.txt

@ -15,6 +15,7 @@ set(CoreLib_SOURCES
HeadSpin.cpp
hsBitVector.cpp
hsBounds.cpp
hsCpuID.cpp
hsCritSect.cpp
hsExceptionStack.cpp
hsFastMath.cpp
@ -57,6 +58,7 @@ set(CoreLib_HEADERS
hsBitVector.h
hsBounds.h
hsColorRGBA.h
hsCpuID.h
hsCritSect.h
hsExceptions.h
hsFastMath.h

71
Sources/Plasma/CoreLib/hsCpuID.cpp

@ -0,0 +1,71 @@
/*==LICENSE==*
CyanWorlds.com Engine - MMOG client, server and tools
Copyright (C) 2011 Cyan Worlds, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Additional permissions under GNU GPL version 3 section 7
If you modify this Program, or any covered work, by linking or
combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
(or a modified version of those libraries),
containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
licensors of this Program grant you additional
permission to convey the resulting work. Corresponding Source for a
non-source form of such a combination shall include the source code for
the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
work.
You can contact Cyan Worlds, Inc. by email legal@cyan.com
or by snail mail at:
Cyan Worlds, Inc.
14617 N Newport Hwy
Mead, WA 99021
*==LICENSE==*/
#include <intrin.h>
#include "hsCpuID.h"
hsCpuId::hsCpuId() {
const unsigned int sse1_flag = 1<<25;
const unsigned int sse2_flag = 1<<26;
const unsigned int sse3_flag = 1<<0;
const unsigned int ssse3_flag = 1<<9;
const unsigned int sse41_flag = 1<<19;
const unsigned int sse42_flag = 1<<20;
const unsigned int avx_flag = 1 << 28;
unsigned int cpu_info[4];
__cpuid((int*)cpu_info, 1);
has_sse1 = (cpu_info[3] & sse1_flag) || false;
has_sse2 = (cpu_info[3] & sse2_flag) || false;
has_sse3 = (cpu_info[2] & sse3_flag) || false;
has_ssse3 = (cpu_info[2] & ssse3_flag) || false;
has_sse41 = (cpu_info[2] & sse41_flag) || false;
has_sse42 = (cpu_info[2] & sse42_flag) || false;
has_avx = (cpu_info[2] & avx_flag) || false;
}
const hsCpuId& hsCpuId::instance()
{
static hsCpuId self;
return self;
}

182
Sources/Plasma/CoreLib/hsCpuID.h

@ -0,0 +1,182 @@
/*==LICENSE==*
CyanWorlds.com Engine - MMOG client, server and tools
Copyright (C) 2011 Cyan Worlds, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Additional permissions under GNU GPL version 3 section 7
If you modify this Program, or any covered work, by linking or
combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
(or a modified version of those libraries),
containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
licensors of this Program grant you additional
permission to convey the resulting work. Corresponding Source for a
non-source form of such a combination shall include the source code for
the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
work.
You can contact Cyan Worlds, Inc. by email legal@cyan.com
or by snail mail at:
Cyan Worlds, Inc.
14617 N Newport Hwy
Mead, WA 99021
*==LICENSE==*/
//////////////////////////////////////////////////////////////////////
//
// hsCpuID - Processor feature detection and function dispatcher
//
//
// == Example Usage ==
//
// #ifdef HS_SIMD_INCLUDE
// # include HS_SIMD_INCLUDE
// #endif
//
// float my_func_fpu() {
// ...
// }
//
// float my_func_avx() {
// #ifdef HS_AVX
// ...
// #endif
// }
//
//
// typedef float(*func_ptr)();
// static hsFunctionDispatcher<func_ptr> my_func;
//
// hsFunctionDispatcher<float::func_ptr> float::my_func(float::my_func_fpu, 0, 0, 0, 0, 0, 0, float::my_func_avx);
//
//////////////////////////////////////////////////////////////////////
#ifndef hsCpuID_inc
#define hsCpuID_inc
#if defined __AVX__ || _MSC_VER >= 1600
#define HS_AVX
#ifndef HS_SIMD_INCLUDE
# define HS_SIMD_INCLUDE "immintrin.h"
#endif
#endif
#if defined __SSE4_2__ || _MSC_VER >= 1600
#define HS_SSE42
#ifndef HS_SIMD_INCLUDE
# define HS_SIMD_INCLUDE "nmmintrin.h"
#endif
#endif
#if defined __SSE4_1__ || _MSC_VER >= 1600
#define HS_SSE41
#ifndef HS_SIMD_INCLUDE
# define HS_SIMD_INCLUDE "smmintrin.h"
#endif
#endif
#if defined __SSSE3__ || _MSC_VER >= 1600
#define HS_SSSE3
#ifndef HS_SIMD_INCLUDE
# define HS_SIMD_INCLUDE "tmmintrin.h"
#endif
#endif
#if defined __SSE3__ || _MSC_VER >= 1400
#define HS_SSE3
#ifndef HS_SIMD_INCLUDE
# define HS_SIMD_INCLUDE "pmmintrin.h"
#endif
#endif
#if defined __SSE2__ || _MSC_VER >= 1300
#define HS_SSE2
#ifndef HS_SIMD_INCLUDE
# define HS_SIMD_INCLUDE "emmintrin.h"
#endif
#endif
#if defined __SSE__ || _MSC_VER >= 1300
#define HS_SSE1
#ifndef HS_SIMD_INCLUDE
# define HS_SIMD_INCLUDE "xmmintrin.h"
#endif
#endif
struct hsCpuId {
bool has_sse1;
bool has_sse2;
bool has_sse3;
bool has_ssse3;
bool has_sse41;
bool has_sse42;
bool has_avx;
hsCpuId();
static const hsCpuId& instance();
};
template <typename func_ptr>
struct hsFunctionDispatcher {
hsFunctionDispatcher(func_ptr fpu, func_ptr sse1=0, func_ptr sse2=0, func_ptr sse3=0, func_ptr ssse3=0, func_ptr sse41=0, func_ptr sse42=0, func_ptr avx=0) {
hsAssert(fpu, "FPU fallback function required.");
const hsCpuId& cpu = hsCpuId::instance();
#ifdef HS_AVX
if (cpu.has_avx && avx) {
call = avx;
} else
#endif
#ifdef HS_SSE42
if (cpu.has_sse42 && sse42) {
call = sse42;
} else
#endif
#ifdef HS_SSE41
if (cpu.has_sse41 && sse41) {
call = sse41;
} else
#endif
#ifdef HS_SSSE3
if (cpu.has_ssse3 && ssse3) {
call = ssse3;
} else
#endif
#ifdef HS_SSE3
if (cpu.has_sse3 && sse3) {
call = sse3;
} else
#endif
#ifdef HS_SSE2
if (cpu.has_sse2 && sse2) {
call = sse2;
} else
#endif
#ifdef HS_SSE1
if (cpu.has_sse1 && sse1) {
call = sse1;
} else
#endif
{
call = fpu;
}
};
func_ptr call;
};
#endif // hsCpuID_inc

85
Sources/Plasma/CoreLib/hsMatrix44.cpp

@ -47,13 +47,16 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include "hsStream.h"
#include <math.h>
#ifdef HAVE_SSE
# include <smmintrin.h>
#ifdef HS_SIMD_INCLUDE
# include HS_SIMD_INCLUDE
#endif
static hsMatrix44 myIdent = hsMatrix44().Reset();
const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; }
// CPU-optimized functions requiring dispatch
hsFunctionDispatcher<hsMatrix44::mat_mult_ptr> hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3);
/*
For the rotation:
¦ 2 2 ¦
@ -96,9 +99,47 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
rotate.QuatFromMatrix44(*this);
}
#ifdef HAVE_SSE
hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
{
hsMatrix44 c;
if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
{
c.Reset();
return c;
}
if( a.fFlags & hsMatrix44::kIsIdent )
return b;
if( b.fFlags & hsMatrix44::kIsIdent )
return a;
c.fMap[0][0] = (a.fMap[0][0] * b.fMap[0][0]) + (a.fMap[0][1] * b.fMap[1][0]) + (a.fMap[0][2] * b.fMap[2][0]) + (a.fMap[0][3] * b.fMap[3][0]);
c.fMap[0][1] = (a.fMap[0][0] * b.fMap[0][1]) + (a.fMap[0][1] * b.fMap[1][1]) + (a.fMap[0][2] * b.fMap[2][1]) + (a.fMap[0][3] * b.fMap[3][1]);
c.fMap[0][2] = (a.fMap[0][0] * b.fMap[0][2]) + (a.fMap[0][1] * b.fMap[1][2]) + (a.fMap[0][2] * b.fMap[2][2]) + (a.fMap[0][3] * b.fMap[3][2]);
c.fMap[0][3] = (a.fMap[0][0] * b.fMap[0][3]) + (a.fMap[0][1] * b.fMap[1][3]) + (a.fMap[0][2] * b.fMap[2][3]) + (a.fMap[0][3] * b.fMap[3][3]);
c.fMap[1][0] = (a.fMap[1][0] * b.fMap[0][0]) + (a.fMap[1][1] * b.fMap[1][0]) + (a.fMap[1][2] * b.fMap[2][0]) + (a.fMap[1][3] * b.fMap[3][0]);
c.fMap[1][1] = (a.fMap[1][0] * b.fMap[0][1]) + (a.fMap[1][1] * b.fMap[1][1]) + (a.fMap[1][2] * b.fMap[2][1]) + (a.fMap[1][3] * b.fMap[3][1]);
c.fMap[1][2] = (a.fMap[1][0] * b.fMap[0][2]) + (a.fMap[1][1] * b.fMap[1][2]) + (a.fMap[1][2] * b.fMap[2][2]) + (a.fMap[1][3] * b.fMap[3][2]);
c.fMap[1][3] = (a.fMap[1][0] * b.fMap[0][3]) + (a.fMap[1][1] * b.fMap[1][3]) + (a.fMap[1][2] * b.fMap[2][3]) + (a.fMap[1][3] * b.fMap[3][3]);
c.fMap[2][0] = (a.fMap[2][0] * b.fMap[0][0]) + (a.fMap[2][1] * b.fMap[1][0]) + (a.fMap[2][2] * b.fMap[2][0]) + (a.fMap[2][3] * b.fMap[3][0]);
c.fMap[2][1] = (a.fMap[2][0] * b.fMap[0][1]) + (a.fMap[2][1] * b.fMap[1][1]) + (a.fMap[2][2] * b.fMap[2][1]) + (a.fMap[2][3] * b.fMap[3][1]);
c.fMap[2][2] = (a.fMap[2][0] * b.fMap[0][2]) + (a.fMap[2][1] * b.fMap[1][2]) + (a.fMap[2][2] * b.fMap[2][2]) + (a.fMap[2][3] * b.fMap[3][2]);
c.fMap[2][3] = (a.fMap[2][0] * b.fMap[0][3]) + (a.fMap[2][1] * b.fMap[1][3]) + (a.fMap[2][2] * b.fMap[2][3]) + (a.fMap[2][3] * b.fMap[3][3]);
c.fMap[3][0] = (a.fMap[3][0] * b.fMap[0][0]) + (a.fMap[3][1] * b.fMap[1][0]) + (a.fMap[3][2] * b.fMap[2][0]) + (a.fMap[3][3] * b.fMap[3][0]);
c.fMap[3][1] = (a.fMap[3][0] * b.fMap[0][1]) + (a.fMap[3][1] * b.fMap[1][1]) + (a.fMap[3][2] * b.fMap[2][1]) + (a.fMap[3][3] * b.fMap[3][1]);
c.fMap[3][2] = (a.fMap[3][0] * b.fMap[0][2]) + (a.fMap[3][1] * b.fMap[1][2]) + (a.fMap[3][2] * b.fMap[2][2]) + (a.fMap[3][3] * b.fMap[3][2]);
c.fMap[3][3] = (a.fMap[3][0] * b.fMap[0][3]) + (a.fMap[3][1] * b.fMap[1][3]) + (a.fMap[3][2] * b.fMap[2][3]) + (a.fMap[3][3] * b.fMap[3][3]);
return c;
}
#ifdef HS_SSE3
# define MULTBEGIN(i) \
xmm[0] = _mm_loadu_ps(fMap[i]);
xmm[0] = _mm_loadu_ps(a.fMap[i]);
# define MULTCELL(i, j) \
xmm[1] = _mm_set_ps(b.fMap[3][j], b.fMap[2][j], b.fMap[1][j], b.fMap[0][j]); \
xmm[j+2] = _mm_mul_ps(xmm[0], xmm[1]);
@ -107,24 +148,23 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
xmm[7] = _mm_hadd_ps(xmm[4], xmm[5]); \
xmm[1] = _mm_hadd_ps(xmm[6], xmm[7]); \
_mm_storeu_ps(c.fMap[i], xmm[1]);
#endif
#endif // HS_SSE3
hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
{
hsMatrix44 c;
if( fFlags & b.fFlags & hsMatrix44::kIsIdent )
#ifdef HS_SSE3
if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
{
c.Reset();
return c;
}
if( fFlags & hsMatrix44::kIsIdent )
if( a.fFlags & hsMatrix44::kIsIdent )
return b;
if( b.fFlags & hsMatrix44::kIsIdent )
return *this;
return a;
#ifdef HAVE_SSE
__m128 xmm[8];
MULTBEGIN(0);
@ -154,28 +194,7 @@ hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
MULTCELL(3, 2);
MULTCELL(3, 3);
MULTFINISH(3);
#else
c.fMap[0][0] = (fMap[0][0] * b.fMap[0][0]) + (fMap[0][1] * b.fMap[1][0]) + (fMap[0][2] * b.fMap[2][0]) + (fMap[0][3] * b.fMap[3][0]);
c.fMap[0][1] = (fMap[0][0] * b.fMap[0][1]) + (fMap[0][1] * b.fMap[1][1]) + (fMap[0][2] * b.fMap[2][1]) + (fMap[0][3] * b.fMap[3][1]);
c.fMap[0][2] = (fMap[0][0] * b.fMap[0][2]) + (fMap[0][1] * b.fMap[1][2]) + (fMap[0][2] * b.fMap[2][2]) + (fMap[0][3] * b.fMap[3][2]);
c.fMap[0][3] = (fMap[0][0] * b.fMap[0][3]) + (fMap[0][1] * b.fMap[1][3]) + (fMap[0][2] * b.fMap[2][3]) + (fMap[0][3] * b.fMap[3][3]);
c.fMap[1][0] = (fMap[1][0] * b.fMap[0][0]) + (fMap[1][1] * b.fMap[1][0]) + (fMap[1][2] * b.fMap[2][0]) + (fMap[1][3] * b.fMap[3][0]);
c.fMap[1][1] = (fMap[1][0] * b.fMap[0][1]) + (fMap[1][1] * b.fMap[1][1]) + (fMap[1][2] * b.fMap[2][1]) + (fMap[1][3] * b.fMap[3][1]);
c.fMap[1][2] = (fMap[1][0] * b.fMap[0][2]) + (fMap[1][1] * b.fMap[1][2]) + (fMap[1][2] * b.fMap[2][2]) + (fMap[1][3] * b.fMap[3][2]);
c.fMap[1][3] = (fMap[1][0] * b.fMap[0][3]) + (fMap[1][1] * b.fMap[1][3]) + (fMap[1][2] * b.fMap[2][3]) + (fMap[1][3] * b.fMap[3][3]);
c.fMap[2][0] = (fMap[2][0] * b.fMap[0][0]) + (fMap[2][1] * b.fMap[1][0]) + (fMap[2][2] * b.fMap[2][0]) + (fMap[2][3] * b.fMap[3][0]);
c.fMap[2][1] = (fMap[2][0] * b.fMap[0][1]) + (fMap[2][1] * b.fMap[1][1]) + (fMap[2][2] * b.fMap[2][1]) + (fMap[2][3] * b.fMap[3][1]);
c.fMap[2][2] = (fMap[2][0] * b.fMap[0][2]) + (fMap[2][1] * b.fMap[1][2]) + (fMap[2][2] * b.fMap[2][2]) + (fMap[2][3] * b.fMap[3][2]);
c.fMap[2][3] = (fMap[2][0] * b.fMap[0][3]) + (fMap[2][1] * b.fMap[1][3]) + (fMap[2][2] * b.fMap[2][3]) + (fMap[2][3] * b.fMap[3][3]);
c.fMap[3][0] = (fMap[3][0] * b.fMap[0][0]) + (fMap[3][1] * b.fMap[1][0]) + (fMap[3][2] * b.fMap[2][0]) + (fMap[3][3] * b.fMap[3][0]);
c.fMap[3][1] = (fMap[3][0] * b.fMap[0][1]) + (fMap[3][1] * b.fMap[1][1]) + (fMap[3][2] * b.fMap[2][1]) + (fMap[3][3] * b.fMap[3][1]);
c.fMap[3][2] = (fMap[3][0] * b.fMap[0][2]) + (fMap[3][1] * b.fMap[1][2]) + (fMap[3][2] * b.fMap[2][2]) + (fMap[3][3] * b.fMap[3][2]);
c.fMap[3][3] = (fMap[3][0] * b.fMap[0][3]) + (fMap[3][1] * b.fMap[1][3]) + (fMap[3][2] * b.fMap[2][3]) + (fMap[3][3] * b.fMap[3][3]);
#endif
#endif // HS_SSE3
return c;
}

9
Sources/Plasma/CoreLib/hsMatrix44.h

@ -44,6 +44,7 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include "HeadSpin.h"
#include "hsGeometry3.h"
#include "hsCpuID.h"
class hsQuat;
@ -140,7 +141,7 @@ struct hsMatrix44 {
return rVal;
}
hsVector3 operator*(const hsVector3& p) const;
hsMatrix44 operator*(const hsMatrix44& b) const;
hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); }
hsPoint3* MapPoints(long count, hsPoint3 points[]) const;
@ -152,6 +153,12 @@ struct hsMatrix44 {
void Read(hsStream *stream);
void Write(hsStream *stream);
// CPU-optimized functions
typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&);
static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&);
static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&);
static hsFunctionDispatcher<mat_mult_ptr> mat_mult;
};
////////////////////////////////////////////////////////////////////////////

7
Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp

@ -380,7 +380,6 @@ plProfile_CreateTimer(" CIRecalcT", "Object", CIRecalcT);
plProfile_CreateTimer(" CIDirtyT", "Object", CIDirtyT);
plProfile_CreateTimer(" CISetT", "Object", CISetT);
#ifndef HAVE_SSE
static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
{
hsMatrix44 ret;
@ -441,7 +440,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
return ret;
}
#endif // HAVE_SSE
void plCoordinateInterface::IRecalcTransforms()
{
@ -449,13 +447,8 @@ void plCoordinateInterface::IRecalcTransforms()
plProfile_BeginTiming(CIRecalcT);
if( fParent )
{
#ifdef HAVE_SSE
fLocalToWorld = fParent->GetLocalToWorld() * fLocalToParent;
fWorldToLocal = fParentToLocal * fParent->GetWorldToLocal();
#else
fLocalToWorld = IMatrixMul34(fParent->GetLocalToWorld(), fLocalToParent);
fWorldToLocal = IMatrixMul34(fParentToLocal, fParent->GetWorldToLocal());
#endif
}
else
{

8
Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp

@ -416,7 +416,6 @@ hsBool plDrawableSpans::IBoundsInvalid(const hsBounds3Ext& bnd) const
}
//// SetTransform ////////////////////////////////////////////////////////////
#ifndef HAVE_SSE
static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
{
hsMatrix44 ret;
@ -477,7 +476,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
return ret;
}
#endif
#ifdef MF_TEST_UPDATE
plProfile_CreateCounter("DSSetTrans", "Update", DSSetTrans);
@ -521,13 +519,9 @@ plDrawable& plDrawableSpans::SetTransform( uint32_t index, const hsMatrix44& l2w
#endif // MF_TEST_UPDATE
for( i = 0; i < spans->GetCount(); i++ )
{
#ifdef HAVE_SSE
fLocalToWorlds[ (*spans)[ i ] ] = l2w * fLocalToBones[ (*spans)[ i ] ];
fWorldToLocals[ (*spans)[ i ] ] = fBoneToLocals[ (*spans)[ i ] ] * w2l;
#else
fLocalToWorlds[ (*spans)[ i ] ] = IMatrixMul34(l2w, fLocalToBones[ (*spans)[ i ] ]);
fWorldToLocals[ (*spans)[ i ] ] = IMatrixMul34(fBoneToLocals[ (*spans)[ i ] ], w2l);
#endif // HAVE_SSE
}
#ifdef MF_TEST_UPDATE
plProfile_EndTiming(DSMatTransT);

491
Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp

@ -163,8 +163,8 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
#include <algorithm>
#ifdef HAVE_SSE
# include <smmintrin.h>
#ifdef HS_SIMD_INCLUDE
# include HS_SIMD_INCLUDE
#endif
//#define MF_TOSSER
@ -10525,48 +10525,35 @@ void plDXPipeline::LoadResources()
plNetClientApp::StaticDebugMsg("End Device Reload");
}
// Sorry about this, but it really did speed up the skinning.
// Just some macros for the inner loop of IBlendVertsIntoBuffer.
#ifdef HAVE_SSE
# define MATRIXMULTBEGIN(xfm, wgt) \
__m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \
ALIGN(16) float hack[4]; \
mc0 = _mm_loadu_ps(xfm.fMap[0]); \
mc1 = _mm_loadu_ps(xfm.fMap[1]); \
mc2 = _mm_loadu_ps(xfm.fMap[2]); \
mwt = _mm_set_ps1(wgt);
# define MATRIXMULTPOINTADD(dst, src) \
msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
_y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
\
hbuf = _mm_hadd_ps(_x, _y); \
hbuf = _mm_hadd_ps(hbuf, hbuf); \
_mm_store_ps(hack, hbuf); \
dst.fX += hack[0]; \
dst.fY += hack[1]; \
hbuf = _mm_hadd_ps(_z, _z); \
hbuf = _mm_hadd_ps(hbuf, hbuf); \
_mm_store_ps(hack, hbuf); \
dst.fZ += hack[0];
# define MATRIXMULTVECTORADD(dst, src) \
msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
_y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
\
hbuf = _mm_hadd_ps(_x, _y); \
hbuf = _mm_hadd_ps(hbuf, hbuf); \
_mm_store_ps(hack, hbuf); \
dst.fX += hack[0]; \
dst.fY += hack[1]; \
hbuf = _mm_hadd_ps(_z, _z); \
hbuf = _mm_hadd_ps(hbuf, hbuf); \
_mm_store_ps(hack, hbuf); \
dst.fZ += hack[0];
#else
# define MATRIXMULTBEGIN(xfm, wgt) \
// inlTESTPOINT /////////////////////////////////////////
// Update mins and maxs if destP is outside.
inline void inlTESTPOINT(const hsPoint3& destP,
float& minX, float& minY, float& minZ,
float& maxX, float& maxY, float& maxZ)
{
if( destP.fX < minX )
minX = destP.fX;
else if( destP.fX > maxX )
maxX = destP.fX;
if( destP.fY < minY )
minY = destP.fY;
else if( destP.fY > maxY )
maxY = destP.fY;
if( destP.fZ < minZ )
minZ = destP.fZ;
else if( destP.fZ > maxZ )
maxZ = destP.fZ;
}
//// IBlendVertsIntoBuffer ////////////////////////////////////////////////////
// Given a pointer into a buffer of verts that have blending data in the D3D
// format, blends them into the destination buffer given without the blending
// info.
// FPU version
#define MATRIXMULTBEGIN_FPU(xfm, wgt) \
float m00 = xfm.fMap[0][0]; \
float m01 = xfm.fMap[0][1]; \
float m02 = xfm.fMap[0][2]; \
@ -10581,7 +10568,7 @@ void plDXPipeline::LoadResources()
float m23 = xfm.fMap[2][3]; \
float m_wgt = wgt; \
float srcX, srcY, srcZ;
# define MATRIXMULTPOINTADD(dst, src) \
#define MATRIXMULTPOINTADD_FPU(dst, src) \
srcX = src.fX; \
srcY = src.fY; \
srcZ = src.fZ; \
@ -10589,7 +10576,7 @@ void plDXPipeline::LoadResources()
dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
# define MATRIXMULTVECTORADD(dst, src) \
#define MATRIXMULTVECTORADD_FPU(dst, src) \
srcX = src.fX; \
srcY = src.fY; \
srcZ = src.fZ; \
@ -10597,218 +10584,250 @@ void plDXPipeline::LoadResources()
dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
#endif // HAVE_SSE
// inlTESTPOINT /////////////////////////////////////////
// Update mins and maxs if destP is outside.
inline void inlTESTPOINT(const hsPoint3& destP,
float& minX, float& minY, float& minZ,
float& maxX, float& maxY, float& maxZ)
{
if( destP.fX < minX )
minX = destP.fX;
else if( destP.fX > maxX )
maxX = destP.fX;
if( destP.fY < minY )
minY = destP.fY;
else if( destP.fY > maxY )
maxY = destP.fY;
if( destP.fZ < minZ )
minZ = destP.fZ;
else if( destP.fZ > maxZ )
maxZ = destP.fZ;
}
//// IBlendVertsIntoBuffer ////////////////////////////////////////////////////
// Given a pointer into a buffer of verts that have blending data in the D3D
// format, blends them into the destination buffer given without the blending
// info.
void plDXPipeline::IBlendVertsIntoBuffer( plSpan* span,
hsMatrix44* matrixPalette, int numMatrices,
const uint8_t *src, uint8_t format, uint32_t srcStride,
uint8_t *dest, uint32_t destStride, uint32_t count,
uint16_t localUVWChans )
{
uint8_t numUVs, numWeights;
uint32_t i, j, indices, color, specColor, uvChanSize;
float weights[ 4 ], weightSum;
hsPoint3 pt, tempPt, destPt;
hsVector3 vec, tempNorm, destNorm;
/// Get some counts
switch( format & plGBufferGroup::kSkinWeightMask )
{
case plGBufferGroup::kSkin1Weight: numWeights = 1; break;
case plGBufferGroup::kSkin2Weights: numWeights = 2; break;
case plGBufferGroup::kSkin3Weights: numWeights = 3; break;
default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" );
}
numUVs = plGBufferGroup::CalcNumUVs( format );
uvChanSize = numUVs * sizeof( float ) * 3;
//#define MF_RECALC_BOUNDS
#ifdef MF_RECALC_BOUNDS
float minX = 1.e33f;
float minY = 1.e33f;
float minZ = 1.e33f;
// SSE3 version
#ifdef HS_SSE3
#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
__m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
ALIGN(16) float hack[4]; \
mc0 = _mm_loadu_ps(xfm.fMap[0]); \
mc1 = _mm_loadu_ps(xfm.fMap[1]); \
mc2 = _mm_loadu_ps(xfm.fMap[2]); \
mwt = _mm_set_ps1(wgt);
#define MATRIXMULTPOINTADD_SSE3(dst, src) \
msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
_y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
\
hbuf1 = _mm_hadd_ps(_x, _y); \
hbuf2 = _mm_hadd_ps(_z, _z); \
hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
_mm_store_ps(hack, hbuf1); \
dst.fX += hack[0]; \
dst.fY += hack[1]; \
dst.fZ += hack[2];
#define MATRIXMULTVECTORADD_SSE3(dst, src) \
msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
_x = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
_y = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
_z = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
\
hbuf1 = _mm_hadd_ps(_x, _y); \
hbuf2 = _mm_hadd_ps(_z, _z); \
hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
_mm_store_ps(hack, hbuf1); \
dst.fX += hack[0]; \
dst.fY += hack[1]; \
dst.fZ += hack[2];
#endif
float maxX = -1.e33f;
float maxY = -1.e33f;
float maxZ = -1.e33f;
#endif // MF_RECALC_BOUNDS
// CPU-optimized functions requiring dispatch
hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3);
// localUVWChans is bump mapping tangent space vectors, which need to
// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
#define BLENDVERTSTART \
uint8_t numUVs, numWeights; \
uint32_t i, j, indices, color, specColor, uvChanSize; \
float weights[ 4 ], weightSum; \
hsPoint3 pt, tempPt, destPt; \
hsVector3 vec, tempNorm, destNorm; \
\
/* Get some counts */\
switch( format & plGBufferGroup::kSkinWeightMask ) \
{ \
case plGBufferGroup::kSkin1Weight: numWeights = 1; break; \
case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \
case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \
default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \
} \
\
numUVs = plGBufferGroup::CalcNumUVs( format ); \
uvChanSize = numUVs * sizeof( float ) * 3; \
\
/* localUVWChans is bump mapping tangent space vectors, which need to
// be skinned like the normal, as opposed to passed through like
// garden variety UVW coordinates.
// There are no localUVWChans that I know of in production assets (i.e.
// the avatar is not skinned).
if( !localUVWChans )
{
/// Copy whilst blending
for( i = 0; i < count; i++ )
{
// Extract data
src = inlExtractPoint( src, pt );
for( j = 0, weightSum = 0; j < numWeights; j++ )
{
src = inlExtractFloat( src, weights[ j ] );
weightSum += weights[ j ];
}
weights[ j ] = 1 - weightSum;
if( format & plGBufferGroup::kSkinIndices )
{
src = inlExtractUInt32( src, indices );
}
else
{
indices = 1 << 8;
}
src = inlExtractPoint( src, vec );
src = inlExtractUInt32( src, color );
src = inlExtractUInt32( src, specColor );
// Blend
destPt.Set( 0, 0, 0 );
destNorm.Set( 0, 0, 0 );
for( j = 0; j < numWeights + 1; j++ )
{
if( weights[ j ] )
// the avatar is not skinned).*/\
if( !localUVWChans ) \
{ \
/* Copy whilst blending */\
for( i = 0; i < count; i++ ) \
{ \
/* Extract data */\
src = inlExtractPoint( src, pt ); \
for( j = 0, weightSum = 0; j < numWeights; j++ ) \
{ \
src = inlExtractFloat( src, weights[ j ] ); \
weightSum += weights[ j ]; \
} \
weights[ j ] = 1 - weightSum; \
\
if( format & plGBufferGroup::kSkinIndices ) \
{ \
src = inlExtractUInt32( src, indices ); \
} \
else \
{ \
indices = 1 << 8; \
} \
src = inlExtractPoint( src, vec ); \
src = inlExtractUInt32( src, color ); \
src = inlExtractUInt32( src, specColor ); \
\
/* Blend */\
destPt.Set( 0, 0, 0 ); \
destNorm.Set( 0, 0, 0 ); \
for( j = 0; j < numWeights + 1; j++ ) \
{ \
if( weights[ j ] ) \
{
/*
MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD(destPt, pt);
MATRIXMULTVECTORADD(destNorm, vec);
}
indices >>= 8;
}
// Probably don't really need to renormalize this. There errors are
// going to be subtle and "smooth".
// hsFastMath::NormalizeAppr(destNorm);
#ifdef MF_RECALC_BOUNDS
inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ);
#endif // MF_RECALC_BOUNDS
// Slam data into position now
dest = inlStuffPoint( dest, destPt );
dest = inlStuffPoint( dest, destNorm );
dest = inlStuffUInt32( dest, color );
dest = inlStuffUInt32( dest, specColor );
memcpy( dest, src, uvChanSize );
src += uvChanSize;
dest += uvChanSize;
}
}
else
{
uint8_t hiChan = localUVWChans >> 8;
uint8_t loChan = localUVWChans & 0xff;
/// Copy whilst blending
for( i = 0; i < count; i++ )
{
hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels];
hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels];
// Extract data
src = inlExtractPoint( src, pt );
for( j = 0, weightSum = 0; j < numWeights; j++ )
{
src = inlExtractFloat( src, weights[ j ] );
weightSum += weights[ j ];
}
weights[ j ] = 1 - weightSum;
if( format & plGBufferGroup::kSkinIndices )
{
src = inlExtractUInt32( src, indices );
}
else
{
indices = 1 << 8;
}
src = inlExtractPoint( src, vec );
src = inlExtractUInt32( src, color );
src = inlExtractUInt32( src, specColor );
uint8_t k;
for( k = 0; k < numUVs; k++ )
{
src = inlExtractPoint( src, srcUVWs[k] );
}
memcpy( dstUVWs, srcUVWs, uvChanSize);
dstUVWs[loChan].Set(0,0,0);
dstUVWs[hiChan].Set(0,0,0);
// Blend
destPt.Set( 0, 0, 0 );
destNorm.Set( 0, 0, 0 );
for( j = 0; j < numWeights + 1; j++ )
{
if( weights[ j ] )
{
*/
#define BLENDVERTMID \
} \
\
indices >>= 8; \
} \
/* Probably don't really need to renormalize this. There errors are
// going to be subtle and "smooth".*/\
/* hsFastMath::NormalizeAppr(destNorm);*/ \
\
/* Slam data into position now */\
dest = inlStuffPoint( dest, destPt ); \
dest = inlStuffPoint( dest, destNorm ); \
dest = inlStuffUInt32( dest, color ); \
dest = inlStuffUInt32( dest, specColor ); \
memcpy( dest, src, uvChanSize ); \
src += uvChanSize; \
dest += uvChanSize; \
} \
} \
else \
{ \
uint8_t hiChan = localUVWChans >> 8; \
uint8_t loChan = localUVWChans & 0xff; \
/* Copy whilst blending */\
for( i = 0; i < count; i++ ) \
{ \
hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \
\
/* Extract data */\
src = inlExtractPoint( src, pt ); \
for( j = 0, weightSum = 0; j < numWeights; j++ ) \
{ \
src = inlExtractFloat( src, weights[ j ] ); \
weightSum += weights[ j ]; \
} \
weights[ j ] = 1 - weightSum; \
\
if( format & plGBufferGroup::kSkinIndices ) \
{ \
src = inlExtractUInt32( src, indices ); \
} \
else \
{ \
indices = 1 << 8; \
} \
\
src = inlExtractPoint( src, vec ); \
src = inlExtractUInt32( src, color ); \
src = inlExtractUInt32( src, specColor ); \
\
uint8_t k; \
for( k = 0; k < numUVs; k++ ) \
{ \
src = inlExtractPoint( src, srcUVWs[k] ); \
} \
memcpy( dstUVWs, srcUVWs, uvChanSize); \
dstUVWs[loChan].Set(0,0,0); \
dstUVWs[hiChan].Set(0,0,0); \
\
/* Blend */\
destPt.Set( 0, 0, 0 ); \
destNorm.Set( 0, 0, 0 ); \
for( j = 0; j < numWeights + 1; j++ ) \
{ \
if( weights[ j ] ) \
{ \
/*
MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD(destPt, pt);
MATRIXMULTVECTORADD(destNorm, vec);
MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]);
*/
#define BLENDVERTEND \
} \
\
indices >>= 8; \
} \
/* Probably don't really need to renormalize this. There errors are
// going to be subtle and "smooth". */\
/* hsFastMath::NormalizeAppr(destNorm); */\
/* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\
/* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\
\
/* Slam data into position now */\
dest = inlStuffPoint( dest, destPt ); \
dest = inlStuffPoint( dest, destNorm ); \
dest = inlStuffUInt32( dest, color ); \
dest = inlStuffUInt32( dest, specColor ); \
memcpy( dest, dstUVWs, uvChanSize ); \
dest += uvChanSize; \
} \
}
indices >>= 8;
}
// Probably don't really need to renormalize this. There errors are
// going to be subtle and "smooth".
// hsFastMath::NormalizeAppr(destNorm);
// hsFastMath::NormalizeAppr(dstUVWs[loChan]);
// hsFastMath::NormalizeAppr(dstUVWs[hiChan]);
void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
hsMatrix44* matrixPalette, int numMatrices,
const uint8_t *src, uint8_t format, uint32_t srcStride,
uint8_t *dest, uint32_t destStride, uint32_t count,
uint16_t localUVWChans )
{
BLENDVERTSTART
MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
#ifdef MF_RECALC_BOUNDS
inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ);
#endif // MF_RECALC_BOUNDS
MATRIXMULTPOINTADD_FPU(destPt, pt);
MATRIXMULTVECTORADD_FPU(destNorm, vec);
BLENDVERTMID
MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
// Slam data into position now
dest = inlStuffPoint( dest, destPt );
dest = inlStuffPoint( dest, destNorm );
dest = inlStuffUInt32( dest, color );
dest = inlStuffUInt32( dest, specColor );
memcpy( dest, dstUVWs, uvChanSize );
dest += uvChanSize;
}
MATRIXMULTPOINTADD_FPU(destPt, pt);
MATRIXMULTVECTORADD_FPU(destNorm, vec);
MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
BLENDVERTEND
}
#ifdef MF_RECALC_BOUNDS
hsBounds3Ext wBnd;
wBnd.Reset(&hsPoint3(minX, minY, minZ));
wBnd.Union(&hsPoint3(maxX, maxY, maxZ));
span->fWorldBounds = wBnd;
#endif // MF_RECALC_BOUNDS
void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
hsMatrix44* matrixPalette, int numMatrices,
const uint8_t *src, uint8_t format, uint32_t srcStride,
uint8_t *dest, uint32_t destStride, uint32_t count,
uint16_t localUVWChans )
{
#ifdef HS_SSE3
BLENDVERTSTART
MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD_SSE3(destPt, pt);
MATRIXMULTVECTORADD_SSE3(destNorm, vec);
BLENDVERTMID
MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
MATRIXMULTPOINTADD_SSE3(destPt, pt);
MATRIXMULTVECTORADD_SSE3(destNorm, vec);
MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
BLENDVERTEND
#endif // HS_SSE3
}
// ISetPipeConsts //////////////////////////////////////////////////////////////////

10
Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h

@ -465,7 +465,8 @@ protected:
void IBlendVertsIntoBuffer( plSpan* span,
hsMatrix44* matrixPalette, int numMatrices,
const uint8_t *src, uint8_t format, uint32_t srcStride,
uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans );
uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans )
{ blend_vert_buffer.call(span, matrixPalette, numMatrices, src, format, srcStride, dest, destStride, count, localUVWChans); };
hsBool ISoftwareVertexBlend( plDrawableSpans* drawable, const hsTArray<int16_t>& visList );
@ -798,6 +799,13 @@ public:
virtual int GetMaxAnisotropicSamples();
virtual int GetMaxAntiAlias(int Width, int Height, int ColorDepth);
// CPU-optimized functions
protected:
typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
};

Loading…
Cancel
Save