Merge pull request #202 from Deledrius/hsCpuID

Add CPU feature detection and function dispatcher
13 years ago · bde67949ac
11 changed files with 588 additions and 327 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -84,12 +84,6 @@ if(MSVC)
    add_definitions(-D_SCL_SECURE_NO_WARNINGS)
 endif(MSVC)
 # TODO: Maybe some kind of automated test here?
 option(PLASMA_USE_SSE "Enable SSE optimizations?" ON)
 if(PLASMA_USE_SSE)
    add_definitions(-DHAVE_SSE)
 endif(PLASMA_USE_SSE)
 #TODO: Make the OpenSSL includes less promiscuous so this isn't needed
 include_directories(${OPENSSL_INCLUDE_DIR})
--- a/Sources/Plasma/Apps/plClient/winmain.cpp
+++ b/Sources/Plasma/Apps/plClient/winmain.cpp
@ -49,10 +49,6 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
    #include <dmdfm.h>      // Windows Load EXE into memory suff
 #endif
 #ifdef HAVE_SSE
 #   include <intrin.h>
 #endif
 #include <curl/curl.h>
 #include "HeadSpin.h"
@ -1388,35 +1384,11 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except
 }
 #endif
 bool CheckCPU()
 {
    const unsigned int sse3_flag = 0x00000001;
    // (any other CPU features...)
    int cpu_info[4];
    __cpuid(cpu_info, 1);
 #ifdef HAVE_SSE
    if((cpu_info[2] & sse3_flag) == 0)
        return false;
 #endif
    // Insert additional feature checks here
    return true;
 }
 #include "pfConsoleCore/pfConsoleEngine.h"
 PF_CONSOLE_LINK_ALL()
 int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow)
 {
    // Check to make sure we have a good CPU before getting started
    if (!CheckCPU())
    {
        plString msg = plString::Format("Your processor does not support all of the features required to play %S.", ProductLongName());
        hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError);
        return PARABLE_NORMAL_EXIT;
    }
    PF_CONSOLE_INIT_ALL()
    // Set global handle
--- a/Sources/Plasma/CoreLib/CMakeLists.txt
+++ b/Sources/Plasma/CoreLib/CMakeLists.txt
@ -15,6 +15,7 @@ set(CoreLib_SOURCES
    HeadSpin.cpp
    hsBitVector.cpp
    hsBounds.cpp
    hsCpuID.cpp
    hsCritSect.cpp
    hsExceptionStack.cpp
    hsFastMath.cpp
@ -57,6 +58,7 @@ set(CoreLib_HEADERS
    hsBitVector.h
    hsBounds.h
    hsColorRGBA.h
    hsCpuID.h
    hsCritSect.h
    hsExceptions.h
    hsFastMath.h
--- a/Sources/Plasma/CoreLib/hsCpuID.cpp
+++ b/Sources/Plasma/CoreLib/hsCpuID.cpp
@ -0,0 +1,71 @@
 /*==LICENSE==*
 CyanWorlds.com Engine - MMOG client, server and tools
 Copyright (C) 2011  Cyan Worlds, Inc.
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 Additional permissions under GNU GPL version 3 section 7
 If you modify this Program, or any covered work, by linking or
 combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
 NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
 JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
 (or a modified version of those libraries),
 containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
 PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
 JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
 licensors of this Program grant you additional
 permission to convey the resulting work. Corresponding Source for a
 non-source form of such a combination shall include the source code for
 the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
 work.
 You can contact Cyan Worlds, Inc. by email legal@cyan.com
 or by snail mail at:
      Cyan Worlds, Inc.
      14617 N Newport Hwy
      Mead, WA   99021
 *==LICENSE==*/
 #include <intrin.h>
 #include "hsCpuID.h"
 hsCpuId::hsCpuId() {
    const unsigned int sse1_flag = 1<<25;
    const unsigned int sse2_flag = 1<<26;
    const unsigned int sse3_flag = 1<<0;
    const unsigned int ssse3_flag = 1<<9;
    const unsigned int sse41_flag = 1<<19;
    const unsigned int sse42_flag = 1<<20;
    const unsigned int avx_flag = 1 << 28;
    unsigned int cpu_info[4];
    __cpuid((int*)cpu_info, 1);
    has_sse1 = (cpu_info[3] & sse1_flag) || false;
    has_sse2 = (cpu_info[3] & sse2_flag) || false;
    has_sse3 = (cpu_info[2] & sse3_flag) || false;
    has_ssse3 = (cpu_info[2] & ssse3_flag) || false;
    has_sse41 = (cpu_info[2] & sse41_flag) || false;
    has_sse42 = (cpu_info[2] & sse42_flag) || false;
    has_avx = (cpu_info[2] & avx_flag) || false;
 }
 const hsCpuId& hsCpuId::instance()
 {
    static hsCpuId self;
    return self;
 }
--- a/Sources/Plasma/CoreLib/hsCpuID.h
+++ b/Sources/Plasma/CoreLib/hsCpuID.h
@ -0,0 +1,182 @@
 /*==LICENSE==*
 CyanWorlds.com Engine - MMOG client, server and tools
 Copyright (C) 2011  Cyan Worlds, Inc.
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 Additional permissions under GNU GPL version 3 section 7
 If you modify this Program, or any covered work, by linking or
 combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
 NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
 JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
 (or a modified version of those libraries),
 containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
 PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
 JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
 licensors of this Program grant you additional
 permission to convey the resulting work. Corresponding Source for a
 non-source form of such a combination shall include the source code for
 the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
 work.
 You can contact Cyan Worlds, Inc. by email legal@cyan.com
 or by snail mail at:
      Cyan Worlds, Inc.
      14617 N Newport Hwy
      Mead, WA   99021
 *==LICENSE==*/
 //////////////////////////////////////////////////////////////////////
 //
 // hsCpuID - Processor feature detection and function dispatcher
 //
 //
 //     == Example Usage ==
 //
 //  #ifdef HS_SIMD_INCLUDE
 //  #   include HS_SIMD_INCLUDE
 //  #endif
 //
 //  float my_func_fpu() {
 //    ...
 //  }
 //
 //  float my_func_avx() {
 //  #ifdef HS_AVX
 //    ...
 //  #endif
 //  }
 //
 //
 //  typedef float(*func_ptr)();
 //  static hsFunctionDispatcher<func_ptr> my_func;
 //
 //  hsFunctionDispatcher<float::func_ptr> float::my_func(float::my_func_fpu, 0, 0, 0, 0, 0, 0, float::my_func_avx);
 //
 //////////////////////////////////////////////////////////////////////
 #ifndef hsCpuID_inc
 #define hsCpuID_inc
 #if defined __AVX__ || _MSC_VER >= 1600
 #define HS_AVX
 #ifndef HS_SIMD_INCLUDE
 # define HS_SIMD_INCLUDE "immintrin.h"
 #endif
 #endif
 #if defined __SSE4_2__ || _MSC_VER >= 1600
 #define HS_SSE42
 #ifndef HS_SIMD_INCLUDE
 # define HS_SIMD_INCLUDE "nmmintrin.h"
 #endif
 #endif
 #if defined __SSE4_1__ || _MSC_VER >= 1600
 #define HS_SSE41
 #ifndef HS_SIMD_INCLUDE
 # define HS_SIMD_INCLUDE "smmintrin.h"
 #endif
 #endif
 #if defined __SSSE3__ || _MSC_VER >= 1600
 #define HS_SSSE3
 #ifndef HS_SIMD_INCLUDE
 # define HS_SIMD_INCLUDE "tmmintrin.h"
 #endif
 #endif
 #if defined __SSE3__ || _MSC_VER >= 1400
 #define HS_SSE3
 #ifndef HS_SIMD_INCLUDE
 # define HS_SIMD_INCLUDE "pmmintrin.h"
 #endif
 #endif
 #if defined __SSE2__ || _MSC_VER >= 1300
 #define HS_SSE2
 #ifndef HS_SIMD_INCLUDE
 # define HS_SIMD_INCLUDE "emmintrin.h"
 #endif
 #endif
 #if defined __SSE__ || _MSC_VER >= 1300
 #define HS_SSE1
 #ifndef HS_SIMD_INCLUDE
 # define HS_SIMD_INCLUDE "xmmintrin.h"
 #endif
 #endif
 struct hsCpuId {
    bool has_sse1;
    bool has_sse2;
    bool has_sse3;
    bool has_ssse3;
    bool has_sse41;
    bool has_sse42;
    bool has_avx;
    hsCpuId();
    static const hsCpuId& instance();
 };
 template <typename func_ptr>
 struct hsFunctionDispatcher {
    hsFunctionDispatcher(func_ptr fpu, func_ptr sse1=0, func_ptr sse2=0, func_ptr sse3=0, func_ptr ssse3=0, func_ptr sse41=0, func_ptr sse42=0, func_ptr avx=0) {
        hsAssert(fpu, "FPU fallback function required.");
        const hsCpuId& cpu = hsCpuId::instance();
 #ifdef HS_AVX
        if (cpu.has_avx && avx) {
            call = avx;
        } else
 #endif
 #ifdef HS_SSE42
        if (cpu.has_sse42 && sse42) {
            call = sse42;
        } else
 #endif
 #ifdef HS_SSE41
        if (cpu.has_sse41 && sse41) {
            call = sse41;
        } else
 #endif
 #ifdef HS_SSSE3
        if (cpu.has_ssse3 && ssse3) {
            call = ssse3;
        } else
 #endif
 #ifdef HS_SSE3
        if (cpu.has_sse3 && sse3) {
            call = sse3;
        } else
 #endif
 #ifdef HS_SSE2
        if (cpu.has_sse2 && sse2) {
            call = sse2;
        } else
 #endif
 #ifdef HS_SSE1
        if (cpu.has_sse1 && sse1) {
            call = sse1;
        } else
 #endif
        {
            call = fpu;
        }
    };
    func_ptr call;
 };
 #endif  // hsCpuID_inc
--- a/Sources/Plasma/CoreLib/hsMatrix44.cpp
+++ b/Sources/Plasma/CoreLib/hsMatrix44.cpp
@ -47,13 +47,16 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #include "hsStream.h"
 #include <math.h>
-#ifdef HAVE_SSE
+#ifdef HS_SIMD_INCLUDE
-#   include <smmintrin.h>
+#  include HS_SIMD_INCLUDE
 #endif
 static hsMatrix44 myIdent = hsMatrix44().Reset();
 const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; }
 // CPU-optimized functions requiring dispatch
 hsFunctionDispatcher<hsMatrix44::mat_mult_ptr> hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3);
 /*
    For the rotation:
         ¦        2     2                                      ¦
@ -96,9 +99,47 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
    rotate.QuatFromMatrix44(*this);
 }
-#ifdef HAVE_SSE
+hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
 {
    hsMatrix44 c;
    if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
    {
        c.Reset();
        return c;
    }
    if( a.fFlags & hsMatrix44::kIsIdent )
        return b;
    if( b.fFlags & hsMatrix44::kIsIdent )
        return a;
    c.fMap[0][0] = (a.fMap[0][0] * b.fMap[0][0]) + (a.fMap[0][1] * b.fMap[1][0]) + (a.fMap[0][2] * b.fMap[2][0]) + (a.fMap[0][3] * b.fMap[3][0]);
    c.fMap[0][1] = (a.fMap[0][0] * b.fMap[0][1]) + (a.fMap[0][1] * b.fMap[1][1]) + (a.fMap[0][2] * b.fMap[2][1]) + (a.fMap[0][3] * b.fMap[3][1]);
    c.fMap[0][2] = (a.fMap[0][0] * b.fMap[0][2]) + (a.fMap[0][1] * b.fMap[1][2]) + (a.fMap[0][2] * b.fMap[2][2]) + (a.fMap[0][3] * b.fMap[3][2]);
    c.fMap[0][3] = (a.fMap[0][0] * b.fMap[0][3]) + (a.fMap[0][1] * b.fMap[1][3]) + (a.fMap[0][2] * b.fMap[2][3]) + (a.fMap[0][3] * b.fMap[3][3]);
    c.fMap[1][0] = (a.fMap[1][0] * b.fMap[0][0]) + (a.fMap[1][1] * b.fMap[1][0]) + (a.fMap[1][2] * b.fMap[2][0]) + (a.fMap[1][3] * b.fMap[3][0]);
    c.fMap[1][1] = (a.fMap[1][0] * b.fMap[0][1]) + (a.fMap[1][1] * b.fMap[1][1]) + (a.fMap[1][2] * b.fMap[2][1]) + (a.fMap[1][3] * b.fMap[3][1]);
    c.fMap[1][2] = (a.fMap[1][0] * b.fMap[0][2]) + (a.fMap[1][1] * b.fMap[1][2]) + (a.fMap[1][2] * b.fMap[2][2]) + (a.fMap[1][3] * b.fMap[3][2]);
    c.fMap[1][3] = (a.fMap[1][0] * b.fMap[0][3]) + (a.fMap[1][1] * b.fMap[1][3]) + (a.fMap[1][2] * b.fMap[2][3]) + (a.fMap[1][3] * b.fMap[3][3]);
    c.fMap[2][0] = (a.fMap[2][0] * b.fMap[0][0]) + (a.fMap[2][1] * b.fMap[1][0]) + (a.fMap[2][2] * b.fMap[2][0]) + (a.fMap[2][3] * b.fMap[3][0]);
    c.fMap[2][1] = (a.fMap[2][0] * b.fMap[0][1]) + (a.fMap[2][1] * b.fMap[1][1]) + (a.fMap[2][2] * b.fMap[2][1]) + (a.fMap[2][3] * b.fMap[3][1]);
    c.fMap[2][2] = (a.fMap[2][0] * b.fMap[0][2]) + (a.fMap[2][1] * b.fMap[1][2]) + (a.fMap[2][2] * b.fMap[2][2]) + (a.fMap[2][3] * b.fMap[3][2]);
    c.fMap[2][3] = (a.fMap[2][0] * b.fMap[0][3]) + (a.fMap[2][1] * b.fMap[1][3]) + (a.fMap[2][2] * b.fMap[2][3]) + (a.fMap[2][3] * b.fMap[3][3]);
    c.fMap[3][0] = (a.fMap[3][0] * b.fMap[0][0]) + (a.fMap[3][1] * b.fMap[1][0]) + (a.fMap[3][2] * b.fMap[2][0]) + (a.fMap[3][3] * b.fMap[3][0]);
    c.fMap[3][1] = (a.fMap[3][0] * b.fMap[0][1]) + (a.fMap[3][1] * b.fMap[1][1]) + (a.fMap[3][2] * b.fMap[2][1]) + (a.fMap[3][3] * b.fMap[3][1]);
    c.fMap[3][2] = (a.fMap[3][0] * b.fMap[0][2]) + (a.fMap[3][1] * b.fMap[1][2]) + (a.fMap[3][2] * b.fMap[2][2]) + (a.fMap[3][3] * b.fMap[3][2]);
    c.fMap[3][3] = (a.fMap[3][0] * b.fMap[0][3]) + (a.fMap[3][1] * b.fMap[1][3]) + (a.fMap[3][2] * b.fMap[2][3]) + (a.fMap[3][3] * b.fMap[3][3]);
    return c;
 }
 #ifdef HS_SSE3
 #   define MULTBEGIN(i) \
-        xmm[0]   = _mm_loadu_ps(fMap[i]);
+        xmm[0]   = _mm_loadu_ps(a.fMap[i]);
 #   define MULTCELL(i, j) \
        xmm[1]   = _mm_set_ps(b.fMap[3][j], b.fMap[2][j], b.fMap[1][j], b.fMap[0][j]); \
        xmm[j+2] = _mm_mul_ps(xmm[0], xmm[1]);
@ -107,24 +148,23 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
        xmm[7] = _mm_hadd_ps(xmm[4], xmm[5]); \
        xmm[1] = _mm_hadd_ps(xmm[6], xmm[7]); \
        _mm_storeu_ps(c.fMap[i], xmm[1]);
-#endif
+#endif  // HS_SSE3
-hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
+hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
 {
    hsMatrix44 c;
-
+#ifdef HS_SSE3
-    if( fFlags & b.fFlags & hsMatrix44::kIsIdent )
+    if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
    {
        c.Reset();
        return c;
    }
-    if( fFlags & hsMatrix44::kIsIdent )
+    if( a.fFlags & hsMatrix44::kIsIdent )
        return b;
    if( b.fFlags & hsMatrix44::kIsIdent )
-        return *this;
+        return a;
 #ifdef HAVE_SSE
    __m128 xmm[8];
    MULTBEGIN(0);
@ -154,28 +194,7 @@ hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
    MULTCELL(3, 2);
    MULTCELL(3, 3);
    MULTFINISH(3);
-#else
+#endif  // HS_SSE3
    c.fMap[0][0] = (fMap[0][0] * b.fMap[0][0]) + (fMap[0][1] * b.fMap[1][0]) + (fMap[0][2] * b.fMap[2][0]) + (fMap[0][3] * b.fMap[3][0]);
    c.fMap[0][1] = (fMap[0][0] * b.fMap[0][1]) + (fMap[0][1] * b.fMap[1][1]) + (fMap[0][2] * b.fMap[2][1]) + (fMap[0][3] * b.fMap[3][1]);
    c.fMap[0][2] = (fMap[0][0] * b.fMap[0][2]) + (fMap[0][1] * b.fMap[1][2]) + (fMap[0][2] * b.fMap[2][2]) + (fMap[0][3] * b.fMap[3][2]);
    c.fMap[0][3] = (fMap[0][0] * b.fMap[0][3]) + (fMap[0][1] * b.fMap[1][3]) + (fMap[0][2] * b.fMap[2][3]) + (fMap[0][3] * b.fMap[3][3]);
    c.fMap[1][0] = (fMap[1][0] * b.fMap[0][0]) + (fMap[1][1] * b.fMap[1][0]) + (fMap[1][2] * b.fMap[2][0]) + (fMap[1][3] * b.fMap[3][0]);
    c.fMap[1][1] = (fMap[1][0] * b.fMap[0][1]) + (fMap[1][1] * b.fMap[1][1]) + (fMap[1][2] * b.fMap[2][1]) + (fMap[1][3] * b.fMap[3][1]);
    c.fMap[1][2] = (fMap[1][0] * b.fMap[0][2]) + (fMap[1][1] * b.fMap[1][2]) + (fMap[1][2] * b.fMap[2][2]) + (fMap[1][3] * b.fMap[3][2]);
    c.fMap[1][3] = (fMap[1][0] * b.fMap[0][3]) + (fMap[1][1] * b.fMap[1][3]) + (fMap[1][2] * b.fMap[2][3]) + (fMap[1][3] * b.fMap[3][3]);
    c.fMap[2][0] = (fMap[2][0] * b.fMap[0][0]) + (fMap[2][1] * b.fMap[1][0]) + (fMap[2][2] * b.fMap[2][0]) + (fMap[2][3] * b.fMap[3][0]);
    c.fMap[2][1] = (fMap[2][0] * b.fMap[0][1]) + (fMap[2][1] * b.fMap[1][1]) + (fMap[2][2] * b.fMap[2][1]) + (fMap[2][3] * b.fMap[3][1]);
    c.fMap[2][2] = (fMap[2][0] * b.fMap[0][2]) + (fMap[2][1] * b.fMap[1][2]) + (fMap[2][2] * b.fMap[2][2]) + (fMap[2][3] * b.fMap[3][2]);
    c.fMap[2][3] = (fMap[2][0] * b.fMap[0][3]) + (fMap[2][1] * b.fMap[1][3]) + (fMap[2][2] * b.fMap[2][3]) + (fMap[2][3] * b.fMap[3][3]);
    c.fMap[3][0] = (fMap[3][0] * b.fMap[0][0]) + (fMap[3][1] * b.fMap[1][0]) + (fMap[3][2] * b.fMap[2][0]) + (fMap[3][3] * b.fMap[3][0]);
    c.fMap[3][1] = (fMap[3][0] * b.fMap[0][1]) + (fMap[3][1] * b.fMap[1][1]) + (fMap[3][2] * b.fMap[2][1]) + (fMap[3][3] * b.fMap[3][1]);
    c.fMap[3][2] = (fMap[3][0] * b.fMap[0][2]) + (fMap[3][1] * b.fMap[1][2]) + (fMap[3][2] * b.fMap[2][2]) + (fMap[3][3] * b.fMap[3][2]);
    c.fMap[3][3] = (fMap[3][0] * b.fMap[0][3]) + (fMap[3][1] * b.fMap[1][3]) + (fMap[3][2] * b.fMap[2][3]) + (fMap[3][3] * b.fMap[3][3]);
 #endif
    return c;
 }
--- a/Sources/Plasma/CoreLib/hsMatrix44.h
+++ b/Sources/Plasma/CoreLib/hsMatrix44.h
@ -44,6 +44,7 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #include "HeadSpin.h"
 #include "hsGeometry3.h"
 #include "hsCpuID.h"
 class hsQuat;
@ -140,7 +141,7 @@ struct hsMatrix44 {
                        return rVal;
                    }
    hsVector3 operator*(const hsVector3& p) const;
-    hsMatrix44  operator*(const hsMatrix44& b) const;
+    hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); }
    hsPoint3*           MapPoints(long count, hsPoint3 points[]) const;
@ -152,6 +153,12 @@ struct hsMatrix44 {
    void Read(hsStream *stream);
    void Write(hsStream *stream);
    //  CPU-optimized functions
    typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&);
    static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&);
    static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&);
    static hsFunctionDispatcher<mat_mult_ptr> mat_mult;
 };
 ////////////////////////////////////////////////////////////////////////////
--- a/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
+++ b/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
@ -380,7 +380,6 @@ plProfile_CreateTimer("   CIRecalcT", "Object", CIRecalcT);
 plProfile_CreateTimer("   CIDirtyT", "Object", CIDirtyT);
 plProfile_CreateTimer("   CISetT", "Object", CISetT);
 #ifndef HAVE_SSE
 static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
 {
    hsMatrix44 ret;
@ -441,7 +440,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
    return ret;
 }
 #endif // HAVE_SSE
 void plCoordinateInterface::IRecalcTransforms()
 {
@ -449,13 +447,8 @@ void plCoordinateInterface::IRecalcTransforms()
    plProfile_BeginTiming(CIRecalcT);
    if( fParent )
    {
 #ifdef HAVE_SSE
        fLocalToWorld = fParent->GetLocalToWorld() * fLocalToParent;
        fWorldToLocal = fParentToLocal * fParent->GetWorldToLocal();
 #else
        fLocalToWorld = IMatrixMul34(fParent->GetLocalToWorld(), fLocalToParent);
        fWorldToLocal = IMatrixMul34(fParentToLocal, fParent->GetWorldToLocal());
 #endif
    }
    else
    {
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
@ -416,7 +416,6 @@ hsBool plDrawableSpans::IBoundsInvalid(const hsBounds3Ext& bnd) const
 }
 //// SetTransform ////////////////////////////////////////////////////////////
 #ifndef HAVE_SSE
 static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
 {
    hsMatrix44 ret;
@ -477,7 +476,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
    return ret;
 }
 #endif
 #ifdef MF_TEST_UPDATE
 plProfile_CreateCounter("DSSetTrans", "Update", DSSetTrans);
@ -521,13 +519,9 @@ plDrawable& plDrawableSpans::SetTransform( uint32_t index, const hsMatrix44& l2w
 #endif // MF_TEST_UPDATE
            for( i = 0; i < spans->GetCount(); i++ )
            {
 #ifdef HAVE_SSE
                fLocalToWorlds[ (*spans)[ i ] ] = l2w * fLocalToBones[ (*spans)[ i ] ];
                fWorldToLocals[ (*spans)[ i ] ] = fBoneToLocals[ (*spans)[ i ] ] * w2l;
 #else
                fLocalToWorlds[ (*spans)[ i ] ] = IMatrixMul34(l2w, fLocalToBones[ (*spans)[ i ] ]);
                fWorldToLocals[ (*spans)[ i ] ] = IMatrixMul34(fBoneToLocals[ (*spans)[ i ] ], w2l);
-#endif // HAVE_SSE
+
            }
 #ifdef MF_TEST_UPDATE
            plProfile_EndTiming(DSMatTransT);
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@ -163,8 +163,8 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #include <algorithm>
-#ifdef HAVE_SSE
+#ifdef HS_SIMD_INCLUDE
-#   include <smmintrin.h>
+#  include HS_SIMD_INCLUDE
 #endif
 //#define MF_TOSSER
@ -10525,48 +10525,35 @@ void plDXPipeline::LoadResources()
    plNetClientApp::StaticDebugMsg("End Device Reload");
 }
-// Sorry about this, but it really did speed up the skinning.
+// inlTESTPOINT /////////////////////////////////////////
-// Just some macros for the inner loop of IBlendVertsIntoBuffer.
+// Update mins and maxs if destP is outside.
-#ifdef HAVE_SSE
+inline void inlTESTPOINT(const hsPoint3& destP,
-#   define MATRIXMULTBEGIN(xfm, wgt) \
+                         float& minX, float& minY, float& minZ,
-        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \
+                         float& maxX, float& maxY, float& maxZ)
-        ALIGN(16) float hack[4]; \
+{
-        mc0 = _mm_loadu_ps(xfm.fMap[0]); \
+    if( destP.fX < minX )
-        mc1 = _mm_loadu_ps(xfm.fMap[1]); \
+        minX = destP.fX;
-        mc2 = _mm_loadu_ps(xfm.fMap[2]); \
+    else if( destP.fX > maxX )
-        mwt = _mm_set_ps1(wgt);
+        maxX = destP.fX;
-#   define MATRIXMULTPOINTADD(dst, src) \
+
-        msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
+    if( destP.fY < minY )
-        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
+        minY = destP.fY;
-        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
+    else if( destP.fY > maxY )
-        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
+        maxY = destP.fY;
-        \
+
-        hbuf = _mm_hadd_ps(_x, _y); \
+    if( destP.fZ < minZ )
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
+        minZ = destP.fZ;
-        _mm_store_ps(hack, hbuf); \
+    else if( destP.fZ > maxZ )
-        dst.fX += hack[0]; \
+        maxZ = destP.fZ;
-        dst.fY += hack[1]; \
+}
-        hbuf = _mm_hadd_ps(_z, _z); \
+
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
+//// IBlendVertsIntoBuffer ////////////////////////////////////////////////////
-        _mm_store_ps(hack, hbuf); \
+//  Given a pointer into a buffer of verts that have blending data in the D3D
-        dst.fZ += hack[0];
+//  format, blends them into the destination buffer given without the blending
-#   define MATRIXMULTVECTORADD(dst, src) \
+//  info.
-        msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
+
-        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
+// FPU version
-        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
+#define MATRIXMULTBEGIN_FPU(xfm, wgt) \
        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
        \
        hbuf = _mm_hadd_ps(_x, _y); \
        hbuf = _mm_hadd_ps(hbuf, hbuf); \
        _mm_store_ps(hack, hbuf); \
        dst.fX += hack[0]; \
        dst.fY += hack[1]; \
        hbuf = _mm_hadd_ps(_z, _z); \
        hbuf = _mm_hadd_ps(hbuf, hbuf); \
        _mm_store_ps(hack, hbuf); \
        dst.fZ += hack[0];
 #else
 #   define MATRIXMULTBEGIN(xfm, wgt) \
        float m00 = xfm.fMap[0][0]; \
        float m01 = xfm.fMap[0][1]; \
        float m02 = xfm.fMap[0][2]; \
@ -10581,7 +10568,7 @@ void plDXPipeline::LoadResources()
        float m23 = xfm.fMap[2][3]; \
        float m_wgt = wgt; \
        float srcX, srcY, srcZ;
-#   define MATRIXMULTPOINTADD(dst, src) \
+#define MATRIXMULTPOINTADD_FPU(dst, src) \
        srcX = src.fX; \
        srcY = src.fY; \
        srcZ = src.fZ; \
@ -10589,7 +10576,7 @@ void plDXPipeline::LoadResources()
        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
-#   define MATRIXMULTVECTORADD(dst, src) \
+#define MATRIXMULTVECTORADD_FPU(dst, src) \
        srcX = src.fX; \
        srcY = src.fY; \
        srcZ = src.fZ; \
@ -10597,218 +10584,250 @@ void plDXPipeline::LoadResources()
        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
 #endif // HAVE_SSE
-// inlTESTPOINT /////////////////////////////////////////
+// SSE3 version
-// Update mins and maxs if destP is outside.
+#ifdef HS_SSE3
-inline void inlTESTPOINT(const hsPoint3& destP, 
+#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
-                         float& minX, float& minY, float& minZ, 
+        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
-                         float& maxX, float& maxY, float& maxZ)
+        ALIGN(16) float hack[4]; \
-{
+        mc0 = _mm_loadu_ps(xfm.fMap[0]); \
-    if( destP.fX < minX )
+        mc1 = _mm_loadu_ps(xfm.fMap[1]); \
-        minX = destP.fX;
+        mc2 = _mm_loadu_ps(xfm.fMap[2]); \
-    else if( destP.fX > maxX )
+        mwt = _mm_set_ps1(wgt);
-        maxX = destP.fX;
+#define MATRIXMULTPOINTADD_SSE3(dst, src) \
-
+        msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
-    if( destP.fY < minY )
+        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
-        minY = destP.fY;
+        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
-    else if( destP.fY > maxY )
+        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
-        maxY = destP.fY;
+        \
-
+        hbuf1 = _mm_hadd_ps(_x, _y); \
-    if( destP.fZ < minZ )
+        hbuf2 = _mm_hadd_ps(_z, _z); \
-        minZ = destP.fZ;
+        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-    else if( destP.fZ > maxZ )
+        _mm_store_ps(hack, hbuf1); \
-        maxZ = destP.fZ;
+        dst.fX += hack[0]; \
-}
+        dst.fY += hack[1]; \
-
+        dst.fZ += hack[2];
-//// IBlendVertsIntoBuffer ////////////////////////////////////////////////////
+#define MATRIXMULTVECTORADD_SSE3(dst, src) \
-//  Given a pointer into a buffer of verts that have blending data in the D3D
+        msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
-//  format, blends them into the destination buffer given without the blending
+        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
-//  info.
+        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
-
+        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
-void    plDXPipeline::IBlendVertsIntoBuffer( plSpan* span, 
+        \
-                                              hsMatrix44* matrixPalette, int numMatrices,
+        hbuf1 = _mm_hadd_ps(_x, _y); \
-                                              const uint8_t *src, uint8_t format, uint32_t srcStride, 
+        hbuf2 = _mm_hadd_ps(_z, _z); \
-                                              uint8_t *dest, uint32_t destStride, uint32_t count,
+        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
-                                              uint16_t localUVWChans )
+        _mm_store_ps(hack, hbuf1); \
-{
+        dst.fX += hack[0]; \
-    uint8_t       numUVs, numWeights;
+        dst.fY += hack[1]; \
-    uint32_t      i, j, indices, color, specColor, uvChanSize;
+        dst.fZ += hack[2];
-    float       weights[ 4 ], weightSum;
+#endif
    hsPoint3    pt, tempPt, destPt;
    hsVector3   vec, tempNorm, destNorm;
    /// Get some counts
    switch( format & plGBufferGroup::kSkinWeightMask )
    {
        case plGBufferGroup::kSkin1Weight:  numWeights = 1; break;
        case plGBufferGroup::kSkin2Weights: numWeights = 2; break;
        case plGBufferGroup::kSkin3Weights: numWeights = 3; break;
        default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" );
    }
    numUVs = plGBufferGroup::CalcNumUVs( format );
    uvChanSize = numUVs * sizeof( float ) * 3;
 //#define MF_RECALC_BOUNDS
 #ifdef MF_RECALC_BOUNDS
    float minX = 1.e33f;
    float minY = 1.e33f;
    float minZ = 1.e33f;
-    float maxX = -1.e33f;
+// CPU-optimized functions requiring dispatch
-    float maxY = -1.e33f;
+hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3);
    float maxZ = -1.e33f;
 #endif // MF_RECALC_BOUNDS
-    // localUVWChans is bump mapping tangent space vectors, which need to
+// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
 #define BLENDVERTSTART \
    uint8_t     numUVs, numWeights; \
    uint32_t    i, j, indices, color, specColor, uvChanSize; \
    float       weights[ 4 ], weightSum; \
    hsPoint3    pt, tempPt, destPt; \
    hsVector3   vec, tempNorm, destNorm; \
    \
    /* Get some counts */\
    switch( format & plGBufferGroup::kSkinWeightMask ) \
    { \
        case plGBufferGroup::kSkin1Weight:  numWeights = 1; break; \
        case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \
        case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \
        default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \
    } \
    \
    numUVs = plGBufferGroup::CalcNumUVs( format ); \
    uvChanSize = numUVs * sizeof( float ) * 3; \
    \
    /* localUVWChans is bump mapping tangent space vectors, which need to
    // be skinned like the normal, as opposed to passed through like 
    // garden variety UVW coordinates.
    // There are no localUVWChans that I know of in production assets (i.e.
-    // the avatar is not skinned).
+    // the avatar is not skinned).*/\
-    if( !localUVWChans )
+    if( !localUVWChans ) \
-    {
+    { \
-        /// Copy whilst blending
+        /* Copy whilst blending */\
-        for( i = 0; i < count; i++ )
+        for( i = 0; i < count; i++ ) \
-        {
+        { \
-            // Extract data
+            /* Extract data */\
-            src = inlExtractPoint( src, pt );
+            src = inlExtractPoint( src, pt ); \
-            for( j = 0, weightSum = 0; j < numWeights; j++ )
+            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
-            {
+            { \
-                src = inlExtractFloat( src, weights[ j ] );
+                src = inlExtractFloat( src, weights[ j ] ); \
-                weightSum += weights[ j ];
+                weightSum += weights[ j ]; \
-            }
+            } \
-            weights[ j ] = 1 - weightSum;
+            weights[ j ] = 1 - weightSum; \
-
+            \
-            if( format & plGBufferGroup::kSkinIndices )
+            if( format & plGBufferGroup::kSkinIndices ) \
-            {
+            { \
-                src = inlExtractUInt32( src, indices );
+                src = inlExtractUInt32( src, indices ); \
-            }
+            } \
-            else
+            else \
-            {
+            { \
-                indices = 1 << 8;
+                indices = 1 << 8; \
-            }
+            } \
-            src = inlExtractPoint( src, vec );
+            src = inlExtractPoint( src, vec ); \
-            src = inlExtractUInt32( src, color );
+            src = inlExtractUInt32( src, color ); \
-            src = inlExtractUInt32( src, specColor );
+            src = inlExtractUInt32( src, specColor ); \
-
+            \
-            // Blend
+            /* Blend */\
-            destPt.Set( 0, 0, 0 );
+            destPt.Set( 0, 0, 0 ); \
-            destNorm.Set( 0, 0, 0 );
+            destNorm.Set( 0, 0, 0 ); \
-            for( j = 0; j < numWeights + 1; j++ )
+            for( j = 0; j < numWeights + 1; j++ ) \
-            {
+            { \
-                if( weights[ j ] )
+                if( weights[ j ] ) \
-                {
+                {
                    /*
                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
                    MATRIXMULTPOINTADD(destPt, pt);
                    MATRIXMULTVECTORADD(destNorm, vec);
-                }
+                    */
-
+#define BLENDVERTMID \
-                indices >>= 8;
+                } \
-            }
+                \
-            // Probably don't really need to renormalize this. There errors are
+                indices >>= 8; \
-            // going to be subtle and "smooth".
+            } \
-//          hsFastMath::NormalizeAppr(destNorm);
+            /* Probably don't really need to renormalize this. There errors are
-
+            // going to be subtle and "smooth".*/\
-#ifdef MF_RECALC_BOUNDS
+            /* hsFastMath::NormalizeAppr(destNorm);*/ \
-            inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ);
+            \
-#endif // MF_RECALC_BOUNDS
+            /* Slam data into position now */\
-
+            dest = inlStuffPoint( dest, destPt ); \
-            // Slam data into position now
+            dest = inlStuffPoint( dest, destNorm ); \
-            dest = inlStuffPoint( dest, destPt );
+            dest = inlStuffUInt32( dest, color ); \
-            dest = inlStuffPoint( dest, destNorm );
+            dest = inlStuffUInt32( dest, specColor ); \
-            dest = inlStuffUInt32( dest, color );
+            memcpy( dest, src, uvChanSize ); \
-            dest = inlStuffUInt32( dest, specColor );
+            src += uvChanSize; \
-            memcpy( dest, src, uvChanSize );
+            dest += uvChanSize; \
-            src += uvChanSize;
+        } \
-            dest += uvChanSize;
+    } \
-        }
+    else \
-    }
+    { \
-    else 
+        uint8_t hiChan = localUVWChans >> 8; \
-    {
+        uint8_t loChan = localUVWChans & 0xff; \
-        uint8_t hiChan = localUVWChans >> 8;
+        /* Copy whilst blending */\
-        uint8_t loChan = localUVWChans & 0xff;
+        for( i = 0; i < count; i++ ) \
-        /// Copy whilst blending
+        { \
-        for( i = 0; i < count; i++ )
+            hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
-        {
+            hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \
-            hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels];
+            \
-            hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels];
+            /* Extract data */\
-
+            src = inlExtractPoint( src, pt ); \
-            // Extract data
+            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
-            src = inlExtractPoint( src, pt );
+            { \
-            for( j = 0, weightSum = 0; j < numWeights; j++ )
+                src = inlExtractFloat( src, weights[ j ] ); \
-            {
+                weightSum += weights[ j ]; \
-                src = inlExtractFloat( src, weights[ j ] );
+            } \
-                weightSum += weights[ j ];
+            weights[ j ] = 1 - weightSum; \
-            }
+            \
-            weights[ j ] = 1 - weightSum;
+            if( format & plGBufferGroup::kSkinIndices ) \
-
+            { \
-            if( format & plGBufferGroup::kSkinIndices )
+                src = inlExtractUInt32( src, indices ); \
-            {
+            } \
-                src = inlExtractUInt32( src, indices );
+            else \
-            }
+            { \
-            else
+                indices = 1 << 8; \
-            {
+            } \
-                indices = 1 << 8;
+            \
-            }
+            src = inlExtractPoint( src, vec ); \
-
+            src = inlExtractUInt32( src, color ); \
-            src = inlExtractPoint( src, vec );
+            src = inlExtractUInt32( src, specColor ); \
-            src = inlExtractUInt32( src, color );
+            \
-            src = inlExtractUInt32( src, specColor );
+            uint8_t k; \
-
+            for( k = 0; k < numUVs; k++ ) \
-            uint8_t k;
+            { \
-            for( k = 0; k < numUVs; k++ )
+                src = inlExtractPoint( src, srcUVWs[k] ); \
-            {
+            } \
-                src = inlExtractPoint( src, srcUVWs[k] );
+            memcpy( dstUVWs, srcUVWs, uvChanSize); \
-            }
+            dstUVWs[loChan].Set(0,0,0); \
-            memcpy( dstUVWs, srcUVWs, uvChanSize);
+            dstUVWs[hiChan].Set(0,0,0); \
-            dstUVWs[loChan].Set(0,0,0);
+            \
-            dstUVWs[hiChan].Set(0,0,0);
+            /* Blend */\
-
+            destPt.Set( 0, 0, 0 ); \
-            // Blend
+            destNorm.Set( 0, 0, 0 ); \
-            destPt.Set( 0, 0, 0 );
+            for( j = 0; j < numWeights + 1; j++ ) \
-            destNorm.Set( 0, 0, 0 );
+            { \
-            for( j = 0; j < numWeights + 1; j++ )
+                if( weights[ j ] ) \
-            {
+                { \
-                if( weights[ j ] )
+                    /*
                {
                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
                    MATRIXMULTPOINTADD(destPt, pt);
                    MATRIXMULTVECTORADD(destNorm, vec);
                    MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]);
                    MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]);
                    */
 #define BLENDVERTEND \
                } \
                \
                indices >>= 8; \
            } \
            /* Probably don't really need to renormalize this. There errors are
            // going to be subtle and "smooth". */\
            /* hsFastMath::NormalizeAppr(destNorm); */\
            /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\
            /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\
            \
            /* Slam data into position now */\
            dest = inlStuffPoint( dest, destPt ); \
            dest = inlStuffPoint( dest, destNorm ); \
            dest = inlStuffUInt32( dest, color ); \
            dest = inlStuffUInt32( dest, specColor ); \
            memcpy( dest, dstUVWs, uvChanSize ); \
            dest += uvChanSize; \
        } \
    }
-                indices >>= 8;
+void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
-            }
+                                          hsMatrix44* matrixPalette, int numMatrices,
-            // Probably don't really need to renormalize this. There errors are
+                                          const uint8_t *src, uint8_t format, uint32_t srcStride,
-            // going to be subtle and "smooth".
+                                          uint8_t *dest, uint32_t destStride, uint32_t count,
-//          hsFastMath::NormalizeAppr(destNorm);
+                                          uint16_t localUVWChans )
-//          hsFastMath::NormalizeAppr(dstUVWs[loChan]);
+{
-//          hsFastMath::NormalizeAppr(dstUVWs[hiChan]);
+    BLENDVERTSTART
                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
-#ifdef MF_RECALC_BOUNDS
+                    MATRIXMULTPOINTADD_FPU(destPt, pt);
-            inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ);
+                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
-#endif // MF_RECALC_BOUNDS
+    BLENDVERTMID
                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
-            // Slam data into position now
+                    MATRIXMULTPOINTADD_FPU(destPt, pt);
-            dest = inlStuffPoint( dest, destPt );
+                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
-            dest = inlStuffPoint( dest, destNorm );
+                    MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
-            dest = inlStuffUInt32( dest, color );
+                    MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
-            dest = inlStuffUInt32( dest, specColor );
+
-            memcpy( dest, dstUVWs, uvChanSize );
+    BLENDVERTEND
-            dest += uvChanSize;
+}
-        }
+
-    }
+void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
-#ifdef MF_RECALC_BOUNDS
+                                           hsMatrix44* matrixPalette, int numMatrices,
-    hsBounds3Ext wBnd;
+                                           const uint8_t *src, uint8_t format, uint32_t srcStride,
-    wBnd.Reset(&hsPoint3(minX, minY, minZ));
+                                           uint8_t *dest, uint32_t destStride, uint32_t count,
-    wBnd.Union(&hsPoint3(maxX, maxY, maxZ));
+                                           uint16_t localUVWChans )
-    span->fWorldBounds = wBnd;
+{
-#endif // MF_RECALC_BOUNDS
+#ifdef HS_SSE3
    BLENDVERTSTART
                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
    BLENDVERTMID
                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
                    MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
                    MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
    BLENDVERTEND
 #endif // HS_SSE3
 }
 // ISetPipeConsts //////////////////////////////////////////////////////////////////
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
@ -465,7 +465,8 @@ protected:
    void            IBlendVertsIntoBuffer( plSpan* span, 
                                            hsMatrix44* matrixPalette, int numMatrices,
                                            const uint8_t *src, uint8_t format, uint32_t srcStride, 
-                                            uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans );
+                                            uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans )
                                                { blend_vert_buffer.call(span, matrixPalette, numMatrices, src, format, srcStride, dest, destStride, count, localUVWChans); };
    hsBool          ISoftwareVertexBlend( plDrawableSpans* drawable, const hsTArray<int16_t>& visList );
@ -798,6 +799,13 @@ public:
    virtual int                         GetMaxAnisotropicSamples();
    virtual int                         GetMaxAntiAlias(int Width, int Height, int ColorDepth);
    //  CPU-optimized functions
 protected:
    typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
    static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
    static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
    static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
 };