Merge pull request #202 from Deledrius/hsCpuID

Add CPU feature detection and function dispatcher
13 years ago · bde67949ac
11 changed files with 588 additions and 327 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -84,12 +84,6 @@ if(MSVC)
    add_definitions(-D_SCL_SECURE_NO_WARNINGS)
 endif(MSVC)

-# TODO: Maybe some kind of automated test here?
-option(PLASMA_USE_SSE "Enable SSE optimizations?" ON)
-if(PLASMA_USE_SSE)
-    add_definitions(-DHAVE_SSE)
-endif(PLASMA_USE_SSE)
-
 #TODO: Make the OpenSSL includes less promiscuous so this isn't needed
 include_directories(${OPENSSL_INCLUDE_DIR})

--- a/Sources/Plasma/Apps/plClient/winmain.cpp
+++ b/Sources/Plasma/Apps/plClient/winmain.cpp
@ -49,10 +49,6 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
    #include <dmdfm.h>      // Windows Load EXE into memory suff
 #endif

-#ifdef HAVE_SSE
-#   include <intrin.h>
-#endif
-
 #include <curl/curl.h>

 #include "HeadSpin.h"
@ -1388,35 +1384,11 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except
 }
 #endif

-bool CheckCPU()
-{
-    const unsigned int sse3_flag = 0x00000001;
-    // (any other CPU features...)
-
-    int cpu_info[4];
-    __cpuid(cpu_info, 1);
-#ifdef HAVE_SSE
-    if((cpu_info[2] & sse3_flag) == 0)
-        return false;
-#endif
-    // Insert additional feature checks here
-
-    return true;
-}
-
 #include "pfConsoleCore/pfConsoleEngine.h"
 PF_CONSOLE_LINK_ALL()

 int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow)
 {
-    // Check to make sure we have a good CPU before getting started
-    if (!CheckCPU())
-    {
-        plString msg = plString::Format("Your processor does not support all of the features required to play %S.", ProductLongName());
-        hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError);
-        return PARABLE_NORMAL_EXIT;
-    }
-
    PF_CONSOLE_INIT_ALL()

    // Set global handle
--- a/Sources/Plasma/CoreLib/CMakeLists.txt
+++ b/Sources/Plasma/CoreLib/CMakeLists.txt
@ -15,6 +15,7 @@ set(CoreLib_SOURCES
    HeadSpin.cpp
    hsBitVector.cpp
    hsBounds.cpp
+    hsCpuID.cpp
    hsCritSect.cpp
    hsExceptionStack.cpp
    hsFastMath.cpp
@ -57,6 +58,7 @@ set(CoreLib_HEADERS
    hsBitVector.h
    hsBounds.h
    hsColorRGBA.h
+    hsCpuID.h
    hsCritSect.h
    hsExceptions.h
    hsFastMath.h
--- a/Sources/Plasma/CoreLib/hsCpuID.cpp
+++ b/Sources/Plasma/CoreLib/hsCpuID.cpp
@ -0,0 +1,71 @@
+/*==LICENSE==*
+
+CyanWorlds.com Engine - MMOG client, server and tools
+Copyright (C) 2011  Cyan Worlds, Inc.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permissions under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
+NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
+JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
+(or a modified version of those libraries),
+containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
+PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
+JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
+licensors of this Program grant you additional
+permission to convey the resulting work. Corresponding Source for a
+non-source form of such a combination shall include the source code for
+the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
+work.
+
+You can contact Cyan Worlds, Inc. by email legal@cyan.com
+ or by snail mail at:
+      Cyan Worlds, Inc.
+      14617 N Newport Hwy
+      Mead, WA   99021
+
+*==LICENSE==*/
+
+#include <intrin.h>
+
+#include "hsCpuID.h"
+
+hsCpuId::hsCpuId() {
+    const unsigned int sse1_flag = 1<<25;
+    const unsigned int sse2_flag = 1<<26;
+    const unsigned int sse3_flag = 1<<0;
+    const unsigned int ssse3_flag = 1<<9;
+    const unsigned int sse41_flag = 1<<19;
+    const unsigned int sse42_flag = 1<<20;
+    const unsigned int avx_flag = 1 << 28;
+
+    unsigned int cpu_info[4];
+    __cpuid((int*)cpu_info, 1);
+    has_sse1 = (cpu_info[3] & sse1_flag) || false;
+    has_sse2 = (cpu_info[3] & sse2_flag) || false;
+    has_sse3 = (cpu_info[2] & sse3_flag) || false;
+    has_ssse3 = (cpu_info[2] & ssse3_flag) || false;
+    has_sse41 = (cpu_info[2] & sse41_flag) || false;
+    has_sse42 = (cpu_info[2] & sse42_flag) || false;
+    has_avx = (cpu_info[2] & avx_flag) || false;
+}
+
+const hsCpuId& hsCpuId::instance()
+{
+    static hsCpuId self;
+    return self;
+}
--- a/Sources/Plasma/CoreLib/hsCpuID.h
+++ b/Sources/Plasma/CoreLib/hsCpuID.h
@ -0,0 +1,182 @@
+/*==LICENSE==*
+
+CyanWorlds.com Engine - MMOG client, server and tools
+Copyright (C) 2011  Cyan Worlds, Inc.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permissions under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
+NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
+JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
+(or a modified version of those libraries),
+containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
+PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
+JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
+licensors of this Program grant you additional
+permission to convey the resulting work. Corresponding Source for a
+non-source form of such a combination shall include the source code for
+the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
+work.
+
+You can contact Cyan Worlds, Inc. by email legal@cyan.com
+ or by snail mail at:
+      Cyan Worlds, Inc.
+      14617 N Newport Hwy
+      Mead, WA   99021
+
+*==LICENSE==*/
+
+//////////////////////////////////////////////////////////////////////
+//
+// hsCpuID - Processor feature detection and function dispatcher
+//
+//
+//     == Example Usage ==
+//
+//  #ifdef HS_SIMD_INCLUDE
+//  #   include HS_SIMD_INCLUDE
+//  #endif
+//
+//  float my_func_fpu() {
+//    ...
+//  }
+//
+//  float my_func_avx() {
+//  #ifdef HS_AVX
+//    ...
+//  #endif
+//  }
+//
+//
+//  typedef float(*func_ptr)();
+//  static hsFunctionDispatcher<func_ptr> my_func;
+//
+//  hsFunctionDispatcher<float::func_ptr> float::my_func(float::my_func_fpu, 0, 0, 0, 0, 0, 0, float::my_func_avx);
+//
+//////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef hsCpuID_inc
+#define hsCpuID_inc
+
+#if defined __AVX__ || _MSC_VER >= 1600
+#define HS_AVX
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "immintrin.h"
+#endif
+#endif
+#if defined __SSE4_2__ || _MSC_VER >= 1600
+#define HS_SSE42
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "nmmintrin.h"
+#endif
+#endif
+#if defined __SSE4_1__ || _MSC_VER >= 1600
+#define HS_SSE41
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "smmintrin.h"
+#endif
+#endif
+#if defined __SSSE3__ || _MSC_VER >= 1600
+#define HS_SSSE3
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "tmmintrin.h"
+#endif
+#endif
+#if defined __SSE3__ || _MSC_VER >= 1400
+#define HS_SSE3
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "pmmintrin.h"
+#endif
+#endif
+#if defined __SSE2__ || _MSC_VER >= 1300
+#define HS_SSE2
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "emmintrin.h"
+#endif
+#endif
+#if defined __SSE__ || _MSC_VER >= 1300
+#define HS_SSE1
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "xmmintrin.h"
+#endif
+#endif
+
+
+struct hsCpuId {
+    bool has_sse1;
+    bool has_sse2;
+    bool has_sse3;
+    bool has_ssse3;
+    bool has_sse41;
+    bool has_sse42;
+    bool has_avx;
+
+    hsCpuId();
+    static const hsCpuId& instance();
+};
+
+template <typename func_ptr>
+struct hsFunctionDispatcher {
+    hsFunctionDispatcher(func_ptr fpu, func_ptr sse1=0, func_ptr sse2=0, func_ptr sse3=0, func_ptr ssse3=0, func_ptr sse41=0, func_ptr sse42=0, func_ptr avx=0) {
+        hsAssert(fpu, "FPU fallback function required.");
+        const hsCpuId& cpu = hsCpuId::instance();
+#ifdef HS_AVX
+        if (cpu.has_avx && avx) {
+            call = avx;
+        } else
+#endif
+#ifdef HS_SSE42
+        if (cpu.has_sse42 && sse42) {
+            call = sse42;
+        } else
+#endif
+#ifdef HS_SSE41
+        if (cpu.has_sse41 && sse41) {
+            call = sse41;
+        } else
+#endif
+#ifdef HS_SSSE3
+        if (cpu.has_ssse3 && ssse3) {
+            call = ssse3;
+        } else
+#endif
+#ifdef HS_SSE3
+        if (cpu.has_sse3 && sse3) {
+            call = sse3;
+        } else
+#endif
+#ifdef HS_SSE2
+        if (cpu.has_sse2 && sse2) {
+            call = sse2;
+        } else
+#endif
+#ifdef HS_SSE1
+        if (cpu.has_sse1 && sse1) {
+            call = sse1;
+        } else
+#endif
+        {
+            call = fpu;
+        }
+    };
+    func_ptr call;
+};
+
+
+#endif  // hsCpuID_inc
--- a/Sources/Plasma/CoreLib/hsMatrix44.cpp
+++ b/Sources/Plasma/CoreLib/hsMatrix44.cpp
@ -47,13 +47,16 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #include "hsStream.h"
 #include <math.h>

-#ifdef HAVE_SSE
-#   include <smmintrin.h>
+#ifdef HS_SIMD_INCLUDE
+#  include HS_SIMD_INCLUDE
 #endif

 static hsMatrix44 myIdent = hsMatrix44().Reset();
 const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; }

+// CPU-optimized functions requiring dispatch
+hsFunctionDispatcher<hsMatrix44::mat_mult_ptr> hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3);
+
 /*
    For the rotation:
         ¦        2     2                                      ¦
@ -96,9 +99,47 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
    rotate.QuatFromMatrix44(*this);
 }

-#ifdef HAVE_SSE
+hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
+{
+    hsMatrix44 c;
+
+    if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
+    {
+        c.Reset();
+        return c;
+    }
+
+    if( a.fFlags & hsMatrix44::kIsIdent )
+        return b;
+    if( b.fFlags & hsMatrix44::kIsIdent )
+        return a;
+
+    c.fMap[0][0] = (a.fMap[0][0] * b.fMap[0][0]) + (a.fMap[0][1] * b.fMap[1][0]) + (a.fMap[0][2] * b.fMap[2][0]) + (a.fMap[0][3] * b.fMap[3][0]);
+    c.fMap[0][1] = (a.fMap[0][0] * b.fMap[0][1]) + (a.fMap[0][1] * b.fMap[1][1]) + (a.fMap[0][2] * b.fMap[2][1]) + (a.fMap[0][3] * b.fMap[3][1]);
+    c.fMap[0][2] = (a.fMap[0][0] * b.fMap[0][2]) + (a.fMap[0][1] * b.fMap[1][2]) + (a.fMap[0][2] * b.fMap[2][2]) + (a.fMap[0][3] * b.fMap[3][2]);
+    c.fMap[0][3] = (a.fMap[0][0] * b.fMap[0][3]) + (a.fMap[0][1] * b.fMap[1][3]) + (a.fMap[0][2] * b.fMap[2][3]) + (a.fMap[0][3] * b.fMap[3][3]);
+
+    c.fMap[1][0] = (a.fMap[1][0] * b.fMap[0][0]) + (a.fMap[1][1] * b.fMap[1][0]) + (a.fMap[1][2] * b.fMap[2][0]) + (a.fMap[1][3] * b.fMap[3][0]);
+    c.fMap[1][1] = (a.fMap[1][0] * b.fMap[0][1]) + (a.fMap[1][1] * b.fMap[1][1]) + (a.fMap[1][2] * b.fMap[2][1]) + (a.fMap[1][3] * b.fMap[3][1]);
+    c.fMap[1][2] = (a.fMap[1][0] * b.fMap[0][2]) + (a.fMap[1][1] * b.fMap[1][2]) + (a.fMap[1][2] * b.fMap[2][2]) + (a.fMap[1][3] * b.fMap[3][2]);
+    c.fMap[1][3] = (a.fMap[1][0] * b.fMap[0][3]) + (a.fMap[1][1] * b.fMap[1][3]) + (a.fMap[1][2] * b.fMap[2][3]) + (a.fMap[1][3] * b.fMap[3][3]);
+
+    c.fMap[2][0] = (a.fMap[2][0] * b.fMap[0][0]) + (a.fMap[2][1] * b.fMap[1][0]) + (a.fMap[2][2] * b.fMap[2][0]) + (a.fMap[2][3] * b.fMap[3][0]);
+    c.fMap[2][1] = (a.fMap[2][0] * b.fMap[0][1]) + (a.fMap[2][1] * b.fMap[1][1]) + (a.fMap[2][2] * b.fMap[2][1]) + (a.fMap[2][3] * b.fMap[3][1]);
+    c.fMap[2][2] = (a.fMap[2][0] * b.fMap[0][2]) + (a.fMap[2][1] * b.fMap[1][2]) + (a.fMap[2][2] * b.fMap[2][2]) + (a.fMap[2][3] * b.fMap[3][2]);
+    c.fMap[2][3] = (a.fMap[2][0] * b.fMap[0][3]) + (a.fMap[2][1] * b.fMap[1][3]) + (a.fMap[2][2] * b.fMap[2][3]) + (a.fMap[2][3] * b.fMap[3][3]);
+
+    c.fMap[3][0] = (a.fMap[3][0] * b.fMap[0][0]) + (a.fMap[3][1] * b.fMap[1][0]) + (a.fMap[3][2] * b.fMap[2][0]) + (a.fMap[3][3] * b.fMap[3][0]);
+    c.fMap[3][1] = (a.fMap[3][0] * b.fMap[0][1]) + (a.fMap[3][1] * b.fMap[1][1]) + (a.fMap[3][2] * b.fMap[2][1]) + (a.fMap[3][3] * b.fMap[3][1]);
+    c.fMap[3][2] = (a.fMap[3][0] * b.fMap[0][2]) + (a.fMap[3][1] * b.fMap[1][2]) + (a.fMap[3][2] * b.fMap[2][2]) + (a.fMap[3][3] * b.fMap[3][2]);
+    c.fMap[3][3] = (a.fMap[3][0] * b.fMap[0][3]) + (a.fMap[3][1] * b.fMap[1][3]) + (a.fMap[3][2] * b.fMap[2][3]) + (a.fMap[3][3] * b.fMap[3][3]);
+
+    return c;
+}
+
+#ifdef HS_SSE3
 #   define MULTBEGIN(i) \
-        xmm[0]   = _mm_loadu_ps(fMap[i]);
+        xmm[0]   = _mm_loadu_ps(a.fMap[i]);
 #   define MULTCELL(i, j) \
        xmm[1]   = _mm_set_ps(b.fMap[3][j], b.fMap[2][j], b.fMap[1][j], b.fMap[0][j]); \
        xmm[j+2] = _mm_mul_ps(xmm[0], xmm[1]);
@ -107,24 +148,23 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
        xmm[7] = _mm_hadd_ps(xmm[4], xmm[5]); \
        xmm[1] = _mm_hadd_ps(xmm[6], xmm[7]); \
        _mm_storeu_ps(c.fMap[i], xmm[1]);
-#endif
+#endif  // HS_SSE3

-hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
+hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
 {
    hsMatrix44 c;
-
-    if( fFlags & b.fFlags & hsMatrix44::kIsIdent )
+#ifdef HS_SSE3
+    if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
    {
        c.Reset();
        return c;
    }

-    if( fFlags & hsMatrix44::kIsIdent )
+    if( a.fFlags & hsMatrix44::kIsIdent )
        return b;
    if( b.fFlags & hsMatrix44::kIsIdent )
-        return *this;
+        return a;

-#ifdef HAVE_SSE
    __m128 xmm[8];

    MULTBEGIN(0);
@ -154,28 +194,7 @@ hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
    MULTCELL(3, 2);
    MULTCELL(3, 3);
    MULTFINISH(3);
-#else
-    c.fMap[0][0] = (fMap[0][0] * b.fMap[0][0]) + (fMap[0][1] * b.fMap[1][0]) + (fMap[0][2] * b.fMap[2][0]) + (fMap[0][3] * b.fMap[3][0]);
-    c.fMap[0][1] = (fMap[0][0] * b.fMap[0][1]) + (fMap[0][1] * b.fMap[1][1]) + (fMap[0][2] * b.fMap[2][1]) + (fMap[0][3] * b.fMap[3][1]);
-    c.fMap[0][2] = (fMap[0][0] * b.fMap[0][2]) + (fMap[0][1] * b.fMap[1][2]) + (fMap[0][2] * b.fMap[2][2]) + (fMap[0][3] * b.fMap[3][2]);
-    c.fMap[0][3] = (fMap[0][0] * b.fMap[0][3]) + (fMap[0][1] * b.fMap[1][3]) + (fMap[0][2] * b.fMap[2][3]) + (fMap[0][3] * b.fMap[3][3]);
-
-    c.fMap[1][0] = (fMap[1][0] * b.fMap[0][0]) + (fMap[1][1] * b.fMap[1][0]) + (fMap[1][2] * b.fMap[2][0]) + (fMap[1][3] * b.fMap[3][0]);
-    c.fMap[1][1] = (fMap[1][0] * b.fMap[0][1]) + (fMap[1][1] * b.fMap[1][1]) + (fMap[1][2] * b.fMap[2][1]) + (fMap[1][3] * b.fMap[3][1]);
-    c.fMap[1][2] = (fMap[1][0] * b.fMap[0][2]) + (fMap[1][1] * b.fMap[1][2]) + (fMap[1][2] * b.fMap[2][2]) + (fMap[1][3] * b.fMap[3][2]);
-    c.fMap[1][3] = (fMap[1][0] * b.fMap[0][3]) + (fMap[1][1] * b.fMap[1][3]) + (fMap[1][2] * b.fMap[2][3]) + (fMap[1][3] * b.fMap[3][3]);
-    
-    c.fMap[2][0] = (fMap[2][0] * b.fMap[0][0]) + (fMap[2][1] * b.fMap[1][0]) + (fMap[2][2] * b.fMap[2][0]) + (fMap[2][3] * b.fMap[3][0]);
-    c.fMap[2][1] = (fMap[2][0] * b.fMap[0][1]) + (fMap[2][1] * b.fMap[1][1]) + (fMap[2][2] * b.fMap[2][1]) + (fMap[2][3] * b.fMap[3][1]);
-    c.fMap[2][2] = (fMap[2][0] * b.fMap[0][2]) + (fMap[2][1] * b.fMap[1][2]) + (fMap[2][2] * b.fMap[2][2]) + (fMap[2][3] * b.fMap[3][2]);
-    c.fMap[2][3] = (fMap[2][0] * b.fMap[0][3]) + (fMap[2][1] * b.fMap[1][3]) + (fMap[2][2] * b.fMap[2][3]) + (fMap[2][3] * b.fMap[3][3]);
-    
-    c.fMap[3][0] = (fMap[3][0] * b.fMap[0][0]) + (fMap[3][1] * b.fMap[1][0]) + (fMap[3][2] * b.fMap[2][0]) + (fMap[3][3] * b.fMap[3][0]);
-    c.fMap[3][1] = (fMap[3][0] * b.fMap[0][1]) + (fMap[3][1] * b.fMap[1][1]) + (fMap[3][2] * b.fMap[2][1]) + (fMap[3][3] * b.fMap[3][1]);
-    c.fMap[3][2] = (fMap[3][0] * b.fMap[0][2]) + (fMap[3][1] * b.fMap[1][2]) + (fMap[3][2] * b.fMap[2][2]) + (fMap[3][3] * b.fMap[3][2]);
-    c.fMap[3][3] = (fMap[3][0] * b.fMap[0][3]) + (fMap[3][1] * b.fMap[1][3]) + (fMap[3][2] * b.fMap[2][3]) + (fMap[3][3] * b.fMap[3][3]);
-#endif
-
+#endif  // HS_SSE3
    return c;
 }

--- a/Sources/Plasma/CoreLib/hsMatrix44.h
+++ b/Sources/Plasma/CoreLib/hsMatrix44.h
@ -44,6 +44,7 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com

 #include "HeadSpin.h"
 #include "hsGeometry3.h"
+#include "hsCpuID.h"

 class hsQuat;

@ -140,7 +141,7 @@ struct hsMatrix44 {
                        return rVal;
                    }
    hsVector3 operator*(const hsVector3& p) const;
-    hsMatrix44  operator*(const hsMatrix44& b) const;
+    hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); }
    
    hsPoint3*           MapPoints(long count, hsPoint3 points[]) const;
    
@ -152,6 +153,12 @@ struct hsMatrix44 {

    void Read(hsStream *stream);
    void Write(hsStream *stream);
+
+    //  CPU-optimized functions
+    typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&);
+    static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&);
+    static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&);
+    static hsFunctionDispatcher<mat_mult_ptr> mat_mult;
 };

 ////////////////////////////////////////////////////////////////////////////
--- a/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
+++ b/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
@ -380,7 +380,6 @@ plProfile_CreateTimer("   CIRecalcT", "Object", CIRecalcT);
 plProfile_CreateTimer("   CIDirtyT", "Object", CIDirtyT);
 plProfile_CreateTimer("   CISetT", "Object", CISetT);

-#ifndef HAVE_SSE
 static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
 {
    hsMatrix44 ret;
@ -441,7 +440,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r

    return ret;
 }
-#endif // HAVE_SSE

 void plCoordinateInterface::IRecalcTransforms()
 {
@ -449,13 +447,8 @@ void plCoordinateInterface::IRecalcTransforms()
    plProfile_BeginTiming(CIRecalcT);
    if( fParent )
    {
-#ifdef HAVE_SSE
-        fLocalToWorld = fParent->GetLocalToWorld() * fLocalToParent;
-        fWorldToLocal = fParentToLocal * fParent->GetWorldToLocal();
-#else
        fLocalToWorld = IMatrixMul34(fParent->GetLocalToWorld(), fLocalToParent);
        fWorldToLocal = IMatrixMul34(fParentToLocal, fParent->GetWorldToLocal());
-#endif
    }
    else
    {
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
@ -416,7 +416,6 @@ hsBool plDrawableSpans::IBoundsInvalid(const hsBounds3Ext& bnd) const
 }

 //// SetTransform ////////////////////////////////////////////////////////////
-#ifndef HAVE_SSE
 static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
 {
    hsMatrix44 ret;
@ -477,7 +476,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r

    return ret;
 }
-#endif

 #ifdef MF_TEST_UPDATE
 plProfile_CreateCounter("DSSetTrans", "Update", DSSetTrans);
@ -521,13 +519,9 @@ plDrawable& plDrawableSpans::SetTransform( uint32_t index, const hsMatrix44& l2w
 #endif // MF_TEST_UPDATE
            for( i = 0; i < spans->GetCount(); i++ )
            {
-#ifdef HAVE_SSE
-                fLocalToWorlds[ (*spans)[ i ] ] = l2w * fLocalToBones[ (*spans)[ i ] ];
-                fWorldToLocals[ (*spans)[ i ] ] = fBoneToLocals[ (*spans)[ i ] ] * w2l;
-#else
                fLocalToWorlds[ (*spans)[ i ] ] = IMatrixMul34(l2w, fLocalToBones[ (*spans)[ i ] ]);
                fWorldToLocals[ (*spans)[ i ] ] = IMatrixMul34(fBoneToLocals[ (*spans)[ i ] ], w2l);
-#endif // HAVE_SSE
+
            }
 #ifdef MF_TEST_UPDATE
            plProfile_EndTiming(DSMatTransT);
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@ -163,8 +163,8 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com

 #include <algorithm>

-#ifdef HAVE_SSE
-#   include <smmintrin.h>
+#ifdef HS_SIMD_INCLUDE
+#  include HS_SIMD_INCLUDE
 #endif

 //#define MF_TOSSER
@ -10525,48 +10525,35 @@ void plDXPipeline::LoadResources()
    plNetClientApp::StaticDebugMsg("End Device Reload");
 }

-// Sorry about this, but it really did speed up the skinning.
-// Just some macros for the inner loop of IBlendVertsIntoBuffer.
-#ifdef HAVE_SSE
-#   define MATRIXMULTBEGIN(xfm, wgt) \
-        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \
-        ALIGN(16) float hack[4]; \
-        mc0 = _mm_loadu_ps(xfm.fMap[0]); \
-        mc1 = _mm_loadu_ps(xfm.fMap[1]); \
-        mc2 = _mm_loadu_ps(xfm.fMap[2]); \
-        mwt = _mm_set_ps1(wgt);
-#   define MATRIXMULTPOINTADD(dst, src) \
-        msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
-        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
-        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
-        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
-        \
-        hbuf = _mm_hadd_ps(_x, _y); \
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
-        _mm_store_ps(hack, hbuf); \
-        dst.fX += hack[0]; \
-        dst.fY += hack[1]; \
-        hbuf = _mm_hadd_ps(_z, _z); \
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
-        _mm_store_ps(hack, hbuf); \
-        dst.fZ += hack[0];
-#   define MATRIXMULTVECTORADD(dst, src) \
-        msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
-        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
-        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
-        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
-        \
-        hbuf = _mm_hadd_ps(_x, _y); \
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
-        _mm_store_ps(hack, hbuf); \
-        dst.fX += hack[0]; \
-        dst.fY += hack[1]; \
-        hbuf = _mm_hadd_ps(_z, _z); \
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
-        _mm_store_ps(hack, hbuf); \
-        dst.fZ += hack[0];
-#else
-#   define MATRIXMULTBEGIN(xfm, wgt) \
+// inlTESTPOINT /////////////////////////////////////////
+// Update mins and maxs if destP is outside.
+inline void inlTESTPOINT(const hsPoint3& destP,
+                         float& minX, float& minY, float& minZ,
+                         float& maxX, float& maxY, float& maxZ)
+{
+    if( destP.fX < minX )
+        minX = destP.fX;
+    else if( destP.fX > maxX )
+        maxX = destP.fX;
+
+    if( destP.fY < minY )
+        minY = destP.fY;
+    else if( destP.fY > maxY )
+        maxY = destP.fY;
+
+    if( destP.fZ < minZ )
+        minZ = destP.fZ;
+    else if( destP.fZ > maxZ )
+        maxZ = destP.fZ;
+}
+
+//// IBlendVertsIntoBuffer ////////////////////////////////////////////////////
+//  Given a pointer into a buffer of verts that have blending data in the D3D
+//  format, blends them into the destination buffer given without the blending
+//  info.
+
+// FPU version
+#define MATRIXMULTBEGIN_FPU(xfm, wgt) \
        float m00 = xfm.fMap[0][0]; \
        float m01 = xfm.fMap[0][1]; \
        float m02 = xfm.fMap[0][2]; \
@ -10581,7 +10568,7 @@ void plDXPipeline::LoadResources()
        float m23 = xfm.fMap[2][3]; \
        float m_wgt = wgt; \
        float srcX, srcY, srcZ;
-#   define MATRIXMULTPOINTADD(dst, src) \
+#define MATRIXMULTPOINTADD_FPU(dst, src) \
        srcX = src.fX; \
        srcY = src.fY; \
        srcZ = src.fZ; \
@ -10589,7 +10576,7 @@ void plDXPipeline::LoadResources()
        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
-#   define MATRIXMULTVECTORADD(dst, src) \
+#define MATRIXMULTVECTORADD_FPU(dst, src) \
        srcX = src.fX; \
        srcY = src.fY; \
        srcZ = src.fZ; \
@ -10597,218 +10584,250 @@ void plDXPipeline::LoadResources()
        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
-#endif // HAVE_SSE
-
-// inlTESTPOINT /////////////////////////////////////////
-// Update mins and maxs if destP is outside.
-inline void inlTESTPOINT(const hsPoint3& destP, 
-                         float& minX, float& minY, float& minZ, 
-                         float& maxX, float& maxY, float& maxZ)
-{
-    if( destP.fX < minX )
-        minX = destP.fX;
-    else if( destP.fX > maxX )
-        maxX = destP.fX;
-
-    if( destP.fY < minY )
-        minY = destP.fY;
-    else if( destP.fY > maxY )
-        maxY = destP.fY;
-
-    if( destP.fZ < minZ )
-        minZ = destP.fZ;
-    else if( destP.fZ > maxZ )
-        maxZ = destP.fZ;
-}
-
-//// IBlendVertsIntoBuffer ////////////////////////////////////////////////////
-//  Given a pointer into a buffer of verts that have blending data in the D3D
-//  format, blends them into the destination buffer given without the blending
-//  info.
-
-void    plDXPipeline::IBlendVertsIntoBuffer( plSpan* span, 
-                                              hsMatrix44* matrixPalette, int numMatrices,
-                                              const uint8_t *src, uint8_t format, uint32_t srcStride, 
-                                              uint8_t *dest, uint32_t destStride, uint32_t count,
-                                              uint16_t localUVWChans )
-{
-    uint8_t       numUVs, numWeights;
-    uint32_t      i, j, indices, color, specColor, uvChanSize;
-    float       weights[ 4 ], weightSum;
-    hsPoint3    pt, tempPt, destPt;
-    hsVector3   vec, tempNorm, destNorm;
-

-    /// Get some counts
-    switch( format & plGBufferGroup::kSkinWeightMask )
-    {
-        case plGBufferGroup::kSkin1Weight:  numWeights = 1; break;
-        case plGBufferGroup::kSkin2Weights: numWeights = 2; break;
-        case plGBufferGroup::kSkin3Weights: numWeights = 3; break;
-        default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" );
-    }
-
-    numUVs = plGBufferGroup::CalcNumUVs( format );
-    uvChanSize = numUVs * sizeof( float ) * 3;
-
-//#define MF_RECALC_BOUNDS
-#ifdef MF_RECALC_BOUNDS
-    float minX = 1.e33f;
-    float minY = 1.e33f;
-    float minZ = 1.e33f;
+// SSE3 version
+#ifdef HS_SSE3
+#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
+        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
+        ALIGN(16) float hack[4]; \
+        mc0 = _mm_loadu_ps(xfm.fMap[0]); \
+        mc1 = _mm_loadu_ps(xfm.fMap[1]); \
+        mc2 = _mm_loadu_ps(xfm.fMap[2]); \
+        mwt = _mm_set_ps1(wgt);
+#define MATRIXMULTPOINTADD_SSE3(dst, src) \
+        msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
+        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
+        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
+        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
+        \
+        hbuf1 = _mm_hadd_ps(_x, _y); \
+        hbuf2 = _mm_hadd_ps(_z, _z); \
+        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
+        _mm_store_ps(hack, hbuf1); \
+        dst.fX += hack[0]; \
+        dst.fY += hack[1]; \
+        dst.fZ += hack[2];
+#define MATRIXMULTVECTORADD_SSE3(dst, src) \
+        msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
+        _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
+        _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
+        _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
+        \
+        hbuf1 = _mm_hadd_ps(_x, _y); \
+        hbuf2 = _mm_hadd_ps(_z, _z); \
+        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
+        _mm_store_ps(hack, hbuf1); \
+        dst.fX += hack[0]; \
+        dst.fY += hack[1]; \
+        dst.fZ += hack[2];
+#endif

-    float maxX = -1.e33f;
-    float maxY = -1.e33f;
-    float maxZ = -1.e33f;
-#endif // MF_RECALC_BOUNDS
+// CPU-optimized functions requiring dispatch
+hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3);

-    // localUVWChans is bump mapping tangent space vectors, which need to
+// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
+#define BLENDVERTSTART \
+    uint8_t     numUVs, numWeights; \
+    uint32_t    i, j, indices, color, specColor, uvChanSize; \
+    float       weights[ 4 ], weightSum; \
+    hsPoint3    pt, tempPt, destPt; \
+    hsVector3   vec, tempNorm, destNorm; \
+    \
+    /* Get some counts */\
+    switch( format & plGBufferGroup::kSkinWeightMask ) \
+    { \
+        case plGBufferGroup::kSkin1Weight:  numWeights = 1; break; \
+        case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \
+        case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \
+        default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \
+    } \
+    \
+    numUVs = plGBufferGroup::CalcNumUVs( format ); \
+    uvChanSize = numUVs * sizeof( float ) * 3; \
+    \
+    /* localUVWChans is bump mapping tangent space vectors, which need to
    // be skinned like the normal, as opposed to passed through like 
    // garden variety UVW coordinates.
    // There are no localUVWChans that I know of in production assets (i.e.
-    // the avatar is not skinned).
-    if( !localUVWChans )
-    {
-        /// Copy whilst blending
-        for( i = 0; i < count; i++ )
-        {
-            // Extract data
-            src = inlExtractPoint( src, pt );
-            for( j = 0, weightSum = 0; j < numWeights; j++ )
-            {
-                src = inlExtractFloat( src, weights[ j ] );
-                weightSum += weights[ j ];
-            }
-            weights[ j ] = 1 - weightSum;
-
-            if( format & plGBufferGroup::kSkinIndices )
-            {
-                src = inlExtractUInt32( src, indices );
-            }
-            else
-            {
-                indices = 1 << 8;
-            }
-            src = inlExtractPoint( src, vec );
-            src = inlExtractUInt32( src, color );
-            src = inlExtractUInt32( src, specColor );
-
-            // Blend
-            destPt.Set( 0, 0, 0 );
-            destNorm.Set( 0, 0, 0 );
-            for( j = 0; j < numWeights + 1; j++ )
-            {
-                if( weights[ j ] )
+    // the avatar is not skinned).*/\
+    if( !localUVWChans ) \
+    { \
+        /* Copy whilst blending */\
+        for( i = 0; i < count; i++ ) \
+        { \
+            /* Extract data */\
+            src = inlExtractPoint( src, pt ); \
+            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
+            { \
+                src = inlExtractFloat( src, weights[ j ] ); \
+                weightSum += weights[ j ]; \
+            } \
+            weights[ j ] = 1 - weightSum; \
+            \
+            if( format & plGBufferGroup::kSkinIndices ) \
+            { \
+                src = inlExtractUInt32( src, indices ); \
+            } \
+            else \
+            { \
+                indices = 1 << 8; \
+            } \
+            src = inlExtractPoint( src, vec ); \
+            src = inlExtractUInt32( src, color ); \
+            src = inlExtractUInt32( src, specColor ); \
+            \
+            /* Blend */\
+            destPt.Set( 0, 0, 0 ); \
+            destNorm.Set( 0, 0, 0 ); \
+            for( j = 0; j < numWeights + 1; j++ ) \
+            { \
+                if( weights[ j ] ) \
                {
+                    /*
                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);

                    MATRIXMULTPOINTADD(destPt, pt);
                    MATRIXMULTVECTORADD(destNorm, vec);
-                }
-
-                indices >>= 8;
-            }
-            // Probably don't really need to renormalize this. There errors are
-            // going to be subtle and "smooth".
-//          hsFastMath::NormalizeAppr(destNorm);
-
-#ifdef MF_RECALC_BOUNDS
-            inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ);
-#endif // MF_RECALC_BOUNDS
-
-            // Slam data into position now
-            dest = inlStuffPoint( dest, destPt );
-            dest = inlStuffPoint( dest, destNorm );
-            dest = inlStuffUInt32( dest, color );
-            dest = inlStuffUInt32( dest, specColor );
-            memcpy( dest, src, uvChanSize );
-            src += uvChanSize;
-            dest += uvChanSize;
-        }
-    }
-    else 
-    {
-        uint8_t hiChan = localUVWChans >> 8;
-        uint8_t loChan = localUVWChans & 0xff;
-        /// Copy whilst blending
-        for( i = 0; i < count; i++ )
-        {
-            hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels];
-            hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels];
-
-            // Extract data
-            src = inlExtractPoint( src, pt );
-            for( j = 0, weightSum = 0; j < numWeights; j++ )
-            {
-                src = inlExtractFloat( src, weights[ j ] );
-                weightSum += weights[ j ];
-            }
-            weights[ j ] = 1 - weightSum;
-
-            if( format & plGBufferGroup::kSkinIndices )
-            {
-                src = inlExtractUInt32( src, indices );
-            }
-            else
-            {
-                indices = 1 << 8;
-            }
-
-            src = inlExtractPoint( src, vec );
-            src = inlExtractUInt32( src, color );
-            src = inlExtractUInt32( src, specColor );
-
-            uint8_t k;
-            for( k = 0; k < numUVs; k++ )
-            {
-                src = inlExtractPoint( src, srcUVWs[k] );
-            }
-            memcpy( dstUVWs, srcUVWs, uvChanSize);
-            dstUVWs[loChan].Set(0,0,0);
-            dstUVWs[hiChan].Set(0,0,0);
-
-            // Blend
-            destPt.Set( 0, 0, 0 );
-            destNorm.Set( 0, 0, 0 );
-            for( j = 0; j < numWeights + 1; j++ )
-            {
-                if( weights[ j ] )
-                {
+                    */
+#define BLENDVERTMID \
+                } \
+                \
+                indices >>= 8; \
+            } \
+            /* Probably don't really need to renormalize this. There errors are
+            // going to be subtle and "smooth".*/\
+            /* hsFastMath::NormalizeAppr(destNorm);*/ \
+            \
+            /* Slam data into position now */\
+            dest = inlStuffPoint( dest, destPt ); \
+            dest = inlStuffPoint( dest, destNorm ); \
+            dest = inlStuffUInt32( dest, color ); \
+            dest = inlStuffUInt32( dest, specColor ); \
+            memcpy( dest, src, uvChanSize ); \
+            src += uvChanSize; \
+            dest += uvChanSize; \
+        } \
+    } \
+    else \
+    { \
+        uint8_t hiChan = localUVWChans >> 8; \
+        uint8_t loChan = localUVWChans & 0xff; \
+        /* Copy whilst blending */\
+        for( i = 0; i < count; i++ ) \
+        { \
+            hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
+            hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \
+            \
+            /* Extract data */\
+            src = inlExtractPoint( src, pt ); \
+            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
+            { \
+                src = inlExtractFloat( src, weights[ j ] ); \
+                weightSum += weights[ j ]; \
+            } \
+            weights[ j ] = 1 - weightSum; \
+            \
+            if( format & plGBufferGroup::kSkinIndices ) \
+            { \
+                src = inlExtractUInt32( src, indices ); \
+            } \
+            else \
+            { \
+                indices = 1 << 8; \
+            } \
+            \
+            src = inlExtractPoint( src, vec ); \
+            src = inlExtractUInt32( src, color ); \
+            src = inlExtractUInt32( src, specColor ); \
+            \
+            uint8_t k; \
+            for( k = 0; k < numUVs; k++ ) \
+            { \
+                src = inlExtractPoint( src, srcUVWs[k] ); \
+            } \
+            memcpy( dstUVWs, srcUVWs, uvChanSize); \
+            dstUVWs[loChan].Set(0,0,0); \
+            dstUVWs[hiChan].Set(0,0,0); \
+            \
+            /* Blend */\
+            destPt.Set( 0, 0, 0 ); \
+            destNorm.Set( 0, 0, 0 ); \
+            for( j = 0; j < numWeights + 1; j++ ) \
+            { \
+                if( weights[ j ] ) \
+                { \
+                    /*
                    MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);

                    MATRIXMULTPOINTADD(destPt, pt);
                    MATRIXMULTVECTORADD(destNorm, vec);
                    MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]);
                    MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]);
+                    */
+#define BLENDVERTEND \
+                } \
+                \
+                indices >>= 8; \
+            } \
+            /* Probably don't really need to renormalize this. There errors are
+            // going to be subtle and "smooth". */\
+            /* hsFastMath::NormalizeAppr(destNorm); */\
+            /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\
+            /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\
+            \
+            /* Slam data into position now */\
+            dest = inlStuffPoint( dest, destPt ); \
+            dest = inlStuffPoint( dest, destNorm ); \
+            dest = inlStuffUInt32( dest, color ); \
+            dest = inlStuffUInt32( dest, specColor ); \
+            memcpy( dest, dstUVWs, uvChanSize ); \
+            dest += uvChanSize; \
+        } \
    }

-                indices >>= 8;
-            }
-            // Probably don't really need to renormalize this. There errors are
-            // going to be subtle and "smooth".
-//          hsFastMath::NormalizeAppr(destNorm);
-//          hsFastMath::NormalizeAppr(dstUVWs[loChan]);
-//          hsFastMath::NormalizeAppr(dstUVWs[hiChan]);
+void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
+                                          hsMatrix44* matrixPalette, int numMatrices,
+                                          const uint8_t *src, uint8_t format, uint32_t srcStride,
+                                          uint8_t *dest, uint32_t destStride, uint32_t count,
+                                          uint16_t localUVWChans )
+{
+    BLENDVERTSTART
+                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);

-#ifdef MF_RECALC_BOUNDS
-            inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ);
-#endif // MF_RECALC_BOUNDS
+                    MATRIXMULTPOINTADD_FPU(destPt, pt);
+                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
+    BLENDVERTMID
+                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);

-            // Slam data into position now
-            dest = inlStuffPoint( dest, destPt );
-            dest = inlStuffPoint( dest, destNorm );
-            dest = inlStuffUInt32( dest, color );
-            dest = inlStuffUInt32( dest, specColor );
-            memcpy( dest, dstUVWs, uvChanSize );
-            dest += uvChanSize;
-        }
+                    MATRIXMULTPOINTADD_FPU(destPt, pt);
+                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
+                    MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
+                    MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
+
+    BLENDVERTEND
 }
-#ifdef MF_RECALC_BOUNDS
-    hsBounds3Ext wBnd;
-    wBnd.Reset(&hsPoint3(minX, minY, minZ));
-    wBnd.Union(&hsPoint3(maxX, maxY, maxZ));
-    span->fWorldBounds = wBnd;
-#endif // MF_RECALC_BOUNDS
+
+void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
+                                           hsMatrix44* matrixPalette, int numMatrices,
+                                           const uint8_t *src, uint8_t format, uint32_t srcStride,
+                                           uint8_t *dest, uint32_t destStride, uint32_t count,
+                                           uint16_t localUVWChans )
+{
+#ifdef HS_SSE3
+    BLENDVERTSTART
+                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
+
+                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
+                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+    BLENDVERTMID
+                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
+
+                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
+                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+                    MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
+                    MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
+    BLENDVERTEND
+#endif // HS_SSE3
 }

 // ISetPipeConsts //////////////////////////////////////////////////////////////////
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
@ -465,7 +465,8 @@ protected:
    void            IBlendVertsIntoBuffer( plSpan* span, 
                                            hsMatrix44* matrixPalette, int numMatrices,
                                            const uint8_t *src, uint8_t format, uint32_t srcStride, 
-                                            uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans );
+                                            uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans )
+                                                { blend_vert_buffer.call(span, matrixPalette, numMatrices, src, format, srcStride, dest, destStride, count, localUVWChans); };
    hsBool          ISoftwareVertexBlend( plDrawableSpans* drawable, const hsTArray<int16_t>& visList );


@ -798,6 +799,13 @@ public:
    virtual int                         GetMaxAnisotropicSamples();
    virtual int                         GetMaxAntiAlias(int Width, int Height, int ColorDepth);

+
+    //  CPU-optimized functions
+protected:
+    typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
+    static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
+    static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
+    static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
 };