From 42f4eec1f7c7e3c5adc53b64b1f55d96b30783c0 Mon Sep 17 00:00:00 2001
From: Joseph Davies <deledrius@gmail.com>
Date: Sun, 22 Apr 2012 14:43:47 -0700
Subject: [PATCH 1/4] Add hsCpuID and hsFunctionDispatcher for CPU feature
 detection.

Based on Branan's code deliberated upon in IRC, this provides the ability
to detect CPU instruction sets and dispatch functions based on those
capabilities detected at runtime.
---
 Sources/Plasma/CoreLib/CMakeLists.txt |   2 +
 Sources/Plasma/CoreLib/hsCpuID.cpp    |  71 ++++++++++
 Sources/Plasma/CoreLib/hsCpuID.h      | 182 ++++++++++++++++++++++++++
 3 files changed, 255 insertions(+)
 create mode 100644 Sources/Plasma/CoreLib/hsCpuID.cpp
 create mode 100644 Sources/Plasma/CoreLib/hsCpuID.h

diff --git a/Sources/Plasma/CoreLib/CMakeLists.txt b/Sources/Plasma/CoreLib/CMakeLists.txt
index e4cf8278..d99906de 100644
--- a/Sources/Plasma/CoreLib/CMakeLists.txt
+++ b/Sources/Plasma/CoreLib/CMakeLists.txt
@@ -15,6 +15,7 @@ set(CoreLib_SOURCES
     HeadSpin.cpp
     hsBitVector.cpp
     hsBounds.cpp
+    hsCpuID.cpp
     hsCritSect.cpp
     hsExceptionStack.cpp
     hsFastMath.cpp
@@ -57,6 +58,7 @@ set(CoreLib_HEADERS
     hsBitVector.h
     hsBounds.h
     hsColorRGBA.h
+    hsCpuID.h
     hsCritSect.h
     hsExceptions.h
     hsFastMath.h
diff --git a/Sources/Plasma/CoreLib/hsCpuID.cpp b/Sources/Plasma/CoreLib/hsCpuID.cpp
new file mode 100644
index 00000000..12c99f9b
--- /dev/null
+++ b/Sources/Plasma/CoreLib/hsCpuID.cpp
@@ -0,0 +1,71 @@
+/*==LICENSE==*
+
+CyanWorlds.com Engine - MMOG client, server and tools
+Copyright (C) 2011  Cyan Worlds, Inc.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permissions under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
+NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
+JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
+(or a modified version of those libraries),
+containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
+PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
+JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
+licensors of this Program grant you additional
+permission to convey the resulting work. Corresponding Source for a
+non-source form of such a combination shall include the source code for
+the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
+work.
+
+You can contact Cyan Worlds, Inc. by email legal@cyan.com
+ or by snail mail at:
+      Cyan Worlds, Inc.
+      14617 N Newport Hwy
+      Mead, WA   99021
+
+*==LICENSE==*/
+
+#include <intrin.h>
+
+#include "hsCpuID.h"
+
+hsCpuId::hsCpuId() {
+    const unsigned int sse1_flag = 1<<25;
+    const unsigned int sse2_flag = 1<<26;
+    const unsigned int sse3_flag = 1<<0;
+    const unsigned int ssse3_flag = 1<<9;
+    const unsigned int sse41_flag = 1<<19;
+    const unsigned int sse42_flag = 1<<20;
+    const unsigned int avx_flag = 1 << 28;
+
+    unsigned int cpu_info[4];
+    __cpuid((int*)cpu_info, 1);
+    has_sse1 = (cpu_info[3] & sse1_flag) || false;
+    has_sse2 = (cpu_info[3] & sse2_flag) || false;
+    has_sse3 = (cpu_info[2] & sse3_flag) || false;
+    has_ssse3 = (cpu_info[2] & ssse3_flag) || false;
+    has_sse41 = (cpu_info[2] & sse41_flag) || false;
+    has_sse42 = (cpu_info[2] & sse42_flag) || false;
+    has_avx = (cpu_info[2] & avx_flag) || false;
+}
+
+const hsCpuId& hsCpuId::instance()
+{
+    static hsCpuId self;
+    return self;
+}
diff --git a/Sources/Plasma/CoreLib/hsCpuID.h b/Sources/Plasma/CoreLib/hsCpuID.h
new file mode 100644
index 00000000..6c5f8385
--- /dev/null
+++ b/Sources/Plasma/CoreLib/hsCpuID.h
@@ -0,0 +1,182 @@
+/*==LICENSE==*
+
+CyanWorlds.com Engine - MMOG client, server and tools
+Copyright (C) 2011  Cyan Worlds, Inc.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permissions under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
+NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
+JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
+(or a modified version of those libraries),
+containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
+PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
+JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
+licensors of this Program grant you additional
+permission to convey the resulting work. Corresponding Source for a
+non-source form of such a combination shall include the source code for
+the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
+work.
+
+You can contact Cyan Worlds, Inc. by email legal@cyan.com
+ or by snail mail at:
+      Cyan Worlds, Inc.
+      14617 N Newport Hwy
+      Mead, WA   99021
+
+*==LICENSE==*/
+
+//////////////////////////////////////////////////////////////////////
+//
+// hsCpuID - Processor feature detection and function dispatcher
+//
+//
+//     == Example Usage ==
+//
+//  #ifdef HS_SIMD_INCLUDE
+//  #   include HS_SIMD_INCLUDE
+//  #endif
+//
+//  float my_func_fpu() {
+//    ...
+//  }
+//
+//  float my_func_avx() {
+//  #ifdef HS_AVX
+//    ...
+//  #endif
+//  }
+//
+//
+//  typedef float(*func_ptr)();
+//  static hsFunctionDispatcher<func_ptr> my_func;
+//
+//  hsFunctionDispatcher<float::func_ptr> float::my_func(float::my_func_fpu, 0, 0, 0, 0, 0, 0, float::my_func_avx);
+//
+//////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef hsCpuID_inc
+#define hsCpuID_inc
+
+#if defined __AVX__ || _MSC_VER >= 1600
+#define HS_AVX
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "immintrin.h"
+#endif
+#endif
+#if defined __SSE4_2__ || _MSC_VER >= 1600
+#define HS_SSE42
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "nmmintrin.h"
+#endif
+#endif
+#if defined __SSE4_1__ || _MSC_VER >= 1600
+#define HS_SSE41
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "smmintrin.h"
+#endif
+#endif
+#if defined __SSSE3__ || _MSC_VER >= 1600
+#define HS_SSSE3
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "tmmintrin.h"
+#endif
+#endif
+#if defined __SSE3__ || _MSC_VER >= 1400
+#define HS_SSE3
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "pmmintrin.h"
+#endif
+#endif
+#if defined __SSE2__ || _MSC_VER >= 1300
+#define HS_SSE2
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "emmintrin.h"
+#endif
+#endif
+#if defined __SSE__ || _MSC_VER >= 1300
+#define HS_SSE1
+#ifndef HS_SIMD_INCLUDE
+# define HS_SIMD_INCLUDE "xmmintrin.h"
+#endif
+#endif
+
+
+struct hsCpuId {
+    bool has_sse1;
+    bool has_sse2;
+    bool has_sse3;
+    bool has_ssse3;
+    bool has_sse41;
+    bool has_sse42;
+    bool has_avx;
+
+    hsCpuId();
+    static const hsCpuId& instance();
+};
+
+template <typename func_ptr>
+struct hsFunctionDispatcher {
+    hsFunctionDispatcher(func_ptr fpu, func_ptr sse1=0, func_ptr sse2=0, func_ptr sse3=0, func_ptr ssse3=0, func_ptr sse41=0, func_ptr sse42=0, func_ptr avx=0) {
+        hsAssert(fpu, "FPU fallback function required.");
+        const hsCpuId& cpu = hsCpuId::instance();
+#ifdef HS_AVX
+        if (cpu.has_avx && avx) {
+            call = avx;
+        } else
+#endif
+#ifdef HS_SSE42
+        if (cpu.has_sse42 && sse42) {
+            call = sse42;
+        } else
+#endif
+#ifdef HS_SSE41
+        if (cpu.has_sse41 && sse41) {
+            call = sse41;
+        } else
+#endif
+#ifdef HS_SSSE3
+        if (cpu.has_ssse3 && ssse3) {
+            call = ssse3;
+        } else
+#endif
+#ifdef HS_SSE3
+        if (cpu.has_sse3 && sse3) {
+            call = sse3;
+        } else
+#endif
+#ifdef HS_SSE2
+        if (cpu.has_sse2 && sse2) {
+            call = sse2;
+        } else
+#endif
+#ifdef HS_SSE1
+        if (cpu.has_sse1 && sse1) {
+            call = sse1;
+        } else
+#endif
+        {
+            call = fpu;
+        }
+    };
+    func_ptr call;
+};
+
+
+#endif  // hsCpuID_inc
\ No newline at end of file

From 304d15acfee677d4464b33abb7e59854bfb081c1 Mon Sep 17 00:00:00 2001
From: Joseph Davies <deledrius@gmail.com>
Date: Sun, 22 Apr 2012 14:51:39 -0700
Subject: [PATCH 2/4] Update hsMatrix44 to use hsCpuID dispatcher.

---
 Sources/Plasma/CoreLib/hsMatrix44.cpp | 85 ++++++++++++++++-----------
 Sources/Plasma/CoreLib/hsMatrix44.h   | 11 +++-
 2 files changed, 61 insertions(+), 35 deletions(-)

diff --git a/Sources/Plasma/CoreLib/hsMatrix44.cpp b/Sources/Plasma/CoreLib/hsMatrix44.cpp
index 9c4053f3..75bb0883 100644
--- a/Sources/Plasma/CoreLib/hsMatrix44.cpp
+++ b/Sources/Plasma/CoreLib/hsMatrix44.cpp
@@ -47,13 +47,16 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 #include "hsStream.h"
 #include <math.h>
 
-#ifdef HAVE_SSE
-#   include <smmintrin.h>
+#ifdef HS_SIMD_INCLUDE
+#  include HS_SIMD_INCLUDE
 #endif
 
 static hsMatrix44 myIdent = hsMatrix44().Reset();
 const hsMatrix44& hsMatrix44::IdentityMatrix() { return myIdent; }
 
+// CPU-optimized functions requiring dispatch
+hsFunctionDispatcher<hsMatrix44::mat_mult_ptr> hsMatrix44::mat_mult(hsMatrix44::mat_mult_fpu, 0, 0, hsMatrix44::mat_mult_sse3);
+
 /*
     For the rotation:
          Ś        2     2                                      Ś
@@ -96,9 +99,47 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
     rotate.QuatFromMatrix44(*this);
 }
 
-#ifdef HAVE_SSE
+hsMatrix44 hsMatrix44::mat_mult_fpu(const hsMatrix44 &a, const hsMatrix44 &b)
+{
+    hsMatrix44 c;
+
+    if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
+    {
+        c.Reset();
+        return c;
+    }
+
+    if( a.fFlags & hsMatrix44::kIsIdent )
+        return b;
+    if( b.fFlags & hsMatrix44::kIsIdent )
+        return a;
+
+    c.fMap[0][0] = (a.fMap[0][0] * b.fMap[0][0]) + (a.fMap[0][1] * b.fMap[1][0]) + (a.fMap[0][2] * b.fMap[2][0]) + (a.fMap[0][3] * b.fMap[3][0]);
+    c.fMap[0][1] = (a.fMap[0][0] * b.fMap[0][1]) + (a.fMap[0][1] * b.fMap[1][1]) + (a.fMap[0][2] * b.fMap[2][1]) + (a.fMap[0][3] * b.fMap[3][1]);
+    c.fMap[0][2] = (a.fMap[0][0] * b.fMap[0][2]) + (a.fMap[0][1] * b.fMap[1][2]) + (a.fMap[0][2] * b.fMap[2][2]) + (a.fMap[0][3] * b.fMap[3][2]);
+    c.fMap[0][3] = (a.fMap[0][0] * b.fMap[0][3]) + (a.fMap[0][1] * b.fMap[1][3]) + (a.fMap[0][2] * b.fMap[2][3]) + (a.fMap[0][3] * b.fMap[3][3]);
+
+    c.fMap[1][0] = (a.fMap[1][0] * b.fMap[0][0]) + (a.fMap[1][1] * b.fMap[1][0]) + (a.fMap[1][2] * b.fMap[2][0]) + (a.fMap[1][3] * b.fMap[3][0]);
+    c.fMap[1][1] = (a.fMap[1][0] * b.fMap[0][1]) + (a.fMap[1][1] * b.fMap[1][1]) + (a.fMap[1][2] * b.fMap[2][1]) + (a.fMap[1][3] * b.fMap[3][1]);
+    c.fMap[1][2] = (a.fMap[1][0] * b.fMap[0][2]) + (a.fMap[1][1] * b.fMap[1][2]) + (a.fMap[1][2] * b.fMap[2][2]) + (a.fMap[1][3] * b.fMap[3][2]);
+    c.fMap[1][3] = (a.fMap[1][0] * b.fMap[0][3]) + (a.fMap[1][1] * b.fMap[1][3]) + (a.fMap[1][2] * b.fMap[2][3]) + (a.fMap[1][3] * b.fMap[3][3]);
+
+    c.fMap[2][0] = (a.fMap[2][0] * b.fMap[0][0]) + (a.fMap[2][1] * b.fMap[1][0]) + (a.fMap[2][2] * b.fMap[2][0]) + (a.fMap[2][3] * b.fMap[3][0]);
+    c.fMap[2][1] = (a.fMap[2][0] * b.fMap[0][1]) + (a.fMap[2][1] * b.fMap[1][1]) + (a.fMap[2][2] * b.fMap[2][1]) + (a.fMap[2][3] * b.fMap[3][1]);
+    c.fMap[2][2] = (a.fMap[2][0] * b.fMap[0][2]) + (a.fMap[2][1] * b.fMap[1][2]) + (a.fMap[2][2] * b.fMap[2][2]) + (a.fMap[2][3] * b.fMap[3][2]);
+    c.fMap[2][3] = (a.fMap[2][0] * b.fMap[0][3]) + (a.fMap[2][1] * b.fMap[1][3]) + (a.fMap[2][2] * b.fMap[2][3]) + (a.fMap[2][3] * b.fMap[3][3]);
+
+    c.fMap[3][0] = (a.fMap[3][0] * b.fMap[0][0]) + (a.fMap[3][1] * b.fMap[1][0]) + (a.fMap[3][2] * b.fMap[2][0]) + (a.fMap[3][3] * b.fMap[3][0]);
+    c.fMap[3][1] = (a.fMap[3][0] * b.fMap[0][1]) + (a.fMap[3][1] * b.fMap[1][1]) + (a.fMap[3][2] * b.fMap[2][1]) + (a.fMap[3][3] * b.fMap[3][1]);
+    c.fMap[3][2] = (a.fMap[3][0] * b.fMap[0][2]) + (a.fMap[3][1] * b.fMap[1][2]) + (a.fMap[3][2] * b.fMap[2][2]) + (a.fMap[3][3] * b.fMap[3][2]);
+    c.fMap[3][3] = (a.fMap[3][0] * b.fMap[0][3]) + (a.fMap[3][1] * b.fMap[1][3]) + (a.fMap[3][2] * b.fMap[2][3]) + (a.fMap[3][3] * b.fMap[3][3]);
+
+    return c;
+}
+
+#ifdef HS_SSE3
 #   define MULTBEGIN(i) \
-        xmm[0]   = _mm_loadu_ps(fMap[i]);
+        xmm[0]   = _mm_loadu_ps(a.fMap[i]);
 #   define MULTCELL(i, j) \
         xmm[1]   = _mm_set_ps(b.fMap[3][j], b.fMap[2][j], b.fMap[1][j], b.fMap[0][j]); \
         xmm[j+2] = _mm_mul_ps(xmm[0], xmm[1]);
@@ -107,24 +148,23 @@ void hsMatrix44::DecompRigid(hsScalarTriple &translate, hsQuat &rotate) const
         xmm[7] = _mm_hadd_ps(xmm[4], xmm[5]); \
         xmm[1] = _mm_hadd_ps(xmm[6], xmm[7]); \
         _mm_storeu_ps(c.fMap[i], xmm[1]);
-#endif
+#endif  // HS_SSE3
 
-hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
+hsMatrix44 hsMatrix44::mat_mult_sse3(const hsMatrix44 &a, const hsMatrix44 &b)
 {
     hsMatrix44 c;
-
-    if( fFlags & b.fFlags & hsMatrix44::kIsIdent )
+#ifdef HS_SSE3
+    if( a.fFlags & b.fFlags & hsMatrix44::kIsIdent )
     {
         c.Reset();
         return c;
     }
 
-    if( fFlags & hsMatrix44::kIsIdent )
+    if( a.fFlags & hsMatrix44::kIsIdent )
         return b;
     if( b.fFlags & hsMatrix44::kIsIdent )
-        return *this;
+        return a;
 
-#ifdef HAVE_SSE
     __m128 xmm[8];
 
     MULTBEGIN(0);
@@ -154,28 +194,7 @@ hsMatrix44 hsMatrix44::operator*(const hsMatrix44& b) const
     MULTCELL(3, 2);
     MULTCELL(3, 3);
     MULTFINISH(3);
-#else
-    c.fMap[0][0] = (fMap[0][0] * b.fMap[0][0]) + (fMap[0][1] * b.fMap[1][0]) + (fMap[0][2] * b.fMap[2][0]) + (fMap[0][3] * b.fMap[3][0]);
-    c.fMap[0][1] = (fMap[0][0] * b.fMap[0][1]) + (fMap[0][1] * b.fMap[1][1]) + (fMap[0][2] * b.fMap[2][1]) + (fMap[0][3] * b.fMap[3][1]);
-    c.fMap[0][2] = (fMap[0][0] * b.fMap[0][2]) + (fMap[0][1] * b.fMap[1][2]) + (fMap[0][2] * b.fMap[2][2]) + (fMap[0][3] * b.fMap[3][2]);
-    c.fMap[0][3] = (fMap[0][0] * b.fMap[0][3]) + (fMap[0][1] * b.fMap[1][3]) + (fMap[0][2] * b.fMap[2][3]) + (fMap[0][3] * b.fMap[3][3]);
-
-    c.fMap[1][0] = (fMap[1][0] * b.fMap[0][0]) + (fMap[1][1] * b.fMap[1][0]) + (fMap[1][2] * b.fMap[2][0]) + (fMap[1][3] * b.fMap[3][0]);
-    c.fMap[1][1] = (fMap[1][0] * b.fMap[0][1]) + (fMap[1][1] * b.fMap[1][1]) + (fMap[1][2] * b.fMap[2][1]) + (fMap[1][3] * b.fMap[3][1]);
-    c.fMap[1][2] = (fMap[1][0] * b.fMap[0][2]) + (fMap[1][1] * b.fMap[1][2]) + (fMap[1][2] * b.fMap[2][2]) + (fMap[1][3] * b.fMap[3][2]);
-    c.fMap[1][3] = (fMap[1][0] * b.fMap[0][3]) + (fMap[1][1] * b.fMap[1][3]) + (fMap[1][2] * b.fMap[2][3]) + (fMap[1][3] * b.fMap[3][3]);
-    
-    c.fMap[2][0] = (fMap[2][0] * b.fMap[0][0]) + (fMap[2][1] * b.fMap[1][0]) + (fMap[2][2] * b.fMap[2][0]) + (fMap[2][3] * b.fMap[3][0]);
-    c.fMap[2][1] = (fMap[2][0] * b.fMap[0][1]) + (fMap[2][1] * b.fMap[1][1]) + (fMap[2][2] * b.fMap[2][1]) + (fMap[2][3] * b.fMap[3][1]);
-    c.fMap[2][2] = (fMap[2][0] * b.fMap[0][2]) + (fMap[2][1] * b.fMap[1][2]) + (fMap[2][2] * b.fMap[2][2]) + (fMap[2][3] * b.fMap[3][2]);
-    c.fMap[2][3] = (fMap[2][0] * b.fMap[0][3]) + (fMap[2][1] * b.fMap[1][3]) + (fMap[2][2] * b.fMap[2][3]) + (fMap[2][3] * b.fMap[3][3]);
-    
-    c.fMap[3][0] = (fMap[3][0] * b.fMap[0][0]) + (fMap[3][1] * b.fMap[1][0]) + (fMap[3][2] * b.fMap[2][0]) + (fMap[3][3] * b.fMap[3][0]);
-    c.fMap[3][1] = (fMap[3][0] * b.fMap[0][1]) + (fMap[3][1] * b.fMap[1][1]) + (fMap[3][2] * b.fMap[2][1]) + (fMap[3][3] * b.fMap[3][1]);
-    c.fMap[3][2] = (fMap[3][0] * b.fMap[0][2]) + (fMap[3][1] * b.fMap[1][2]) + (fMap[3][2] * b.fMap[2][2]) + (fMap[3][3] * b.fMap[3][2]);
-    c.fMap[3][3] = (fMap[3][0] * b.fMap[0][3]) + (fMap[3][1] * b.fMap[1][3]) + (fMap[3][2] * b.fMap[2][3]) + (fMap[3][3] * b.fMap[3][3]);
-#endif
-
+#endif  // HS_SSE3
     return c;
 }
 
diff --git a/Sources/Plasma/CoreLib/hsMatrix44.h b/Sources/Plasma/CoreLib/hsMatrix44.h
index 7804631d..c0a5ed21 100644
--- a/Sources/Plasma/CoreLib/hsMatrix44.h
+++ b/Sources/Plasma/CoreLib/hsMatrix44.h
@@ -44,6 +44,7 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 
 #include "HeadSpin.h"
 #include "hsGeometry3.h"
+#include "hsCpuID.h"
 
 class hsQuat;
 
@@ -104,7 +105,7 @@ struct hsMatrix44 {
                         const hsVector3* up);
 
     hsBool          GetParity() const;
-    float        GetDeterminant() const;
+    float           GetDeterminant() const;
     hsMatrix44*     GetInverse(hsMatrix44* inverse) const;
     hsMatrix44*     GetTranspose(hsMatrix44* inverse) const;
     hsMatrix44*     GetAdjoint(hsMatrix44* adjoint) const;
@@ -140,7 +141,7 @@ struct hsMatrix44 {
                         return rVal;
                     }
     hsVector3 operator*(const hsVector3& p) const;
-    hsMatrix44  operator*(const hsMatrix44& b) const;
+    hsMatrix44 operator *(const hsMatrix44& other) const { return mat_mult.call(*this, other); }
     
     hsPoint3*           MapPoints(long count, hsPoint3 points[]) const;
     
@@ -152,6 +153,12 @@ struct hsMatrix44 {
 
     void Read(hsStream *stream);
     void Write(hsStream *stream);
+
+    //  CPU-optimized functions
+    typedef hsMatrix44(*mat_mult_ptr)(const hsMatrix44&, const hsMatrix44&);
+    static hsMatrix44 mat_mult_fpu(const hsMatrix44&, const hsMatrix44&);
+    static hsMatrix44 mat_mult_sse3(const hsMatrix44&, const hsMatrix44&);
+    static hsFunctionDispatcher<mat_mult_ptr> mat_mult;
 };
 
 ////////////////////////////////////////////////////////////////////////////

From 9ee5c4d040217ea3e6405930c4f85992364de490 Mon Sep 17 00:00:00 2001
From: Joseph Davies <deledrius@gmail.com>
Date: Sun, 22 Apr 2012 15:21:28 -0700
Subject: [PATCH 3/4] Remove remaining pre-hsCpuID SSE special-casing.

---
 CMakeLists.txt                                |  6 ----
 Sources/Plasma/Apps/plClient/winmain.cpp      | 28 ----------------
 .../pnSceneObject/plCoordinateInterface.cpp   |  7 ----
 .../PubUtilLib/plDrawable/plDrawableSpans.cpp |  8 +----
 .../PubUtilLib/plPipeline/plDXPipeline.cpp    | 32 ++++++++-----------
 5 files changed, 15 insertions(+), 66 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c042e895..11abb4c3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,12 +84,6 @@ if(MSVC)
     add_definitions(-D_SCL_SECURE_NO_WARNINGS)
 endif(MSVC)
 
-# TODO: Maybe some kind of automated test here?
-option(PLASMA_USE_SSE "Enable SSE optimizations?" ON)
-if(PLASMA_USE_SSE)
-    add_definitions(-DHAVE_SSE)
-endif(PLASMA_USE_SSE)
-
 #TODO: Make the OpenSSL includes less promiscuous so this isn't needed
 include_directories(${OPENSSL_INCLUDE_DIR})
 
diff --git a/Sources/Plasma/Apps/plClient/winmain.cpp b/Sources/Plasma/Apps/plClient/winmain.cpp
index 9870afde..985a73f0 100644
--- a/Sources/Plasma/Apps/plClient/winmain.cpp
+++ b/Sources/Plasma/Apps/plClient/winmain.cpp
@@ -49,10 +49,6 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
     #include <dmdfm.h>      // Windows Load EXE into memory suff
 #endif
 
-#ifdef HAVE_SSE
-#   include <intrin.h>
-#endif
-
 #include <curl/curl.h>
 
 #include "HeadSpin.h"
@@ -1388,35 +1384,11 @@ LONG WINAPI plCustomUnhandledExceptionFilter( struct _EXCEPTION_POINTERS *Except
 }
 #endif
 
-bool CheckCPU()
-{
-    const unsigned int sse3_flag = 0x00000001;
-    // (any other CPU features...)
-
-    int cpu_info[4];
-    __cpuid(cpu_info, 1);
-#ifdef HAVE_SSE
-    if((cpu_info[2] & sse3_flag) == 0)
-        return false;
-#endif
-    // Insert additional feature checks here
-
-    return true;
-}
-
 #include "pfConsoleCore/pfConsoleEngine.h"
 PF_CONSOLE_LINK_ALL()
 
 int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR lpCmdLine, int nCmdShow)
 {
-    // Check to make sure we have a good CPU before getting started
-    if (!CheckCPU())
-    {
-        plString msg = plString::Format("Your processor does not support all of the features required to play %S.", ProductLongName());
-        hsMessageBox(msg.c_str(), "Error", hsMessageBoxNormal, hsMessageBoxIconError);
-        return PARABLE_NORMAL_EXIT;
-    }
-
     PF_CONSOLE_INIT_ALL()
 
     // Set global handle
diff --git a/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp b/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
index 1c64314d..878acdc0 100644
--- a/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
+++ b/Sources/Plasma/NucleusLib/pnSceneObject/plCoordinateInterface.cpp
@@ -380,7 +380,6 @@ plProfile_CreateTimer("   CIRecalcT", "Object", CIRecalcT);
 plProfile_CreateTimer("   CIDirtyT", "Object", CIDirtyT);
 plProfile_CreateTimer("   CISetT", "Object", CISetT);
 
-#ifndef HAVE_SSE
 static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
 {
     hsMatrix44 ret;
@@ -441,7 +440,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
 
     return ret;
 }
-#endif // HAVE_SSE
 
 void plCoordinateInterface::IRecalcTransforms()
 {
@@ -449,13 +447,8 @@ void plCoordinateInterface::IRecalcTransforms()
     plProfile_BeginTiming(CIRecalcT);
     if( fParent )
     {
-#ifdef HAVE_SSE
-        fLocalToWorld = fParent->GetLocalToWorld() * fLocalToParent;
-        fWorldToLocal = fParentToLocal * fParent->GetWorldToLocal();
-#else
         fLocalToWorld = IMatrixMul34(fParent->GetLocalToWorld(), fLocalToParent);
         fWorldToLocal = IMatrixMul34(fParentToLocal, fParent->GetWorldToLocal());
-#endif
     }
     else
     {
diff --git a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
index 3fc19b2a..369dd94e 100644
--- a/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
+++ b/Sources/Plasma/PubUtilLib/plDrawable/plDrawableSpans.cpp
@@ -416,7 +416,6 @@ hsBool plDrawableSpans::IBoundsInvalid(const hsBounds3Ext& bnd) const
 }
 
 //// SetTransform ////////////////////////////////////////////////////////////
-#ifndef HAVE_SSE
 static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& rhs)
 {
     hsMatrix44 ret;
@@ -477,7 +476,6 @@ static inline hsMatrix44 IMatrixMul34(const hsMatrix44& lhs, const hsMatrix44& r
 
     return ret;
 }
-#endif
 
 #ifdef MF_TEST_UPDATE
 plProfile_CreateCounter("DSSetTrans", "Update", DSSetTrans);
@@ -521,13 +519,9 @@ plDrawable& plDrawableSpans::SetTransform( uint32_t index, const hsMatrix44& l2w
 #endif // MF_TEST_UPDATE
             for( i = 0; i < spans->GetCount(); i++ )
             {
-#ifdef HAVE_SSE
-                fLocalToWorlds[ (*spans)[ i ] ] = l2w * fLocalToBones[ (*spans)[ i ] ];
-                fWorldToLocals[ (*spans)[ i ] ] = fBoneToLocals[ (*spans)[ i ] ] * w2l;
-#else
                 fLocalToWorlds[ (*spans)[ i ] ] = IMatrixMul34(l2w, fLocalToBones[ (*spans)[ i ] ]);
                 fWorldToLocals[ (*spans)[ i ] ] = IMatrixMul34(fBoneToLocals[ (*spans)[ i ] ], w2l);
-#endif // HAVE_SSE
+
             }
 #ifdef MF_TEST_UPDATE
             plProfile_EndTiming(DSMatTransT);
diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
index 6c93dbbd..413e98db 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@@ -163,8 +163,8 @@ You can contact Cyan Worlds, Inc. by email legal@cyan.com
 
 #include <algorithm>
 
-#ifdef HAVE_SSE
-#   include <smmintrin.h>
+#ifdef HS_SIMD_INCLUDE
+#  include HS_SIMD_INCLUDE
 #endif
 
 //#define MF_TOSSER
@@ -10527,9 +10527,9 @@ void plDXPipeline::LoadResources()
 
 // Sorry about this, but it really did speed up the skinning.
 // Just some macros for the inner loop of IBlendVertsIntoBuffer.
-#ifdef HAVE_SSE
+#ifdef HS_SSE3
 #   define MATRIXMULTBEGIN(xfm, wgt) \
-        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf; \
+        __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
         ALIGN(16) float hack[4]; \
         mc0 = _mm_loadu_ps(xfm.fMap[0]); \
         mc1 = _mm_loadu_ps(xfm.fMap[1]); \
@@ -10541,30 +10541,26 @@ void plDXPipeline::LoadResources()
         _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
         _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
         \
-        hbuf = _mm_hadd_ps(_x, _y); \
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
-        _mm_store_ps(hack, hbuf); \
+        hbuf1 = _mm_hadd_ps(_x, _y); \
+        hbuf2 = _mm_hadd_ps(_z, _z); \
+        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
+        _mm_store_ps(hack, hbuf1); \
         dst.fX += hack[0]; \
         dst.fY += hack[1]; \
-        hbuf = _mm_hadd_ps(_z, _z); \
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
-        _mm_store_ps(hack, hbuf); \
-        dst.fZ += hack[0];
+        dst.fZ += hack[2];
 #   define MATRIXMULTVECTORADD(dst, src) \
         msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
         _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
         _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
         _z  = _mm_mul_ps(_mm_mul_ps(mc2, msr), mwt); \
         \
-        hbuf = _mm_hadd_ps(_x, _y); \
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
-        _mm_store_ps(hack, hbuf); \
+        hbuf1 = _mm_hadd_ps(_x, _y); \
+        hbuf2 = _mm_hadd_ps(_z, _z); \
+        hbuf1 = _mm_hadd_ps(hbuf1, hbuf2); \
+        _mm_store_ps(hack, hbuf1); \
         dst.fX += hack[0]; \
         dst.fY += hack[1]; \
-        hbuf = _mm_hadd_ps(_z, _z); \
-        hbuf = _mm_hadd_ps(hbuf, hbuf); \
-        _mm_store_ps(hack, hbuf); \
-        dst.fZ += hack[0];
+        dst.fZ += hack[2];
 #else
 #   define MATRIXMULTBEGIN(xfm, wgt) \
         float m00 = xfm.fMap[0][0]; \

From 072bf3570ca955d29b677f13f1adc64af434f9b6 Mon Sep 17 00:00:00 2001
From: Joseph Davies <deledrius@gmail.com>
Date: Mon, 23 Apr 2012 22:15:56 -0700
Subject: [PATCH 4/4] Fix support in plDXPipeline for SSE using temporary
 macros.

Re-enables FPU/SSE3 code using the FunctionDispatcher and some quick
hacky macros to template out the two nearly-identical functions,
awaiting branan's deep-voodoo template-specialization functor-dispatcher
patch.
---
 .../PubUtilLib/plPipeline/plDXPipeline.cpp    | 491 +++++++++---------
 .../PubUtilLib/plPipeline/plDXPipeline.h      |  12 +-
 2 files changed, 267 insertions(+), 236 deletions(-)

diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
index 413e98db..9bc0373f 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.cpp
@@ -10525,17 +10525,76 @@ void plDXPipeline::LoadResources()
     plNetClientApp::StaticDebugMsg("End Device Reload");
 }
 
-// Sorry about this, but it really did speed up the skinning.
-// Just some macros for the inner loop of IBlendVertsIntoBuffer.
+// inlTESTPOINT /////////////////////////////////////////
+// Update mins and maxs if destP is outside.
+inline void inlTESTPOINT(const hsPoint3& destP,
+                         float& minX, float& minY, float& minZ,
+                         float& maxX, float& maxY, float& maxZ)
+{
+    if( destP.fX < minX )
+        minX = destP.fX;
+    else if( destP.fX > maxX )
+        maxX = destP.fX;
+
+    if( destP.fY < minY )
+        minY = destP.fY;
+    else if( destP.fY > maxY )
+        maxY = destP.fY;
+
+    if( destP.fZ < minZ )
+        minZ = destP.fZ;
+    else if( destP.fZ > maxZ )
+        maxZ = destP.fZ;
+}
+
+//// IBlendVertsIntoBuffer ////////////////////////////////////////////////////
+//  Given a pointer into a buffer of verts that have blending data in the D3D
+//  format, blends them into the destination buffer given without the blending
+//  info.
+
+// FPU version
+#define MATRIXMULTBEGIN_FPU(xfm, wgt) \
+        float m00 = xfm.fMap[0][0]; \
+        float m01 = xfm.fMap[0][1]; \
+        float m02 = xfm.fMap[0][2]; \
+        float m03 = xfm.fMap[0][3]; \
+        float m10 = xfm.fMap[1][0]; \
+        float m11 = xfm.fMap[1][1]; \
+        float m12 = xfm.fMap[1][2]; \
+        float m13 = xfm.fMap[1][3]; \
+        float m20 = xfm.fMap[2][0]; \
+        float m21 = xfm.fMap[2][1]; \
+        float m22 = xfm.fMap[2][2]; \
+        float m23 = xfm.fMap[2][3]; \
+        float m_wgt = wgt; \
+        float srcX, srcY, srcZ;
+#define MATRIXMULTPOINTADD_FPU(dst, src) \
+        srcX = src.fX; \
+        srcY = src.fY; \
+        srcZ = src.fZ; \
+        \
+        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
+        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
+        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
+#define MATRIXMULTVECTORADD_FPU(dst, src) \
+        srcX = src.fX; \
+        srcY = src.fY; \
+        srcZ = src.fZ; \
+        \
+        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
+        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
+        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
+
+// SSE3 version
 #ifdef HS_SSE3
-#   define MATRIXMULTBEGIN(xfm, wgt) \
+#define MATRIXMULTBEGIN_SSE3(xfm, wgt) \
         __m128 mc0, mc1, mc2, mwt, msr, _x, _y, _z, hbuf1, hbuf2; \
         ALIGN(16) float hack[4]; \
         mc0 = _mm_loadu_ps(xfm.fMap[0]); \
         mc1 = _mm_loadu_ps(xfm.fMap[1]); \
         mc2 = _mm_loadu_ps(xfm.fMap[2]); \
         mwt = _mm_set_ps1(wgt);
-#   define MATRIXMULTPOINTADD(dst, src) \
+#define MATRIXMULTPOINTADD_SSE3(dst, src) \
         msr = _mm_set_ps(1.f, src.fZ, src.fY, src.fX); \
         _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
         _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
@@ -10548,7 +10607,7 @@ void plDXPipeline::LoadResources()
         dst.fX += hack[0]; \
         dst.fY += hack[1]; \
         dst.fZ += hack[2];
-#   define MATRIXMULTVECTORADD(dst, src) \
+#define MATRIXMULTVECTORADD_SSE3(dst, src) \
         msr = _mm_set_ps(0.f, src.fZ, src.fY, src.fX); \
         _x  = _mm_mul_ps(_mm_mul_ps(mc0, msr), mwt); \
         _y  = _mm_mul_ps(_mm_mul_ps(mc1, msr), mwt); \
@@ -10561,250 +10620,214 @@ void plDXPipeline::LoadResources()
         dst.fX += hack[0]; \
         dst.fY += hack[1]; \
         dst.fZ += hack[2];
-#else
-#   define MATRIXMULTBEGIN(xfm, wgt) \
-        float m00 = xfm.fMap[0][0]; \
-        float m01 = xfm.fMap[0][1]; \
-        float m02 = xfm.fMap[0][2]; \
-        float m03 = xfm.fMap[0][3]; \
-        float m10 = xfm.fMap[1][0]; \
-        float m11 = xfm.fMap[1][1]; \
-        float m12 = xfm.fMap[1][2]; \
-        float m13 = xfm.fMap[1][3]; \
-        float m20 = xfm.fMap[2][0]; \
-        float m21 = xfm.fMap[2][1]; \
-        float m22 = xfm.fMap[2][2]; \
-        float m23 = xfm.fMap[2][3]; \
-        float m_wgt = wgt; \
-        float srcX, srcY, srcZ;
-#   define MATRIXMULTPOINTADD(dst, src) \
-        srcX = src.fX; \
-        srcY = src.fY; \
-        srcZ = src.fZ; \
-        \
-        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02 + m03) * m_wgt; \
-        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12 + m13) * m_wgt; \
-        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22 + m23) * m_wgt;
-#   define MATRIXMULTVECTORADD(dst, src) \
-        srcX = src.fX; \
-        srcY = src.fY; \
-        srcZ = src.fZ; \
-        \
-        dst.fX += (srcX * m00 + srcY * m01 + srcZ * m02) * m_wgt; \
-        dst.fY += (srcX * m10 + srcY * m11 + srcZ * m12) * m_wgt; \
-        dst.fZ += (srcX * m20 + srcY * m21 + srcZ * m22) * m_wgt;
-#endif // HAVE_SSE
-
-// inlTESTPOINT /////////////////////////////////////////
-// Update mins and maxs if destP is outside.
-inline void inlTESTPOINT(const hsPoint3& destP, 
-                         float& minX, float& minY, float& minZ, 
-                         float& maxX, float& maxY, float& maxZ)
-{
-    if( destP.fX < minX )
-        minX = destP.fX;
-    else if( destP.fX > maxX )
-        maxX = destP.fX;
-
-    if( destP.fY < minY )
-        minY = destP.fY;
-    else if( destP.fY > maxY )
-        maxY = destP.fY;
-
-    if( destP.fZ < minZ )
-        minZ = destP.fZ;
-    else if( destP.fZ > maxZ )
-        maxZ = destP.fZ;
-}
-
-//// IBlendVertsIntoBuffer ////////////////////////////////////////////////////
-//  Given a pointer into a buffer of verts that have blending data in the D3D
-//  format, blends them into the destination buffer given without the blending
-//  info.
-
-void    plDXPipeline::IBlendVertsIntoBuffer( plSpan* span, 
-                                              hsMatrix44* matrixPalette, int numMatrices,
-                                              const uint8_t *src, uint8_t format, uint32_t srcStride, 
-                                              uint8_t *dest, uint32_t destStride, uint32_t count,
-                                              uint16_t localUVWChans )
-{
-    uint8_t       numUVs, numWeights;
-    uint32_t      i, j, indices, color, specColor, uvChanSize;
-    float       weights[ 4 ], weightSum;
-    hsPoint3    pt, tempPt, destPt;
-    hsVector3   vec, tempNorm, destNorm;
-
-
-    /// Get some counts
-    switch( format & plGBufferGroup::kSkinWeightMask )
-    {
-        case plGBufferGroup::kSkin1Weight:  numWeights = 1; break;
-        case plGBufferGroup::kSkin2Weights: numWeights = 2; break;
-        case plGBufferGroup::kSkin3Weights: numWeights = 3; break;
-        default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" );
-    }
-
-    numUVs = plGBufferGroup::CalcNumUVs( format );
-    uvChanSize = numUVs * sizeof( float ) * 3;
-
-//#define MF_RECALC_BOUNDS
-#ifdef MF_RECALC_BOUNDS
-    float minX = 1.e33f;
-    float minY = 1.e33f;
-    float minZ = 1.e33f;
-
-    float maxX = -1.e33f;
-    float maxY = -1.e33f;
-    float maxZ = -1.e33f;
-#endif // MF_RECALC_BOUNDS
+#endif
 
-    // localUVWChans is bump mapping tangent space vectors, which need to
+// CPU-optimized functions requiring dispatch
+hsFunctionDispatcher<plDXPipeline::blend_vert_buffer_ptr> plDXPipeline::blend_vert_buffer(plDXPipeline::blend_vert_buffer_fpu, 0, 0, plDXPipeline::blend_vert_buffer_sse3);
+
+// Temporary macros for IBlendVertsIntoBuffer dispatch code de-duplication
+#define BLENDVERTSTART \
+    uint8_t     numUVs, numWeights; \
+    uint32_t    i, j, indices, color, specColor, uvChanSize; \
+    float       weights[ 4 ], weightSum; \
+    hsPoint3    pt, tempPt, destPt; \
+    hsVector3   vec, tempNorm, destNorm; \
+    \
+    /* Get some counts */\
+    switch( format & plGBufferGroup::kSkinWeightMask ) \
+    { \
+        case plGBufferGroup::kSkin1Weight:  numWeights = 1; break; \
+        case plGBufferGroup::kSkin2Weights: numWeights = 2; break; \
+        case plGBufferGroup::kSkin3Weights: numWeights = 3; break; \
+        default: hsAssert( false, "Invalid weight count in IBlendVertsIntoBuffer()" ); \
+    } \
+    \
+    numUVs = plGBufferGroup::CalcNumUVs( format ); \
+    uvChanSize = numUVs * sizeof( float ) * 3; \
+    \
+    /* localUVWChans is bump mapping tangent space vectors, which need to
     // be skinned like the normal, as opposed to passed through like 
     // garden variety UVW coordinates.
     // There are no localUVWChans that I know of in production assets (i.e.
-    // the avatar is not skinned).
-    if( !localUVWChans )
-    {
-        /// Copy whilst blending
-        for( i = 0; i < count; i++ )
-        {
-            // Extract data
-            src = inlExtractPoint( src, pt );
-            for( j = 0, weightSum = 0; j < numWeights; j++ )
-            {
-                src = inlExtractFloat( src, weights[ j ] );
-                weightSum += weights[ j ];
-            }
-            weights[ j ] = 1 - weightSum;
-
-            if( format & plGBufferGroup::kSkinIndices )
-            {
-                src = inlExtractUInt32( src, indices );
-            }
-            else
-            {
-                indices = 1 << 8;
-            }
-            src = inlExtractPoint( src, vec );
-            src = inlExtractUInt32( src, color );
-            src = inlExtractUInt32( src, specColor );
-
-            // Blend
-            destPt.Set( 0, 0, 0 );
-            destNorm.Set( 0, 0, 0 );
-            for( j = 0; j < numWeights + 1; j++ )
-            {
-                if( weights[ j ] )
+    // the avatar is not skinned).*/\
+    if( !localUVWChans ) \
+    { \
+        /* Copy whilst blending */\
+        for( i = 0; i < count; i++ ) \
+        { \
+            /* Extract data */\
+            src = inlExtractPoint( src, pt ); \
+            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
+            { \
+                src = inlExtractFloat( src, weights[ j ] ); \
+                weightSum += weights[ j ]; \
+            } \
+            weights[ j ] = 1 - weightSum; \
+            \
+            if( format & plGBufferGroup::kSkinIndices ) \
+            { \
+                src = inlExtractUInt32( src, indices ); \
+            } \
+            else \
+            { \
+                indices = 1 << 8; \
+            } \
+            src = inlExtractPoint( src, vec ); \
+            src = inlExtractUInt32( src, color ); \
+            src = inlExtractUInt32( src, specColor ); \
+            \
+            /* Blend */\
+            destPt.Set( 0, 0, 0 ); \
+            destNorm.Set( 0, 0, 0 ); \
+            for( j = 0; j < numWeights + 1; j++ ) \
+            { \
+                if( weights[ j ] ) \
                 {
+                    /*
                     MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
 
                     MATRIXMULTPOINTADD(destPt, pt);
                     MATRIXMULTVECTORADD(destNorm, vec);
-                }
-
-                indices >>= 8;
-            }
-            // Probably don't really need to renormalize this. There errors are
-            // going to be subtle and "smooth".
-//          hsFastMath::NormalizeAppr(destNorm);
-
-#ifdef MF_RECALC_BOUNDS
-            inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ);
-#endif // MF_RECALC_BOUNDS
-
-            // Slam data into position now
-            dest = inlStuffPoint( dest, destPt );
-            dest = inlStuffPoint( dest, destNorm );
-            dest = inlStuffUInt32( dest, color );
-            dest = inlStuffUInt32( dest, specColor );
-            memcpy( dest, src, uvChanSize );
-            src += uvChanSize;
-            dest += uvChanSize;
-        }
-    }
-    else 
-    {
-        uint8_t hiChan = localUVWChans >> 8;
-        uint8_t loChan = localUVWChans & 0xff;
-        /// Copy whilst blending
-        for( i = 0; i < count; i++ )
-        {
-            hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels];
-            hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels];
-
-            // Extract data
-            src = inlExtractPoint( src, pt );
-            for( j = 0, weightSum = 0; j < numWeights; j++ )
-            {
-                src = inlExtractFloat( src, weights[ j ] );
-                weightSum += weights[ j ];
-            }
-            weights[ j ] = 1 - weightSum;
-
-            if( format & plGBufferGroup::kSkinIndices )
-            {
-                src = inlExtractUInt32( src, indices );
-            }
-            else
-            {
-                indices = 1 << 8;
-            }
-
-            src = inlExtractPoint( src, vec );
-            src = inlExtractUInt32( src, color );
-            src = inlExtractUInt32( src, specColor );
-
-            uint8_t k;
-            for( k = 0; k < numUVs; k++ )
-            {
-                src = inlExtractPoint( src, srcUVWs[k] );
-            }
-            memcpy( dstUVWs, srcUVWs, uvChanSize);
-            dstUVWs[loChan].Set(0,0,0);
-            dstUVWs[hiChan].Set(0,0,0);
-
-            // Blend
-            destPt.Set( 0, 0, 0 );
-            destNorm.Set( 0, 0, 0 );
-            for( j = 0; j < numWeights + 1; j++ )
-            {
-                if( weights[ j ] )
-                {
+                    */
+#define BLENDVERTMID \
+                } \
+                \
+                indices >>= 8; \
+            } \
+            /* Probably don't really need to renormalize this. There errors are
+            // going to be subtle and "smooth".*/\
+            /* hsFastMath::NormalizeAppr(destNorm);*/ \
+            \
+            /* Slam data into position now */\
+            dest = inlStuffPoint( dest, destPt ); \
+            dest = inlStuffPoint( dest, destNorm ); \
+            dest = inlStuffUInt32( dest, color ); \
+            dest = inlStuffUInt32( dest, specColor ); \
+            memcpy( dest, src, uvChanSize ); \
+            src += uvChanSize; \
+            dest += uvChanSize; \
+        } \
+    } \
+    else \
+    { \
+        uint8_t hiChan = localUVWChans >> 8; \
+        uint8_t loChan = localUVWChans & 0xff; \
+        /* Copy whilst blending */\
+        for( i = 0; i < count; i++ ) \
+        { \
+            hsVector3 srcUVWs[plGeometrySpan::kMaxNumUVChannels]; \
+            hsVector3 dstUVWs[plGeometrySpan::kMaxNumUVChannels]; \
+            \
+            /* Extract data */\
+            src = inlExtractPoint( src, pt ); \
+            for( j = 0, weightSum = 0; j < numWeights; j++ ) \
+            { \
+                src = inlExtractFloat( src, weights[ j ] ); \
+                weightSum += weights[ j ]; \
+            } \
+            weights[ j ] = 1 - weightSum; \
+            \
+            if( format & plGBufferGroup::kSkinIndices ) \
+            { \
+                src = inlExtractUInt32( src, indices ); \
+            } \
+            else \
+            { \
+                indices = 1 << 8; \
+            } \
+            \
+            src = inlExtractPoint( src, vec ); \
+            src = inlExtractUInt32( src, color ); \
+            src = inlExtractUInt32( src, specColor ); \
+            \
+            uint8_t k; \
+            for( k = 0; k < numUVs; k++ ) \
+            { \
+                src = inlExtractPoint( src, srcUVWs[k] ); \
+            } \
+            memcpy( dstUVWs, srcUVWs, uvChanSize); \
+            dstUVWs[loChan].Set(0,0,0); \
+            dstUVWs[hiChan].Set(0,0,0); \
+            \
+            /* Blend */\
+            destPt.Set( 0, 0, 0 ); \
+            destNorm.Set( 0, 0, 0 ); \
+            for( j = 0; j < numWeights + 1; j++ ) \
+            { \
+                if( weights[ j ] ) \
+                { \
+                    /*
                     MATRIXMULTBEGIN(matrixPalette[indices & 0xff], weights[j]);
 
                     MATRIXMULTPOINTADD(destPt, pt);
                     MATRIXMULTVECTORADD(destNorm, vec);
                     MATRIXMULTVECTORADD(dstUVWs[loChan], srcUVWs[loChan]);
                     MATRIXMULTVECTORADD(dstUVWs[hiChan], srcUVWs[hiChan]);
-                }
-
-                indices >>= 8;
-            }
-            // Probably don't really need to renormalize this. There errors are
-            // going to be subtle and "smooth".
-//          hsFastMath::NormalizeAppr(destNorm);
-//          hsFastMath::NormalizeAppr(dstUVWs[loChan]);
-//          hsFastMath::NormalizeAppr(dstUVWs[hiChan]);
-
-#ifdef MF_RECALC_BOUNDS
-            inlTESTPOINT(destPt, minX, minY, minZ, maxX, maxY, maxZ);
-#endif // MF_RECALC_BOUNDS
-
-            // Slam data into position now
-            dest = inlStuffPoint( dest, destPt );
-            dest = inlStuffPoint( dest, destNorm );
-            dest = inlStuffUInt32( dest, color );
-            dest = inlStuffUInt32( dest, specColor );
-            memcpy( dest, dstUVWs, uvChanSize );
-            dest += uvChanSize;
-        }
-    }
-#ifdef MF_RECALC_BOUNDS
-    hsBounds3Ext wBnd;
-    wBnd.Reset(&hsPoint3(minX, minY, minZ));
-    wBnd.Union(&hsPoint3(maxX, maxY, maxZ));
-    span->fWorldBounds = wBnd;
-#endif // MF_RECALC_BOUNDS
+                    */
+#define BLENDVERTEND \
+                } \
+                \
+                indices >>= 8; \
+            } \
+            /* Probably don't really need to renormalize this. There errors are
+            // going to be subtle and "smooth". */\
+            /* hsFastMath::NormalizeAppr(destNorm); */\
+            /* hsFastMath::NormalizeAppr(dstUVWs[loChan]); */\
+            /* hsFastMath::NormalizeAppr(dstUVWs[hiChan]); */\
+            \
+            /* Slam data into position now */\
+            dest = inlStuffPoint( dest, destPt ); \
+            dest = inlStuffPoint( dest, destNorm ); \
+            dest = inlStuffUInt32( dest, color ); \
+            dest = inlStuffUInt32( dest, specColor ); \
+            memcpy( dest, dstUVWs, uvChanSize ); \
+            dest += uvChanSize; \
+        } \
+    }
+
+void plDXPipeline::blend_vert_buffer_fpu( plSpan* span,
+                                          hsMatrix44* matrixPalette, int numMatrices,
+                                          const uint8_t *src, uint8_t format, uint32_t srcStride,
+                                          uint8_t *dest, uint32_t destStride, uint32_t count,
+                                          uint16_t localUVWChans )
+{
+    BLENDVERTSTART
+                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
+
+                    MATRIXMULTPOINTADD_FPU(destPt, pt);
+                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
+    BLENDVERTMID
+                    MATRIXMULTBEGIN_FPU(matrixPalette[indices & 0xff], weights[j]);
+
+                    MATRIXMULTPOINTADD_FPU(destPt, pt);
+                    MATRIXMULTVECTORADD_FPU(destNorm, vec);
+                    MATRIXMULTVECTORADD_FPU(dstUVWs[loChan], srcUVWs[loChan]);
+                    MATRIXMULTVECTORADD_FPU(dstUVWs[hiChan], srcUVWs[hiChan]);
+
+    BLENDVERTEND
+}
+
+void plDXPipeline::blend_vert_buffer_sse3( plSpan* span,
+                                           hsMatrix44* matrixPalette, int numMatrices,
+                                           const uint8_t *src, uint8_t format, uint32_t srcStride,
+                                           uint8_t *dest, uint32_t destStride, uint32_t count,
+                                           uint16_t localUVWChans )
+{
+#ifdef HS_SSE3
+    BLENDVERTSTART
+                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
+
+                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
+                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+    BLENDVERTMID
+                    MATRIXMULTBEGIN_SSE3(matrixPalette[indices & 0xff], weights[j]);
+
+                    MATRIXMULTPOINTADD_SSE3(destPt, pt);
+                    MATRIXMULTVECTORADD_SSE3(destNorm, vec);
+                    MATRIXMULTVECTORADD_SSE3(dstUVWs[loChan], srcUVWs[loChan]);
+                    MATRIXMULTVECTORADD_SSE3(dstUVWs[hiChan], srcUVWs[hiChan]);
+    BLENDVERTEND
+#endif // HS_SSE3
 }
 
 // ISetPipeConsts //////////////////////////////////////////////////////////////////
diff --git a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
index cdc4e2e4..1d83fce9 100644
--- a/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
+++ b/Sources/Plasma/PubUtilLib/plPipeline/plDXPipeline.h
@@ -465,7 +465,8 @@ protected:
     void            IBlendVertsIntoBuffer( plSpan* span, 
                                             hsMatrix44* matrixPalette, int numMatrices,
                                             const uint8_t *src, uint8_t format, uint32_t srcStride, 
-                                            uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans );
+                                            uint8_t *dest, uint32_t destStride, uint32_t count, uint16_t localUVWChans )
+                                                { blend_vert_buffer.call(span, matrixPalette, numMatrices, src, format, srcStride, dest, destStride, count, localUVWChans); };
     hsBool          ISoftwareVertexBlend( plDrawableSpans* drawable, const hsTArray<int16_t>& visList );
 
 
@@ -734,7 +735,7 @@ public:
     virtual void                        GetDepth(float& hither, float& yon) const;
     virtual void                        SetDepth(float hither, float yon);
 
-    virtual float                    GetZBiasScale() const;
+    virtual float                       GetZBiasScale() const;
     virtual void                        SetZBiasScale(float scale);
 
     virtual const hsMatrix44&           GetWorldToCamera() const;
@@ -798,6 +799,13 @@ public:
     virtual int                         GetMaxAnisotropicSamples();
     virtual int                         GetMaxAntiAlias(int Width, int Height, int ColorDepth);
 
+
+    //  CPU-optimized functions
+protected:
+    typedef void(*blend_vert_buffer_ptr)(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
+    static void blend_vert_buffer_fpu(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
+    static void blend_vert_buffer_sse3(plSpan*, hsMatrix44*, int, const uint8_t *, uint8_t , uint32_t, uint8_t *, uint32_t, uint32_t, uint16_t);
+    static hsFunctionDispatcher<blend_vert_buffer_ptr> blend_vert_buffer;
 };