vs.1.1                  // vertex shader, version 1.1

dcl_position v0         // v0 = input vertex position; it is the only vertex input read below

//m4x4 oPos, v0, c0


/*
In fact, I was trying to understand how it was possible to expand FRC into 4
instructions...
Actually, I can do it in 7 instructions :)

EXPP r0.y, r1.xxxx
MOV r0.x, r0.y
EXPP r0.y, r1.zzzz
MOV r0.z, r0.y
EXPP r0.y, r1.wwww
MOV r0.w, r0.y
EXPP r0.y, r1.yyyy
*/

/*
   // Constants for sin and cos. 3 term approximation seems plenty
   // (it's what i used for software sim, and had no visibly different
   // results than the math library functions).
   // When doing sin/cos together, some speedup might be obtained
   // with good pairing of ops doing them simultaneously. Also save
   // an instruction calculating r0^3.
        D3DXVECTOR4 vSin( 1.0f, -1.0f/6.0f, 1.0f/120.0f, -1.0f/5040.0f );
        D3DXVECTOR4 vCos( 1.0f, -1.0f/2.0f, 1.0f/ 24.0f, -1.0f/ 720.0f );
*/

/*
Cos():


  r1 = mul(r0, r0);     // r0^2
  r2 = mul(r1, r1);     // r0^4

  //cos
  r3 = mad( r1, vCos.yyyy, vCos.xxxx );
  r3 = mad( r2, vCos.zzzz, r3 );
*/

/*
Sin();
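  r1 = mul(r0, r0);     // r0^2
  r1 = mul(r0, r1);     // r0^3
  r2 = mul(r1, r1);     // r0^6 -- NOTE: the vSin.z term wants r0^5 (r0^2 * r0^3), which is what the live sincos code further down computes

  r3 = mad( r1, vSin.yyyy, r0 );
  r3 = mad( r2, vSin.zzzz, r3 );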
*/

/*
SinCos():

  r1 = mul(r0, r0);     // r0^2
  r2 = mul(r1, r0);     // r0^3 // probably stall
  r3 = mul(r1, r1);     // r0^4
  r4 = mul(r2, r2);     // r0^6

  r5 = mad( r1, vCos.yyyy, vCos.xxxx );
  r6 = mad( r2, vSin.yyyy, r0 );
  r5 = mad( r3, vCos.zzzz, r5 );
  r6 = mad( r4, vSin.zzzz, r6 );

*/

/*
consts
   kOneOverEightNsqPi      = 1.f / ( 8.f * Pi * 4.f * 4.f );
   kPiOverTwo           = Pi / 2.f;
   kTwoPi               = Pi * 2.f;
   kPi                  = Pi;
*/
/*
CONSTANT REGISTERS
VOLATILE CONSTS - change per invocation
C0-C3 local2proj matrix
C4    color
C5    freq vector
C6    phase vector
C7    amplitude vector
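C8    center0  (read by the code below as the X components of the four wave directions)
C9    center1  (read by the code below as the Y components of the four wave directions)
C10      center2  (read by the code below as the row that produces oT0.x from v0)
C11      center3  (read by the code below as the row that produces oT0.y from v0)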
C12      scrunch = (scrunch, -scrunch, 0, 1);
CONSTANT CONSTS - forever more
C13      SinConsts = (1.0f, -1.0f/6.0f, 1.0f/120.0f, -1.0f/5040.0f);
C14      CosConsts = (1.0f, -1.0f/2.0f, 1.0f/ 24.0f, -1.0f/ 720.0f);
C15      PiConsts = (1.f / 8*Pi*N^2, Pi/2, Pi, 2*Pi);
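C16      numberConsts = (0.f, 0.5f, 1.f, 2.f);
Also read by the code below, but not listed above:
C17      camera position (possibly the pseudo-camera position described near the eye-ray code)
C18      wind direction terms, read with swizzles .xyww and .zxww
C19      xyz = F = (envCenter - camPos), w = G = F^2 - R^2
C20      scale applied to the attenuation value written to oD1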
//=====================================
TEMP REGISTERS
r6    accumPos
r7    accumCos
r8    toCenter_Y
r9    toCenter_X
r11      filter
r10      tempFloat
*/
// const float4 kCosConsts = float4(1.0f, -1.0f/2.0f, 1.0f/ 24.0f, -1.0f/ 720.0f);
// const float4 kSinConsts = float4(1.0f, -1.0f/6.0f, 1.0f/120.0f, -1.0f/5040.0f);

// const float4 kPiConsts = float4(1.f / (8.f * 3.1415f * 16.f), 3.1415f*0.5f, 3.1415f, 3.1415f*2.f);
// const float4 k0512 = float4(0.f, 0.5f, 1.f, 2.f);

// ---- Distance filter --------------------------------------------------
// r6 (accumPos) starts out as the untouched input position.
   mov   r6, v0
//
// For each wave
// {
//    Fade each wave out with distance from the local origin:
//       dist   = dot3(pos, pos) * freq * freq * kOneOverEightNsqPi
//       dist   = min(dist, kPiOverTwo)
//       filter = cos(dist) ~= 1 + dist^2 * cos.y + dist^4 * cos.z
//    Every lane of r0 carries one wave, so four waves are handled at once.
   dp3   r0, r6, r6              // squared distance, same value in every lane
   mul   r0, r0, c5              // * freq
   mul   r0, r0, c5              // * freq again => * freq^2
   mul   r0, r0, c15.x           // * kOneOverEightNsqPi (combine into freq^2?)
   min   r0, r0, c15.y           // clamp to kPiOverTwo
   mul   r1, r0, r0              // dist^2
   mul   r2, r1, r1              // dist^4
   mul   r1, r1, c14.y           // dist^2 * cos.y
   add   r11, r1, c14.x          // cos.x + dist^2 * cos.y
   mad   r11, r2, c14.z, r11     // + dist^4 * cos.z  => r11 = filter


//    filter *= kAmplitude.xyzw;
//   mul         r11, r11, c7;
//    // Notice that if dist is a 4vec, all this can be simultaneously done for 4 waves at a time.
//
//    Combine the position with each wave's direction. The result lands in r0,
//    one wave per lane: r0 = c8 * pos.x + c9 * pos.y.
//    (An earlier note here said the x/y distances are stuffed into r9 and r8;
//    the two instructions below write neither register.)
   // toCenter_X.x = dir0.x * pos.x;
   // toCenter_Y.x = dir0.y * pos.y;
   mul      r0, c8, r6.xxxx;
   mad      r0, c9, r6.yyyy, r0;

//
//    dist = mad( dist, kFreq.xyzw, kPhase.xyzw);
//    Written as mul + add instead of one mad. NOTE(review): presumably because
//    vs.1.1 allows an instruction to read only one constant register and
//    c5 / c6 are two different ones -- confirm against the shader model docs.
   mul         r0, r0, c5;
   add          r0, r0, c6;
//
//    // Now we need dist mod'd into range [-Pi..Pi]
//    Sequence: add Pi, scale by 1 / (2*Pi), keep the fractional part,
//    scale back by 2*Pi, subtract Pi.
//    dist *= rcp(kTwoPi);
   rcp         r4, c15.wwww;
   add          r0, r0, c15.zzzz;
   mul         r0, r0, r4;
//    dist = frac(dist);
//    expp works on one replicated source component and returns the fractional
//    part in the y component of its result. r1.y is therefore used as scratch
//    for the x, z and w lanes (each copied out by the mov that follows), and
//    the y lane is processed last so that its own result is what stays in r1.y.
   expp     r1.y, r0.xxxx
   mov      r1.x, r1.yyyy
   expp     r1.y, r0.zzzz
   mov      r1.z, r1.yyyy
   expp     r1.y, r0.wwww
   mov      r1.w, r1.yyyy
   expp     r1.y, r0.yyyy
//    dist *= kTwoPi;
   mul         r0, r1, c15.wwww;
//    dist += -kPi;
   sub         r0, r0, c15.zzzz;

//
//    sincos(dist, sinDist, cosDist);
   // sin = r0 + r0^3 * vSin.y + r0^5 * vSin.z
   // cos = 1 + r0^2 * vCos.y + r0^4 * vCos.z
   //
   // In:  r0 = angle, one wave per lane.
   // Out: r2 = sin(r0), r1 = cos(r0); each is a four-term power series once
   //      the r0^7 / r0^6 terms are added at the end of this section.
   // r3, r4 and r5 only hold powers of r0 along the way.
   // The sin and cos chains are interleaved instruction by instruction; the
   // notes near the top of the file mention instruction pairing as the reason
   // for doing the two together.
   mul         r1, r0, r0; // r0^2
   mul         r2, r1, r0; // r0^3 - probably stall
   mul         r3, r1, r1; // r0^4
   mul         r4, r1, r2; // r0^5
   mul         r5, r2, r3; // r0^7

   mul         r1, r1, c14.yyyy;       // r1 = r0^2 * vCos.y
   mad         r2, r2, c13.yyyy, r0;   // r2 = r0 + r0^3 * vSin.y
   add         r1, r1, c14.xxxx;       // r1 = 1 + r0^2 * vCos.y
   mad         r2, r4, c13.zzzz, r2;   // r2 = r0 + r0^3 * vSin.y + r0^5 * vSin.z
   mad         r1, r3, c14.zzzz, r1;   // r1 = 1 + r0^2 * vCos.y + r0^4 * vCos.z

   // r0^7 & r0^6 terms
   mul         r4, r4, r0; // r0^6 (r4 held r0^5 until here)
   mad         r2, r5, c13.wwww, r2;   // r2 += r0^7 * vSin.w
   mad         r1, r4, c14.wwww, r1;   // r1 += r0^6 * vCos.w

// ---- Height and slope accumulation -----------------------------------
// On entry: r2 = sinDist, r1 = cosDist, r11 = distance filter.
//mov    r2, r1;   (disabled debug line kept from the original)
//
// accumCos = (0, 0, 0, 0); only its x and y lanes are written below.
   mov   r7, c16.x
//
// height = dp4(sinDist * filter * kAmplitude, kOne)
// It overwrites accumPos.z directly (the original note: accumPos.z is 0 here,
// so "+=" and "=" are the same thing).
   mul   r2, r2, r11             // sinDist *= filter
   mul   r2, r2, c7              // sinDist *= kAmplitude
   dp4   r6.z, r2, c16.z         // dot with (1,1,1,1) sums the four waves
//
// cosDist is scaled by kFreq, then kAmplitude (combine?), then filter.
   mul   r1, r1, c5              // cosDist *= kFreq
   mul   r1, r1, c7              // cosDist *= kAmplitude
   mul   r1, r1, r11             // cosDist *= filter
//
// accumCos.x = dp4(cosDist, -toCenter_X)   (accumCos was zero, so no add needed)
   dp4   r7.x, r1, -c8
// accumCos.y = dp4(cosDist, -toCenter_Y)
   dp4   r7.y, r1, -c9
//
// }
//
// Unnormalized surface frame implied by the accumulated slopes:
//    accumBin  = (1, 0, -accumCos.x)
//    accumTan  = (0, 1, -accumCos.y)
//    accumNorm = (accumCos.x, accumCos.y, 1)
// Only accumNorm is built at this point. It goes into r11, replacing the
// distance filter that r11 held until now, and is scaled to unit length.
   add   r11, c16.xxzx, r7       // (0, 0, 1, 0) + accumCos
   dp3   r10.x, r11, r11         // squared length
   rsq   r10.x, r10.x            // 1 / length
   mul   r11, r11, r10.x         // r11 = normalized normal

//
// // Scrunch in based on computed (normalized) normal
// temp = mul( accumNorm, kNegScrunchScale ); // kNegScrunchScale = (-scrunchScale, -scrunchScale, 0, 0);
// accumPos += temp;
// r10.x = dot3(normalized normal in r11, c18.zxw)
   dp3          r10.x, r11, c18.zxw; // winddir.x, winddir.y, 0, 0
   // r10.x tells us whether our normal is opposed to the wind.
   // If opposed, r10.x = 0, else r10.x = 1.f;
   // NOTE(review): the slt below sets r10.x to 1.0 when the dot product is
   // less than c16.x (0.0) and to 0.0 otherwise, i.e. 1.0 for a negative dot.
   // That reads as the reverse of the line above -- confirm which is intended.
   // We'll use this to kill the Scrunch on the back sides of waves.
   // We use it for position right here, and then again for the
   // normal just down a bit further.
   slt          r10.x, r10.x, c16.x;
   mul          r9, r10.xxxx, r11;    // r9 = normal, zeroed where the gate is 0

   // accumPos += gated normal * (-scrunch, -scrunch, 0, 0); z and w are untouched
   mad         r6, r9, c12.yyzz, r6;

//   mul            r6.z, r6.z, r10.xxxx; DEBUG

//   mad         r6, r11, c12.yyzz, r6;

// accumNorm = mul (accumNorm, kScrunchScale ); // kScrunchScale = (scrunchScale, scrunchScale, 1, 1);
   // accumCos *= (scrunchScale, scrunchScale, 0, 0);

   // r2.x = 1 + height * scrunch * gate, where height is accumPos.z (r6.z)
   mul          r2.x, r6.z, c12.x;
   mul          r2.x, r2.x, r10.x; // ???
   add          r2.x, r2.x, c16.z;

//   mul         r7, r7, c12.xxzz;
   mul          r7.xy, r7.xy, r2.xx;   // accumCos.xy *= r2.x

// This is actually wrong, but useful right now for visualizing the generated coords.
// See below for correct version.

   // r3 = (0, 0, 1, 0) - accumCos.xyzz = (-accumCos.x, -accumCos.y, 1 - accumCos.z, -accumCos.z)
   sub          r3, c16.xxzx, r7.xyzz;

   // Normalize?

   // We can either calculate an orthonormal basis from the
   // computed normal, with Binormal = (0,1,0) X Normal, Tangent = Normal X (1,0,0),
   // or compute our basis directly from the partial derivatives, with
   // Binormal = (1, 0, -cosX), Tangent = (0, 1, -cosY), Normal = (cosX, cosY, 1)
   //
   // These work out to identically the same result, so we'll compute directly
   // from the partials because it takes 2 fewer instructions.
   //
   // Note that our basis is NOT orthonormal. The Normal is equal to
   // Binormal X Tangent, but Dot(Binormal, Tangent) != 0. The Binormal and Tangents
   // are both correct tangents to the surface, and their projections on the XY plane
   // are 90 degrees apart, but in 3-space, they are not orthogonal. Practical implications?
   // Not really. I'm actually not really sure which is more "proper" for bump mapping.
   //
   // Note also that we add when we should subtract and subtract when we should
   // add, so that r1, r2, r3 aren't Binormal, Tangent, Normal, but the rows
   // of our transform, (Bx, Tx, Nx), (By, Ty, Ny), (Bz, Tz, Nz). See below for
   // explanation.
   //
   // Binormal = Y % Normal
   // Cross product3 is:
   //   mul     res.xyz, a.yzx, b.zxy
   //   mad     res.xyz, -a.zxy, b.yzx, res.xyz
//   mul            r1.xyz, c16.zxx, r3.zxy;
//   mad            r1.xyz, -c16.xxz, r3.yzx, r1.xyz;

   // Tangent = Normal % X
//   mul            r2.xyz, r3.yzx, c16.xzx;
//   mad            r2.xyz, -r3.zxy, c16.xxz, r2;

   // Rows built straight from the partials held in accumCos (r7):
   //    r1 = (1, 0, 0, 0) + r7.zzxz   -> accumCos.x lands in the z lane
   //    r2 = (0, 1, 0, 0) + r7.zzyz   -> accumCos.y lands in the z lane
   add   r1, r7.zzxz, c16.zxxx
   add   r2, r7.zzyz, c16.xzxx

   // Note that we're swapping z and y to match our environment map tools in max.
   // We do this through our normal map transform (oT1, oT2, oT3), making it
   // a concatenation of:
   //
   //   rotate about Z (blue) to turn our map into the wind
   //   windRot =   |   dirY    -dirX   0 |
   //               |   dirX    dirY    0 |
   //               |   0       0       1 |
   //
   //   swap our Y and Z axes to match our environment map
   //   swapYZ  =   |   1       0       0 |
   //               |   0       0       1 |
   //               |   0       1       0 |
   //
   //   rotate the normal into the surface's tangent space basis
   //   basis   =   |   Bx      Tx      Nx |
   //               |   By      Ty      Ny |
   //               |   Bz      Tz      Nz |
   //
   //   Note that we've constructed the basis by taking advantage of the
   //   matrix being a pure rotation, as noted below, so r1, r2 and r3
   //   are actually constructed as:
   //   basis   =   |   Bx      -By     -Bz |
   //               |   -Tx     Ty      -Tz |
   //               |   -Nx     -Ny     -Nz |
   //
   //   Then the final normal map transform is:
   //
   //       basis * swapYZ * windRot [ * normal ]


//   sub         r1.w, c17.x, r6.x;
//   sub         r2.w, c17.z, r6.z;
//   sub         r3.w, c17.y, r6.y;

  // Big note here. All this math can blow up if the camera position
   // is outside the environment sphere. It's assumed that's dealt
   // with in the app setting up the constants. For that reason, the
   // camera position used here might not be the real local camera position,
   // which is needed for the angular attenuation, so we burn another constant
   // with our pseudo-camera position. To restrain the pseudo-camera from
   // leaving the sphere, we make:
   //   pseudoPos = envCenter + (realPos - envCenter) * dist * R / (dist + R)
   // where dist = |realPos - envCenter|

   // So, our "finitized" eyeray is:
   //   camPos + D * t - envCenter = D * t - (envCenter - camPos)
   // with
   //   D = (pos - camPos) / |pos - camPos| // normalized usual eyeray
   // and
   //   t = D dot F + sqrt( (D dot F)^2 - G )
   // with
   //   F = (envCenter - camPos)    => c19.xyz
   //   G = F^2 - R^2               => c19.w
   //   R = environment radius.     => unused
   //
   // This all derives from the positive root of equation
   //   (camPos + (pos - camPos) * t - envCenter)^2 = R^2,
   // In other words, where on a sphere of radius R centered about envCenter
   // does the ray from the real camera position through this point hit.
   //
   // Note that F, G, and R are all constants (one point, two scalars).
   //
   // So first we calculate D into r0,
   // then D dot F into r10.x,
   // then (D dot F)^2 - G into r10.y
   // then rsq( (D dot F)^2 - G ) into r9.x;
   // then t = r10.z = r10.x + r10.y * r9.x;
   // and
   // r0 = D * t - (envCenter - camPos)
   //       = r0 * r10.zzzz - F;
   //
   // D = normalize(pos - camPos): r0 = r6 - c17, scaled by 1 / length
   sub          r0, r6, c17;
   dp3          r10.x, r0, r0;
   rsq          r10.x, r10.x;
   mul          r0, r0, r10.xxxx;

   // r10.x = D dot F (F is c19.xyz)
   dp3          r10.x, r0, c19;
   // r10.y = (D dot F)^2 - G (G is c19.w)
   mad          r10.y, r10.x, r10.x, -c19.w;

   // r9.x = 1 / sqrt(r10.y). Nothing here guards against r10.y <= 0; the
   // notes above leave that to the app keeping the camera inside the sphere.
   rsq          r9.x, r10.y;

   // t = r10.z = r10.y * (1 / sqrt(r10.y)) + r10.x = sqrt((D dot F)^2 - G) + D dot F
   mad          r10.z, r10.y, r9.x, r10.x;

   // r0.xyz = D * t - F
   mad          r0.xyz, r0, r10.zzz, -c19.xyz;

   // The negated ray components ride along in the w lanes of the three basis rows.
   mov          r1.w, -r0.x;
   mov          r2.w, -r0.y;
   mov          r3.w, -r0.z;

   // Now rotate our basis vectors into the wind, normalize them and emit them.
   // Each row (r1, r2, r3) gets the same treatment:
   //    new x = dot3(row, c18.xyww), new y = dot3(row, c18.zxww)
   //    row.xyz is scaled by 1 / |row.xyz|
   // The w lane carries the eye-ray term and must pass through unscaled,
   // so 1.0 is parked in r0.w and picked up by the .xxxw swizzle.
   mov   r0.w, c16.z

   // r1 -> oT1
   dp3   r0.x, r1, c18.xyww
   dp3   r0.y, r1, c18.zxww
   mov   r1.xy, r0
   dp3   r0.x, r1, r1
   rsq   r0.x, r0.x
   mul   oT1, r1, r0.xxxw
//   mul            r8, r1.xyzw, r0.xxxw; // VISUAL

   // r2 -> oT3 (r2 and r3 go out swapped: r2 to oT3, r3 to oT2)
   dp3   r0.x, r2, c18.xyww
   dp3   r0.y, r2, c18.zxww
   mov   r2.xy, r0
   dp3   r0.x, r2, r2
   rsq   r0.x, r0.x
   mul   oT3, r2, r0.xxxw
//   mul            r9, r2.xyzw, r0.xxxw; // VISUAL

   // r3 -> oT2
   dp3   r0.x, r3, c18.xyww
   dp3   r0.y, r3, c18.zxww
   mov   r3.xy, r0
   dp3   r0.x, r3, r3
   rsq   r0.x, r0.x
   mul   oT2, r3, r0.xxxw
//   mul            r9, r3.xyzw, r0.xxxw; // VISUAL

//  mul        r3, r3.xzyw, r0.xxxw;
//  mul         r3.xy, r3, -c16.zzzz;

/*
   // Want:
   //    oT1 = (BIN.x, TAN.x, NORM.x, view2pos.x)
   //    oT2 = (BIN.y, TAN.y, NORM.y, view2pos.y)
   //    ot3 = (BIN.z, TAN.z, NORM.z, view2pos.z)
   // with BIN, TAN, and NORM normalized.
   // Unnormalized, we have
   //    BIN = (1, 0, -r7.x) where r7 == accumCos
   //    TAN = (0, 1, -r7.y)
   //    NORM= (r7.x, r7.y, 1)
   // So, unnormalized, we have
   //    oT1 = (1, 0, r7.x, view2pos.x)
   //    oT2 = (0, 1, r7.y, view2pos.y)
   //    oT3 = (-r7.x, -r7.y, 1, view2pos.z)
   // which is just reversing the signs on the accumCos
   // terms above. So the normalized version is just
   // reversing the signs on the normalized version above.
*/
//mov oT3, r4;

//
// ---- Transform position to screen; emit base color and base UVs -------
//
// oPos = accumPos (r6) through the local2proj rows c0..c3. This is the
// m4x4 macro written out as its four dot products.
   dp4   oPos.x, r6, c0
   dp4   oPos.y, r6, c1
   dp4   oPos.z, r6, c2
   dp4   oPos.w, r6, c3

// Diffuse is the plain color constant; still need to attenuate based on position.
   mov   oD0, c4

// oT0 = (dp4(v0, c10), dp4(v0, c11), 0, 1), written lane by lane.
// Original note: this should be in local space after xforming v0.
   dp4   oT0.x, v0, c10
   dp4   oT0.y, v0, c11
   mov   oT0.zw, c16.xxxz
//   mov            oT0, v7;

// Questionable attenuation follows
    // r0 = normalize(camPos - pos): the direction from this point to the camera
    add   r0, c17, -r6
    dp3   r1.x, r0, r0
    rsq   r1.x, r1.x
    mul   r0, r0, r1.x
    // r1.x = that direction dotted with the computed normal (r11)
    dp3   r1.x, r0, r11
//  dp3         r1.x, r0, r3; // if you want the adjusted normal, you'll need to normalize/swizzle r3
    // All lanes: 1 - dot, so dot=1 => 0 and dot=0 => 1
    sub   r1, c16.z, r1.x
    // w lane only: (w + 1) * 0.5
    add   r1.w, r1.w, c16.z
    mul   r1.w, r1.w, c16.y
    // No clamp here: the destination register (in the pixel shader)
    // will saturate [0..1] anyway.
    mul   oD1, r1, c20
//  mov         oD1, r9;
//  mov         oD1, r8.xzyw;