See what's going on with flipcode!




This section of the archives stores flipcode's complete Developer Toolbox collection, featuring a variety of mini-articles and source code contributions from our readers.

 

  Perlin Noise Functions (SIMD)
  Submitted by



Perlin Noise2 and Noise3 functions Optimised for Intel SIMD (PIII CeleronII) Instruction Set.

SIMD versions process 4 input vectors at once.

SIMD versions run more than twice as fast :)

This is my first attempt at using SIMD so constructive comments welcome - any one from Intel reading ?

Please excuse the gross use of prefetches! I just threw them in there (they ARE needed)!

I'm SURE these are sub-optimal.

Compiled with Intel's excellent Performance Compiler V5.0 30 day trial version from Intel's Website (boy does this produce fast code even with plain, vanilla C!!)

bye

Rob James

Download Associated File: simdtest.c (22,808 bytes)

/********************************************************
Perlin Noise2 and Noise3 functions Optimised for 
Intel SIMD (PIII CeleronII) Instruction Set.

SIMD versions process 4 input vectors at once.

SIMD versions run more than twice as fast :)

This is my first attempt at using SIMD so constructive comments welcome - any one from Intel reading ?

Please excuse the gross use of prefetches! I just threw them in there (they ARE needed)!

I'm SURE these are sub-optimal.

Compiled with Intel's excellent Performance Compiler V5.0 30 day trial version from Intel's Website (Boy does this produce fast with vanilla C!!)

Rob James

email robjames@europe.com home http://www.pocketmoon.com April 2001

******************************************************/


#include <stdio.h> #include <stdlib.h> #include <time.h> #include <math.h> #include <mathf.h> #include <xmmintrin.h> // intel simd header //*** Note use of floats *** //SIMD is basically 4 floats at once (or 2 doubles...) typedef struct {float x; float y; float z;} Vector;

// Standard Perlin 'optimised' noise functions void Noise2 (Vector vec1,float *result); void Noise3 (Vector vec1,float *result);

// Intel SIMD (PIII or CeleronII) optimised versions void Noise24 (Vector vec1,Vector vec2,Vector vec3,Vector vec4,float *result); void Noise34 (Vector vec1,Vector vec2,Vector vec3,Vector vec4,float *result);

//to initialise the 'noise' lookup arrays void PInitNoise();

//test functions for profiling void tnoise2(); void tnoise3();

//from standard Noise functions #define B 0x100 #define BM 0xff //#define N 0x100000 #define N ((float)1048576.0) #define NP 12 /* 2^N */ #define NM 0xfff int p[B + B + 2];

//alignment required for SIMD intrinsics to work _declspec (align(32)) float g2[B + B + 2][2];//CHANGED FROM DOUBLE TO FLOAT and 3 to 2// _declspec (align(32)) float g3[B + B + 2][3];//CHANGED FROM DOUBLE TO FLOAT and 3 to 2//

/* cubic spline interpolation */ #define s_curve(t) ( t * t * (3. - 2. * t) )

/* linear interpolation */ #define lerp(t, a, b) ( a + t * (b - a) )

// For noise3 #define at3(rx,ry,rz) ( rx * q[0] + ry * q[1] + rz * q[2] )

//for noise2 #define at2(rx,ry) ( rx * q[0] + ry * q[1] )

#define setup(u,b0,b1,r0,r1)\ t = u + N;\ b0 = ((int)t) & BM;\ b1 = (b0+1) & BM;\ r0 = t - (int)t;\ r1 = r0 - 1.;



// anyone who's used Procedural noise should recognise these two functiomns ... void Noise3(Vector vec, float*result) { int bx0, bx1, by0, by1, bz0, bz1, b00, b10, b01, b11; float rx0, rx1, ry0, ry1, rz0, rz1, *q, sx, sy, sz; float a, b, c, d, u, v,t; int i, j;

setup(vec.x, bx0,bx1, rx0,rx1); setup(vec.y, by0,by1, ry0,ry1); setup(vec.z, bz0,bz1, rz0,rz1);

i = p[ bx0 ]; j = p[ bx1 ];

b00 = p[ i + by0 ]; b10 = p[ j + by0 ]; b01 = p[ i + by1 ]; b11 = p[ j + by1 ];

sx = s_curve(rx0); sy = s_curve(ry0); sz = s_curve(rz0);

q = g3[ b00 + bz0 ]; u = at3(rx0,ry0,rz0); q = g3[ b10 + bz0 ]; v = at3(rx1,ry0,rz0); a = lerp(sx, u, v);

q = g3[ b01 + bz0 ]; u = at3(rx0,ry1,rz0); q = g3[ b11 + bz0 ]; v = at3(rx1,ry1,rz0); b = lerp(sx, u, v);

c = lerp(sy, a, b); /* interpolate in y at lo z */

q = g3[ b00 + bz1 ]; u = at3(rx0,ry0,rz1); q = g3[ b10 + bz1 ]; v = at3(rx1,ry0,rz1); a = lerp(sx, u, v);

q = g3[ b01 + bz1 ]; u = at3(rx0,ry1,rz1); q = g3[ b11 + bz1 ]; v = at3(rx1,ry1,rz1); b = lerp(sx, u, v);

d = lerp(sy, a, b); /* interpolate in y at hi z */ *result = 1.5 * lerp(sz, c, d); /* interpolate in z */

}

void Noise2(Vector vec, float *result) { int bx0, bx1, by0, by1, b00, b10, b01, b11; float rx0, rx1, ry0, ry1, *q, sx, sy, a, b, u, v,t; int i, j;

setup(vec.x, bx0,bx1, rx0,rx1); setup(vec.y, by0,by1, ry0,ry1);

i = p[ bx0 ]; j = p[ bx1 ];

b00 = p[ i + by0 ]; b10 = p[ j + by0 ]; b01 = p[ i + by1 ]; b11 = p[ j + by1 ];

sx = s_curve(rx0); sy = s_curve(ry0);

q = g2[ b00 ]; /* get random gradient */ u = at2(rx0,ry0); /* get weight on lo x side (lo y) */ q = g2[ b10 ]; v = at2(rx1,ry0); /* get weight on hi x side (lo y) */ a = lerp(sx, u, v); /* get value at distance sx between u & v */

/* similarly at hi y... */ q = g2[ b01 ]; u = at2(rx0,ry1); q = g2[ b11 ]; v = at2(rx1,ry1); b = lerp(sx, u, v);

*result = 1.5 * lerp(sy, a, b); /* interpolate in y */ }

// Now my 4-at-a-time functions using Intels SIMD intrinsics... void Noise24( Vector vec1, Vector vec2, Vector vec3, Vector vec4, float *result)/* MUST BE ALIGN(16)*/ {

int b000, b001, b002, b003, b100, b101, b102, b103, b010, b011, b012, b013, b110, b111, b112, b113;

int bx00, bx01, bx02, bx03, bx10, bx11, bx12, bx13, by00, by01, by02, by03, by10, by11, by12, by13; float tx0,tx1,tx2,tx3, ty0,ty1,ty2,ty3;

// first sight of an intrinsic! __m128 rx0, rx1, ry0, ry1,tmp,tmp2, q0,q1; __m128 sx, sy, a, b, u, v , two, three;

int i0, i1, i2, i3, j1,j2,j3,j0;

// put (vec.x + 0x100000) into a temp var ?? // ISSUE : bx0 etc are ints converted from floats // this is possible using __m64 _mm_cvtps_pi16( __m128 a) (composite) // but would entail using MMX (swich penalty) hmmm... // // When mixing MMX and SIMD we need to do a mm_empty - this could be slow // maybe not needed when using convert ??? // #define MM 1048576.0

int itx1, itx2, itx3, itx0, ity1, ity2, ity3, ity0;

/* __m64 _mm_cvtps_pi16( __m128 a) (composite) Convert the four single precision FP values in a to four signed 16-bit integer values. r0 := (short)a0 r1 := (short)a1 r2 := (short)a2 r3 := (short)a3 */

tx0 = (vec1.x + MM); tx1 = (vec2.x + MM) ; tx2 = (vec3.x + MM) ; tx3 = (vec4.x + MM) ;

itx0 = (int)(tx0); itx1 = (int)(tx1); itx2 = (int)(tx2); itx3 = (int)(tx3);

bx00 = itx0 & 0xff; bx10 = (bx00+1) & 0xff;

bx01 = itx1 & 0xff; bx11 = (bx01+1) & 0xff;

bx02 = itx2 & 0xff; bx12 = (bx02+1) & 0xff; bx03 = itx3 & 0xff; bx13 = (bx03+1) & 0xff;

ty0 = (vec1.y + MM) ; ity0 = (int)(ty0);//-0.5 by00 = ity0 & 0xff; by10 = (by00+1) & 0xff; ty1 = (vec2.y + MM) ; ity1 = (int)(ty1);//-.5 by01 = ity1 & 0xff; by11 = (by01+1) & 0xff;

ty2 = (vec3.y + MM) ; ity2 = (int)(ty2);//-0.5 by02 = ity2 & 0xff; by12 = (by02+1) & 0xff;

ty3 = (vec4.y + MM) ; ity3 = (int)(ty3);//-0.5 by03 = ity3 & 0xff; by13 = (by03+1) & 0xff;

// could user post incrementing pointers here... i0 = p[ bx00 ]; j0 = p[ bx10 ]; i1 = p[ bx01 ]; j1 = p[ bx11 ]; i2 = p[ bx02 ]; j2 = p[ bx12 ]; i3 = p[ bx03 ]; j3 = p[ bx13 ];

b000 = p[ i0 + by00 ]; b010 = p[ i0 + by10 ]; b100 = p[ j0 + by00 ]; b110 = p[ j0 + by10 ]; b001 = p[ i1 + by01 ]; b011 = p[ i1 + by11 ]; b101 = p[ j1 + by01 ]; b111 = p[ j1 + by11 ]; b002 = p[ i2 + by02 ]; b012 = p[ i2 + by12 ]; b102 = p[ j2 + by02 ]; b112 = p[ j2 + by12 ];

b003 = p[ i3 + by03 ]; b013 = p[ i3 + by13 ]; b103 = p[ j3 + by03 ]; b113 = p[ j3 + by13 ];

//!! try using FAST floattoint ... - tried it, slower :) rx0 = _mm_set_ps( tx0 - itx0, tx1 - itx1, tx2 - itx2, tx3 - itx3);

// may be faster as another mm_set_ps with -1 at the end of each term tmp = _mm_set_ps1(1.0 ) ; rx1 = _mm_sub_ps(rx0, tmp); ry0 = _mm_set_ps( ty0 - ity0, ty1 - ity1, ty2 - ity2, ty3 - ity3);

ry1 = _mm_sub_ps (ry0 ,tmp);

//Now, these prefetches make a HUGE difference. //Without them the whole function is memory bound and no faster //than the original functions // //I have applied no real science in placement of these prefetches //I just chuck them in a few statements before I need them! _mm_prefetch((char*)&(g2[b000]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b001]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b002]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b003]),_MM_HINT_NTA);

//you will now see original code as comments followed by simd equivalent... // sx = rx0 * rx0 * (3.0 - 2.0 * rx0); two = _mm_set_ps1 (2.0) ; // Might be good to PRESTORE the constant arrays 1.0, 2.0 etc sx = _mm_mul_ps (rx0,two); three = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sx); tmp = _mm_mul_ps (tmp2,rx0); sx = _mm_mul_ps (tmp, rx0);

_mm_prefetch((char*)&(g2[b100]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b101]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b102]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b103]),_MM_HINT_NTA);

//sy = ry0 * ry0 * (3.0 - 2.0 * ry0); //tmp = _mm_set_ps1 (2.0) ; sy = _mm_mul_ps (ry0,two);

//tmp = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sy); tmp = _mm_mul_ps (tmp2,ry0); sy = _mm_mul_ps (tmp, ry0);

_mm_prefetch((char*)&(g2[b010]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b011]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b012]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b013]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b110]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b111]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b112]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b113]),_MM_HINT_NTA);

// q = g2[ b00 ]; /* get random gradient */ // u = rx0 * q[0] + ry0 * q[1]; q0 = _mm_set_ps(g2[b000][0], g2[b001][0], g2[b002][0], g2[b003][0] ); q1 = _mm_set_ps(g2[b000][1], g2[b001][1], g2[b002][1], g2[b003][1] ); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx0); u = _mm_add_ps(tmp, tmp2);

// q = g2[ b10 ]; // v = rx1 * q[0] + ry0 * q[1] ; q0 = _mm_set_ps(g2[b100][0], g2[b101][0], g2[b102][0], g2[b103][0] ); q1 = _mm_set_ps(g2[b100][1], g2[b101][1], g2[b102][1], g2[b103][1] ); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx1); v = _mm_add_ps(tmp, tmp2);

// a = u + sx * (v-u); /* get value at distance sx between u & v */ tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); a = _mm_add_ps (u,tmp2);

// q = g2[ b01 ]; // u = rx0*q[0] + ry1 * q[1]; q0 = _mm_set_ps(g2[b010][0], g2[b011][0], g2[b012][0], g2[b013][0] ); q1 = _mm_set_ps(g2[b010][1], g2[b011][1], g2[b012][1], g2[b013][1] ); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx0); u = _mm_add_ps(tmp, tmp2);

// q = g2[ b11 ]; // v = rx1 * q[0] + ry1 * q[1]; q0 = _mm_set_ps(g2[b110][0], g2[b111][0], g2[b112][0], g2[b113][0] ); q1 = _mm_set_ps(g2[b110][1], g2[b111][1], g2[b112][1], g2[b113][1] ); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx1); v = _mm_add_ps(tmp, tmp2);

// b = u + sx * (v-u); tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); b = _mm_add_ps (u,tmp2);

// result = 1.5 * (a + sy * (b-a)) /* interpolate in y */ why 1.5 ?? tmp = _mm_sub_ps(b,a); tmp2 = _mm_mul_ps(tmp,sy); tmp = _mm_add_ps(tmp2,a); tmp2 = _mm_set_ps1(1.5); a = _mm_mul_ps(tmp2,tmp);

_mm_store_ps(result, a ); // aligned

//phew!! // As you see doing things 4-at-a-time does take some getting used to! }

void Noise34(Vector vec1, Vector vec2, Vector vec3, Vector vec4, float *result)/* MUST BE ALIGN(16)*/ {

// same as Noise24 but with the extra Z component included... int b000, b001, b002, b003, b100, b101, b102, b103, b010, b011, b012, b013, b110, b111, b112, b113;

int bx00, bx01, bx02, bx03, bx10, bx11, bx12, bx13, by00, by01, by02, by03, by10, by11, by12, by13, bz00, bz01, bz02, bz03, bz10, bz11, bz12, bz13; float tx0,tx1,tx2,tx3, ty0,ty1,ty2,ty3, tz0,tz1,tz2,tz3;

__m128 rx0, rx1, ry0, ry1, rz0, rz1, tmp,tmp2,tmp3,tmp4, q0,q1,q2; __m128 sx, sy, sz, a, b,c ,d, u, v , two, three;

int i0, i1, i2, i3, j1,j2,j3,j0;

// put (vec.x + 0x100000) into a temp var ?? // ISSUE : bx0 etc are ints converted from floats // this is possible using __m64 _mm_cvtps_pi16( __m128 a) (composite) // but would entail using MMX hmmm // // When mixing MMX and SIMD we need to do a mm_empty - this could be slow // maybe not needed when using convert ??? // #define MM 1048576.0

int itx1, itx2, itx3, itx0, ity1, ity2, ity3, ity0, itz1, itz2, itz3, itz0;

/* __m64 _mm_cvtps_pi16( __m128 a) (composite) Convert the four single precision FP values in a to four signed 16-bit integer values. r0 := (short)a0 r1 := (short)a1 r2 := (short)a2 r3 := (short)a3 */

tx0 = (vec1.x + MM); tx1 = (vec2.x + MM) ; tx2 = (vec3.x + MM) ; tx3 = (vec4.x + MM) ;

itx0 = (int)(tx0); itx1 = (int)(tx1); itx2 = (int)(tx2); itx3 = (int)(tx3);

bx00 = itx0 & 0xff; bx10 = (bx00+1) & 0xff;

bx01 = itx1 & 0xff; bx11 = (bx01+1) & 0xff;

bx02 = itx2 & 0xff; bx12 = (bx02+1) & 0xff; bx03 = itx3 & 0xff; bx13 = (bx03+1) & 0xff;



ty0 = (vec1.y + MM) ; ity0 = (int)(ty0); by00 = ity0 & 0xff; by10 = (by00+1) & 0xff; ty1 = (vec2.y + MM) ; ity1 = (int)(ty1); by01 = ity1 & 0xff; by11 = (by01+1) & 0xff;

ty2 = (vec3.y + MM) ; ity2 = (int)(ty2); by02 = ity2 & 0xff; by12 = (by02+1) & 0xff;

ty3 = (vec4.y + MM) ; ity3 = (int)(ty3); by03 = ity3 & 0xff; by13 = (by03+1) & 0xff;

// ZED tz0 = (vec1.z + MM) ; itz0 = (int)(tz0); bz00 = itz0 & 0xff; bz10 = (bz00+1) & 0xff; tz1 = (vec2.z + MM) ; itz1 = (int)(tz1); bz01 = itz1 & 0xff; bz11 = (bz01+1) & 0xff;

tz2 = (vec3.z + MM) ; itz2 = (int)(tz2); bz02 = itz2 & 0xff; bz12 = (bz02+1) & 0xff;

tz3 = (vec4.z + MM) ; itz3 = (int)(tz3); bz03 = itz3 & 0xff; bz13 = (bz03+1) & 0xff;



// could user post incrementing pointers here... i0 = p[ bx00 ]; j0 = p[ bx10 ]; i1 = p[ bx01 ]; j1 = p[ bx11 ]; i2 = p[ bx02 ]; j2 = p[ bx12 ]; i3 = p[ bx03 ]; j3 = p[ bx13 ];

b000 = p[ i0 + by00 ]; b010 = p[ i0 + by10 ]; b100 = p[ j0 + by00 ]; b110 = p[ j0 + by10 ]; b001 = p[ i1 + by01 ]; b011 = p[ i1 + by11 ]; b101 = p[ j1 + by01 ]; b111 = p[ j1 + by11 ]; b002 = p[ i2 + by02 ]; b012 = p[ i2 + by12 ]; b102 = p[ j2 + by02 ]; b112 = p[ j2 + by12 ];

b003 = p[ i3 + by03 ]; b013 = p[ i3 + by13 ]; b103 = p[ j3 + by03 ]; b113 = p[ j3 + by13 ];

// may be faster as another mm_set_ps with -1 at the end of each term tmp = _mm_set_ps1(1.0) ;

//!! try using FAST floattoint ...??? rx0 = _mm_set_ps(tx0 - itx0, tx1 - itx1, tx2 - itx2, tx3 - itx3); rx1 = _mm_sub_ps(rx0, tmp);

ry0 = _mm_set_ps(ty0 - ity0, ty1 - ity1, ty2 - ity2, ty3 - ity3); ry1 = _mm_sub_ps (ry0 ,tmp);

rz0 = _mm_set_ps( tz0 - itz0, tz1 - itz1, tz2 - itz2, tz3 - itz3); rz1 = _mm_sub_ps (rz0 ,tmp);

_mm_prefetch((char*)&(g3[b000+bz00]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b001+bz01]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b002+bz02]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b003+bz03]),_MM_HINT_NTA);

// sx = rx0 * rx0 * (3.0 - 2.0 * rx0); two = _mm_set_ps1 (2.0) ; // Might be good to PRESTORE the constant arrays 1.0, 2.0 etc sx = _mm_mul_ps (rx0,two); three = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sx); tmp = _mm_mul_ps (tmp2,rx0); sx = _mm_mul_ps (tmp, rx0);

_mm_prefetch((char*)&(g3[b100+bz00]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b101+bz01]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b102+bz02]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b103+bz03]),_MM_HINT_NTA);

//sy = ry0 * ry0 * (3.0 - 2.0 * ry0); //tmp = _mm_set_ps1 (2.0) ; sy = _mm_mul_ps (ry0,two);

//tmp = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sy); tmp = _mm_mul_ps (tmp2,ry0); sy = _mm_mul_ps (tmp, ry0);

_mm_prefetch((char*)&(g3[b010+bz00]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b011+bz01]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b012+bz02]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b013+bz03]),_MM_HINT_NTA);

//sz = rz0 * rz0 * (3.0 - 2.0 * rz0); //tmp = _mm_set_ps1 (2.0) ; sz = _mm_mul_ps (rz0,two); //tmp = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sz); tmp = _mm_mul_ps (tmp2,rz0); sz = _mm_mul_ps (tmp, rz0);

_mm_prefetch((char*)&(g3[b110+bz00]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b111+bz01]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b112+bz02]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b113+bz03]),_MM_HINT_NTA);

// q = g3[ b00 ]; /* get random gradient */ // u = rx0 * q[0] + ry0 * q[1] + rz0 * q[2]; q0 = _mm_set_ps(g3[b000+bz00][0], g3[b001+bz01][0], g3[b002+bz02][0], g3[b003+bz03][0] ); q1 = _mm_set_ps(g3[b000+bz00][1], g3[b001+bz01][1], g3[b002+bz02][1], g3[b003+bz03][1] ); q2 = _mm_set_ps(g3[b000+bz00][2], g3[b001+bz01][2], g3[b002+bz02][2], g3[b003+bz03][2] );

tmp3 = _mm_mul_ps(q2, rz0); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx0); tmp4 = _mm_add_ps(tmp, tmp2); u = _mm_add_ps(tmp4, tmp3);

_mm_prefetch((char*)&(g3[b000+bz10]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b001+bz11]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b002+bz12]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b003+bz13]),_MM_HINT_NTA); // q = g3[ b10 + bz0 ]; // v = rx1 * q[0] + ry0 * q[1] ; q0 = _mm_set_ps(g3[b100+bz00][0], g3[b101+bz01][0], g3[b102+bz02][0], g3[b103+bz03][0] ); q1 = _mm_set_ps(g3[b100+bz00][1], g3[b101+bz01][1], g3[b102+bz02][1], g3[b103+bz03][1] ); q2 = _mm_set_ps(g3[b100+bz00][2], g3[b101+bz01][2], g3[b102+bz02][2], g3[b103+bz03][2] );

tmp3 = _mm_mul_ps(q2, rz0); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx1); tmp4 = _mm_add_ps(tmp, tmp2); v = _mm_add_ps(tmp4, tmp3);



// a = u + sx * (v-u); /* get value at distance sx between u & v */ tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); a = _mm_add_ps (u,tmp2);

_mm_prefetch((char*)&(g3[b100+bz10]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b101+bz11]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b102+bz12]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b103+bz13]),_MM_HINT_NTA); // q = g3[ b01 ]; // u = rx0*q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b010+bz00][0], g3[b011+bz01][0], g3[b012+bz02][0], g3[b013+bz03][0] ); q1 = _mm_set_ps(g3[b010+bz00][1], g3[b011+bz01][1], g3[b012+bz02][1], g3[b013+bz03][1] ); q2 = _mm_set_ps(g3[b010+bz00][2], g3[b011+bz01][2], g3[b012+bz02][2], g3[b013+bz03][2] );

tmp3 = _mm_mul_ps(q2, rz0); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx0); tmp4 = _mm_add_ps(tmp, tmp2); u = _mm_add_ps(tmp4, tmp3);

_mm_prefetch((char*)&(g3[b010+bz10]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b011+bz11]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b012+bz12]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b013+bz13]),_MM_HINT_NTA); // q = g3[ b11 ]; // v = rx1 * q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b110+bz00][0], g3[b111+bz01][0], g3[b112+bz02][0], g3[b113+bz03][0] ); q1 = _mm_set_ps(g3[b110+bz00][1], g3[b111+bz01][1], g3[b112+bz02][1], g3[b113+bz03][1] ); q2 = _mm_set_ps(g3[b110+bz00][2], g3[b111+bz01][2], g3[b112+bz02][2], g3[b113+bz03][2] );

tmp3 = _mm_mul_ps(q2, rz0); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx1); tmp4 = _mm_add_ps(tmp, tmp2); v = _mm_add_ps(tmp4, tmp3);



// b = u + sx * (v-u); tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); b = _mm_add_ps (u,tmp2);

// c = a + sy * (a-b); tmp = _mm_sub_ps (b,a); tmp2 = _mm_mul_ps (tmp,sy); c = _mm_add_ps (a,tmp2);

/*NEW */

_mm_prefetch((char*)&(g3[b110+bz10]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b111+bz11]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b112+bz12]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b113+bz13]),_MM_HINT_NTA);

// q = g3[ b00 + bz1 ]; // u = rx0*q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b000+bz10][0], g3[b001+bz11][0], g3[b002+bz12][0], g3[b003+bz13][0] ); q1 = _mm_set_ps(g3[b000+bz10][1], g3[b001+bz11][1], g3[b002+bz12][1], g3[b003+bz13][1] ); q2 = _mm_set_ps(g3[b000+bz10][2], g3[b001+bz11][2], g3[b002+bz12][2], g3[b003+bz13][2] );

tmp3 = _mm_mul_ps(q2, rz1); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx0); tmp4 = _mm_add_ps(tmp, tmp2); u = _mm_add_ps(tmp4, tmp3);

// q = g3[ b10 + bz1 ]; // v = rx1 * q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b100+bz10][0], g3[b101+bz11][0], g3[b102+bz12][0], g3[b103+bz13][0] ); q1 = _mm_set_ps(g3[b100+bz10][1], g3[b101+bz11][1], g3[b102+bz12][1], g3[b103+bz13][1] ); q2 = _mm_set_ps(g3[b100+bz10][2], g3[b101+bz11][2], g3[b102+bz12][2], g3[b103+bz13][2] );

tmp3 = _mm_mul_ps(q2, rz1); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx1); tmp4 = _mm_add_ps(tmp, tmp2); v = _mm_add_ps(tmp4, tmp3);

// a = u + sx * (v-u); tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); a = _mm_add_ps (u,tmp2);



// q = g3[ b01 + bz1 ]; // v = rx1 * q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b010+bz10][0], g3[b011+bz11][0], g3[b012+bz12][0], g3[b013+bz13][0] ); q1 = _mm_set_ps(g3[b010+bz10][1], g3[b011+bz11][1], g3[b012+bz12][1], g3[b013+bz13][1] ); q2 = _mm_set_ps(g3[b010+bz10][2], g3[b011+bz11][2], g3[b012+bz12][2], g3[b013+bz13][2] );

tmp3 = _mm_mul_ps(q2, rz1); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx0); tmp4 = _mm_add_ps(tmp, tmp2); u = _mm_add_ps(tmp4, tmp3);

q0 = _mm_set_ps(g3[b110+bz10][0], g3[b111+bz11][0], g3[b112+bz12][0], g3[b113+bz13][0] ); q1 = _mm_set_ps(g3[b110+bz10][1], g3[b111+bz11][1], g3[b112+bz12][1], g3[b113+bz13][1] ); q2 = _mm_set_ps(g3[b110+bz10][2], g3[b111+bz11][2], g3[b112+bz12][2], g3[b113+bz13][2] );

tmp3 = _mm_mul_ps(q2, rz1); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx1); tmp4 = _mm_add_ps(tmp, tmp2); v = _mm_add_ps(tmp4, tmp3);

tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); b = _mm_add_ps (u,tmp2);

tmp = _mm_sub_ps (b,a); tmp2 = _mm_mul_ps (tmp,sy); d = _mm_add_ps (a,tmp2);

// result = 1.5 * (c + sz * (d-c)) /* interpolate in y */ why 1.5 ?? tmp = _mm_sub_ps(d,c); tmp2 = _mm_mul_ps(tmp,sz); tmp = _mm_add_ps(tmp2,c); tmp2 = _mm_set_ps1(1.5); a = _mm_mul_ps(tmp2,tmp);

_mm_store_ps(result, a ); // aligned }





#include "noisetab.c" void PInitNoise() { int i;

//set up our two random number lookup tables //one for 2d noise the other for 3d... for (i= 0; i < B+B+2; ++i) { p[i]= p_precomputed[i]; g3[i][0] = g2[i][0] = g_precomputed[i][0]; g3[i][1] = g2[i][1] = g_precomputed[i][1]; g3[i][2] = g_precomputed[i][2]; } }

void tnoise2 () {

int x; Vector v;

// some aligned storage for test reults _declspec (align(16)) float a[1000] ; _declspec (align(16)) float b[1000] ;

v.x = (float)rand()/10000.0; v.y = (float)rand()/10000.0; v.z = (float)rand()/10000.0;

//printf("%f %f %f :",v.x,v.y,v.z); for (x=0;x<1000;x++) Noise2(v, &(a[x]));

for (x=0;x<1000;x+=4) Noise24(v,v,v,v, &(b[x]));

//printf("%f %f %f \n",a[0]-b[0],a[456]-b[456],a[999]-b[999]); }



void tnoise3 () {

_declspec (align(16)) float a[1000] ; _declspec (align(16)) float b[1000] ;

int x; Vector v;

v.x = (float)rand()/10000.0; v.y = (float)rand()/10000.0; v.z = (float)rand()/10000.0;

// printf("%f %f %f\t:",v.x,v.y,v.z); for (x=0;x<1000;x++) Noise3(v, &(a[x]));

for (x=0;x<1000;x+=4) Noise34(v, v, v, v, &(b[x]));

// printf("%f %f %f %f\n",a[0],b[0],a[999],b[999]); }

//Almost there... int main(int argc, char** argv) { int x;

PInitNoise(); for (x=0;x < 1000;x++) { tnoise2(); tnoise3(); } }


The zip file viewer built into the Developer Toolbox made use of the zlib library, as well as the zlibdll source additions.

 

Copyright 1999-2008 (C) FLIPCODE.COM and/or the original content author(s). All rights reserved.
Please read our Terms, Conditions, and Privacy information.