This section of the archives stores flipcode's complete Developer Toolbox collection, featuring a variety of mini-articles and source code contributions from our readers.

 Perlin Noise Functions (SIMD)   Submitted by

Perlin Noise2 and Noise3 functions Optimised for Intel SIMD (PIII CeleronII) Instruction Set.

SIMD versions process 4 input vectors at once.

SIMD versions run more than twice as fast :)

This is my first attempt at using SIMD so constructive comments welcome - any one from Intel reading ?

Please excuse the gross use of prefetches! I just threw them in there (they ARE needed)!

I'm SURE these are sub-optimal.

Compiled with Intel's excellent Performance Compiler V5.0 30 day trial version from Intel's Website (boy does this produce fast code even with plain, vanilla C!!)

bye

Rob James

 ```/******************************************************** Perlin Noise2 and Noise3 functions Optimised for Intel SIMD (PIII CeleronII) Instruction Set.SIMD versions process 4 input vectors at once.SIMD versions run more than twice as fast :)This is my first attempt at using SIMD so constructive comments welcome - any one from Intel reading ? Please excuse the gross use of prefetches! I just threw them in there (they ARE needed)!I'm SURE these are sub-optimal.Compiled with Intel's excellent Performance Compiler V5.0 30 day trial version from Intel's Website (Boy does this produce fast with vanilla C!!)Rob Jamesemail robjames@europe.com home http://www.pocketmoon.com April 2001******************************************************/#include #include #include #include #include #include // intel simd header //*** Note use of floats *** //SIMD is basically 4 floats at once (or 2 doubles...) typedef struct {float x; float y; float z;} Vector; // Standard Perlin 'optimised' noise functions void Noise2 (Vector vec1,float *result); void Noise3 (Vector vec1,float *result); // Intel SIMD (PIII or CeleronII) optimised versions void Noise24 (Vector vec1,Vector vec2,Vector vec3,Vector vec4,float *result); void Noise34 (Vector vec1,Vector vec2,Vector vec3,Vector vec4,float *result);//to initialise the 'noise' lookup arrays void PInitNoise();//test functions for profiling void tnoise2(); void tnoise3(); //from standard Noise functions #define B 0x100 #define BM 0xff //#define N 0x100000 #define N ((float)1048576.0) #define NP 12 /* 2^N */ #define NM 0xfff int p[B + B + 2];//alignment required for SIMD intrinsics to work _declspec (align(32)) float g2[B + B + 2][2];//CHANGED FROM DOUBLE TO FLOAT and 3 to 2// _declspec (align(32)) float g3[B + B + 2][3];//CHANGED FROM DOUBLE TO FLOAT and 3 to 2// /* cubic spline interpolation */ #define s_curve(t) ( t * t * (3. - 2. * t) )/* linear interpolation */ #define lerp(t, a, b) ( a + t * (b - a) )// For noise3 #define at3(rx,ry,rz) ( rx * q[0] + ry * q[1] + rz * q[2] )//for noise2 #define at2(rx,ry) ( rx * q[0] + ry * q[1] )#define setup(u,b0,b1,r0,r1)\ t = u + N;\ b0 = ((int)t) & BM;\ b1 = (b0+1) & BM;\ r0 = t - (int)t;\ r1 = r0 - 1.;// anyone who's used Procedural noise should recognise these two functiomns ... void Noise3(Vector vec, float*result) { int bx0, bx1, by0, by1, bz0, bz1, b00, b10, b01, b11; float rx0, rx1, ry0, ry1, rz0, rz1, *q, sx, sy, sz; float a, b, c, d, u, v,t; int i, j; setup(vec.x, bx0,bx1, rx0,rx1); setup(vec.y, by0,by1, ry0,ry1); setup(vec.z, bz0,bz1, rz0,rz1); i = p[ bx0 ]; j = p[ bx1 ]; b00 = p[ i + by0 ]; b10 = p[ j + by0 ]; b01 = p[ i + by1 ]; b11 = p[ j + by1 ]; sx = s_curve(rx0); sy = s_curve(ry0); sz = s_curve(rz0); q = g3[ b00 + bz0 ]; u = at3(rx0,ry0,rz0); q = g3[ b10 + bz0 ]; v = at3(rx1,ry0,rz0); a = lerp(sx, u, v); q = g3[ b01 + bz0 ]; u = at3(rx0,ry1,rz0); q = g3[ b11 + bz0 ]; v = at3(rx1,ry1,rz0); b = lerp(sx, u, v); c = lerp(sy, a, b); /* interpolate in y at lo z */ q = g3[ b00 + bz1 ]; u = at3(rx0,ry0,rz1); q = g3[ b10 + bz1 ]; v = at3(rx1,ry0,rz1); a = lerp(sx, u, v); q = g3[ b01 + bz1 ]; u = at3(rx0,ry1,rz1); q = g3[ b11 + bz1 ]; v = at3(rx1,ry1,rz1); b = lerp(sx, u, v); d = lerp(sy, a, b); /* interpolate in y at hi z */ *result = 1.5 * lerp(sz, c, d); /* interpolate in z */} void Noise2(Vector vec, float *result) { int bx0, bx1, by0, by1, b00, b10, b01, b11; float rx0, rx1, ry0, ry1, *q, sx, sy, a, b, u, v,t; int i, j; setup(vec.x, bx0,bx1, rx0,rx1); setup(vec.y, by0,by1, ry0,ry1); i = p[ bx0 ]; j = p[ bx1 ]; b00 = p[ i + by0 ]; b10 = p[ j + by0 ]; b01 = p[ i + by1 ]; b11 = p[ j + by1 ]; sx = s_curve(rx0); sy = s_curve(ry0); q = g2[ b00 ]; /* get random gradient */ u = at2(rx0,ry0); /* get weight on lo x side (lo y) */ q = g2[ b10 ]; v = at2(rx1,ry0); /* get weight on hi x side (lo y) */ a = lerp(sx, u, v); /* get value at distance sx between u & v */ /* similarly at hi y... */ q = g2[ b01 ]; u = at2(rx0,ry1); q = g2[ b11 ]; v = at2(rx1,ry1); b = lerp(sx, u, v); *result = 1.5 * lerp(sy, a, b); /* interpolate in y */ } // Now my 4-at-a-time functions using Intels SIMD intrinsics... void Noise24( Vector vec1, Vector vec2, Vector vec3, Vector vec4, float *result)/* MUST BE ALIGN(16)*/ { int b000, b001, b002, b003, b100, b101, b102, b103, b010, b011, b012, b013, b110, b111, b112, b113; int bx00, bx01, bx02, bx03, bx10, bx11, bx12, bx13, by00, by01, by02, by03, by10, by11, by12, by13; float tx0,tx1,tx2,tx3, ty0,ty1,ty2,ty3; // first sight of an intrinsic! __m128 rx0, rx1, ry0, ry1,tmp,tmp2, q0,q1; __m128 sx, sy, a, b, u, v , two, three; int i0, i1, i2, i3, j1,j2,j3,j0; // put (vec.x + 0x100000) into a temp var ?? // ISSUE : bx0 etc are ints converted from floats // this is possible using __m64 _mm_cvtps_pi16( __m128 a) (composite) // but would entail using MMX (swich penalty) hmmm... // // When mixing MMX and SIMD we need to do a mm_empty - this could be slow // maybe not needed when using convert ??? // #define MM 1048576.0 int itx1, itx2, itx3, itx0, ity1, ity2, ity3, ity0;/* __m64 _mm_cvtps_pi16( __m128 a) (composite) Convert the four single precision FP values in a to four signed 16-bit integer values. r0 := (short)a0 r1 := (short)a1 r2 := (short)a2 r3 := (short)a3 */ tx0 = (vec1.x + MM); tx1 = (vec2.x + MM) ; tx2 = (vec3.x + MM) ; tx3 = (vec4.x + MM) ; itx0 = (int)(tx0); itx1 = (int)(tx1); itx2 = (int)(tx2); itx3 = (int)(tx3); bx00 = itx0 & 0xff; bx10 = (bx00+1) & 0xff; bx01 = itx1 & 0xff; bx11 = (bx01+1) & 0xff; bx02 = itx2 & 0xff; bx12 = (bx02+1) & 0xff; bx03 = itx3 & 0xff; bx13 = (bx03+1) & 0xff; ty0 = (vec1.y + MM) ; ity0 = (int)(ty0);//-0.5 by00 = ity0 & 0xff; by10 = (by00+1) & 0xff; ty1 = (vec2.y + MM) ; ity1 = (int)(ty1);//-.5 by01 = ity1 & 0xff; by11 = (by01+1) & 0xff; ty2 = (vec3.y + MM) ; ity2 = (int)(ty2);//-0.5 by02 = ity2 & 0xff; by12 = (by02+1) & 0xff; ty3 = (vec4.y + MM) ; ity3 = (int)(ty3);//-0.5 by03 = ity3 & 0xff; by13 = (by03+1) & 0xff; // could user post incrementing pointers here... i0 = p[ bx00 ]; j0 = p[ bx10 ]; i1 = p[ bx01 ]; j1 = p[ bx11 ]; i2 = p[ bx02 ]; j2 = p[ bx12 ]; i3 = p[ bx03 ]; j3 = p[ bx13 ]; b000 = p[ i0 + by00 ]; b010 = p[ i0 + by10 ]; b100 = p[ j0 + by00 ]; b110 = p[ j0 + by10 ]; b001 = p[ i1 + by01 ]; b011 = p[ i1 + by11 ]; b101 = p[ j1 + by01 ]; b111 = p[ j1 + by11 ]; b002 = p[ i2 + by02 ]; b012 = p[ i2 + by12 ]; b102 = p[ j2 + by02 ]; b112 = p[ j2 + by12 ]; b003 = p[ i3 + by03 ]; b013 = p[ i3 + by13 ]; b103 = p[ j3 + by03 ]; b113 = p[ j3 + by13 ]; //!! try using FAST floattoint ... - tried it, slower :) rx0 = _mm_set_ps( tx0 - itx0, tx1 - itx1, tx2 - itx2, tx3 - itx3); // may be faster as another mm_set_ps with -1 at the end of each term tmp = _mm_set_ps1(1.0 ) ; rx1 = _mm_sub_ps(rx0, tmp); ry0 = _mm_set_ps( ty0 - ity0, ty1 - ity1, ty2 - ity2, ty3 - ity3); ry1 = _mm_sub_ps (ry0 ,tmp); //Now, these prefetches make a HUGE difference. //Without them the whole function is memory bound and no faster //than the original functions // //I have applied no real science in placement of these prefetches //I just chuck them in a few statements before I need them! _mm_prefetch((char*)&(g2[b000]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b001]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b002]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b003]),_MM_HINT_NTA); //you will now see original code as comments followed by simd equivalent... // sx = rx0 * rx0 * (3.0 - 2.0 * rx0); two = _mm_set_ps1 (2.0) ; // Might be good to PRESTORE the constant arrays 1.0, 2.0 etc sx = _mm_mul_ps (rx0,two); three = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sx); tmp = _mm_mul_ps (tmp2,rx0); sx = _mm_mul_ps (tmp, rx0); _mm_prefetch((char*)&(g2[b100]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b101]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b102]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b103]),_MM_HINT_NTA); //sy = ry0 * ry0 * (3.0 - 2.0 * ry0); //tmp = _mm_set_ps1 (2.0) ; sy = _mm_mul_ps (ry0,two); //tmp = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sy); tmp = _mm_mul_ps (tmp2,ry0); sy = _mm_mul_ps (tmp, ry0); _mm_prefetch((char*)&(g2[b010]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b011]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b012]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b013]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b110]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b111]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b112]),_MM_HINT_NTA); _mm_prefetch((char*)&(g2[b113]),_MM_HINT_NTA); // q = g2[ b00 ]; /* get random gradient */ // u = rx0 * q[0] + ry0 * q[1]; q0 = _mm_set_ps(g2[b000][0], g2[b001][0], g2[b002][0], g2[b003][0] ); q1 = _mm_set_ps(g2[b000][1], g2[b001][1], g2[b002][1], g2[b003][1] ); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx0); u = _mm_add_ps(tmp, tmp2); // q = g2[ b10 ]; // v = rx1 * q[0] + ry0 * q[1] ; q0 = _mm_set_ps(g2[b100][0], g2[b101][0], g2[b102][0], g2[b103][0] ); q1 = _mm_set_ps(g2[b100][1], g2[b101][1], g2[b102][1], g2[b103][1] ); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx1); v = _mm_add_ps(tmp, tmp2); // a = u + sx * (v-u); /* get value at distance sx between u & v */ tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); a = _mm_add_ps (u,tmp2); // q = g2[ b01 ]; // u = rx0*q[0] + ry1 * q[1]; q0 = _mm_set_ps(g2[b010][0], g2[b011][0], g2[b012][0], g2[b013][0] ); q1 = _mm_set_ps(g2[b010][1], g2[b011][1], g2[b012][1], g2[b013][1] ); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx0); u = _mm_add_ps(tmp, tmp2); // q = g2[ b11 ]; // v = rx1 * q[0] + ry1 * q[1]; q0 = _mm_set_ps(g2[b110][0], g2[b111][0], g2[b112][0], g2[b113][0] ); q1 = _mm_set_ps(g2[b110][1], g2[b111][1], g2[b112][1], g2[b113][1] ); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx1); v = _mm_add_ps(tmp, tmp2); // b = u + sx * (v-u); tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); b = _mm_add_ps (u,tmp2);// result = 1.5 * (a + sy * (b-a)) /* interpolate in y */ why 1.5 ?? tmp = _mm_sub_ps(b,a); tmp2 = _mm_mul_ps(tmp,sy); tmp = _mm_add_ps(tmp2,a); tmp2 = _mm_set_ps1(1.5); a = _mm_mul_ps(tmp2,tmp); _mm_store_ps(result, a ); // aligned //phew!! // As you see doing things 4-at-a-time does take some getting used to! } void Noise34(Vector vec1, Vector vec2, Vector vec3, Vector vec4, float *result)/* MUST BE ALIGN(16)*/ { // same as Noise24 but with the extra Z component included... int b000, b001, b002, b003, b100, b101, b102, b103, b010, b011, b012, b013, b110, b111, b112, b113; int bx00, bx01, bx02, bx03, bx10, bx11, bx12, bx13, by00, by01, by02, by03, by10, by11, by12, by13, bz00, bz01, bz02, bz03, bz10, bz11, bz12, bz13; float tx0,tx1,tx2,tx3, ty0,ty1,ty2,ty3, tz0,tz1,tz2,tz3; __m128 rx0, rx1, ry0, ry1, rz0, rz1, tmp,tmp2,tmp3,tmp4, q0,q1,q2; __m128 sx, sy, sz, a, b,c ,d, u, v , two, three; int i0, i1, i2, i3, j1,j2,j3,j0; // put (vec.x + 0x100000) into a temp var ?? // ISSUE : bx0 etc are ints converted from floats // this is possible using __m64 _mm_cvtps_pi16( __m128 a) (composite) // but would entail using MMX hmmm // // When mixing MMX and SIMD we need to do a mm_empty - this could be slow // maybe not needed when using convert ??? // #define MM 1048576.0 int itx1, itx2, itx3, itx0, ity1, ity2, ity3, ity0, itz1, itz2, itz3, itz0;/* __m64 _mm_cvtps_pi16( __m128 a) (composite) Convert the four single precision FP values in a to four signed 16-bit integer values. r0 := (short)a0 r1 := (short)a1 r2 := (short)a2 r3 := (short)a3 */ tx0 = (vec1.x + MM); tx1 = (vec2.x + MM) ; tx2 = (vec3.x + MM) ; tx3 = (vec4.x + MM) ; itx0 = (int)(tx0); itx1 = (int)(tx1); itx2 = (int)(tx2); itx3 = (int)(tx3); bx00 = itx0 & 0xff; bx10 = (bx00+1) & 0xff; bx01 = itx1 & 0xff; bx11 = (bx01+1) & 0xff; bx02 = itx2 & 0xff; bx12 = (bx02+1) & 0xff; bx03 = itx3 & 0xff; bx13 = (bx03+1) & 0xff; ty0 = (vec1.y + MM) ; ity0 = (int)(ty0); by00 = ity0 & 0xff; by10 = (by00+1) & 0xff; ty1 = (vec2.y + MM) ; ity1 = (int)(ty1); by01 = ity1 & 0xff; by11 = (by01+1) & 0xff; ty2 = (vec3.y + MM) ; ity2 = (int)(ty2); by02 = ity2 & 0xff; by12 = (by02+1) & 0xff; ty3 = (vec4.y + MM) ; ity3 = (int)(ty3); by03 = ity3 & 0xff; by13 = (by03+1) & 0xff; // ZED tz0 = (vec1.z + MM) ; itz0 = (int)(tz0); bz00 = itz0 & 0xff; bz10 = (bz00+1) & 0xff; tz1 = (vec2.z + MM) ; itz1 = (int)(tz1); bz01 = itz1 & 0xff; bz11 = (bz01+1) & 0xff; tz2 = (vec3.z + MM) ; itz2 = (int)(tz2); bz02 = itz2 & 0xff; bz12 = (bz02+1) & 0xff; tz3 = (vec4.z + MM) ; itz3 = (int)(tz3); bz03 = itz3 & 0xff; bz13 = (bz03+1) & 0xff; // could user post incrementing pointers here... i0 = p[ bx00 ]; j0 = p[ bx10 ]; i1 = p[ bx01 ]; j1 = p[ bx11 ]; i2 = p[ bx02 ]; j2 = p[ bx12 ]; i3 = p[ bx03 ]; j3 = p[ bx13 ]; b000 = p[ i0 + by00 ]; b010 = p[ i0 + by10 ]; b100 = p[ j0 + by00 ]; b110 = p[ j0 + by10 ]; b001 = p[ i1 + by01 ]; b011 = p[ i1 + by11 ]; b101 = p[ j1 + by01 ]; b111 = p[ j1 + by11 ]; b002 = p[ i2 + by02 ]; b012 = p[ i2 + by12 ]; b102 = p[ j2 + by02 ]; b112 = p[ j2 + by12 ]; b003 = p[ i3 + by03 ]; b013 = p[ i3 + by13 ]; b103 = p[ j3 + by03 ]; b113 = p[ j3 + by13 ]; // may be faster as another mm_set_ps with -1 at the end of each term tmp = _mm_set_ps1(1.0) ; //!! try using FAST floattoint ...??? rx0 = _mm_set_ps(tx0 - itx0, tx1 - itx1, tx2 - itx2, tx3 - itx3); rx1 = _mm_sub_ps(rx0, tmp); ry0 = _mm_set_ps(ty0 - ity0, ty1 - ity1, ty2 - ity2, ty3 - ity3); ry1 = _mm_sub_ps (ry0 ,tmp); rz0 = _mm_set_ps( tz0 - itz0, tz1 - itz1, tz2 - itz2, tz3 - itz3); rz1 = _mm_sub_ps (rz0 ,tmp); _mm_prefetch((char*)&(g3[b000+bz00]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b001+bz01]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b002+bz02]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b003+bz03]),_MM_HINT_NTA); // sx = rx0 * rx0 * (3.0 - 2.0 * rx0); two = _mm_set_ps1 (2.0) ; // Might be good to PRESTORE the constant arrays 1.0, 2.0 etc sx = _mm_mul_ps (rx0,two); three = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sx); tmp = _mm_mul_ps (tmp2,rx0); sx = _mm_mul_ps (tmp, rx0); _mm_prefetch((char*)&(g3[b100+bz00]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b101+bz01]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b102+bz02]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b103+bz03]),_MM_HINT_NTA); //sy = ry0 * ry0 * (3.0 - 2.0 * ry0); //tmp = _mm_set_ps1 (2.0) ; sy = _mm_mul_ps (ry0,two); //tmp = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sy); tmp = _mm_mul_ps (tmp2,ry0); sy = _mm_mul_ps (tmp, ry0); _mm_prefetch((char*)&(g3[b010+bz00]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b011+bz01]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b012+bz02]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b013+bz03]),_MM_HINT_NTA); //sz = rz0 * rz0 * (3.0 - 2.0 * rz0); //tmp = _mm_set_ps1 (2.0) ; sz = _mm_mul_ps (rz0,two); //tmp = _mm_set_ps1 (3.0); tmp2 = _mm_sub_ps (three,sz); tmp = _mm_mul_ps (tmp2,rz0); sz = _mm_mul_ps (tmp, rz0); _mm_prefetch((char*)&(g3[b110+bz00]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b111+bz01]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b112+bz02]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b113+bz03]),_MM_HINT_NTA); // q = g3[ b00 ]; /* get random gradient */ // u = rx0 * q[0] + ry0 * q[1] + rz0 * q[2]; q0 = _mm_set_ps(g3[b000+bz00][0], g3[b001+bz01][0], g3[b002+bz02][0], g3[b003+bz03][0] ); q1 = _mm_set_ps(g3[b000+bz00][1], g3[b001+bz01][1], g3[b002+bz02][1], g3[b003+bz03][1] ); q2 = _mm_set_ps(g3[b000+bz00][2], g3[b001+bz01][2], g3[b002+bz02][2], g3[b003+bz03][2] ); tmp3 = _mm_mul_ps(q2, rz0); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx0); tmp4 = _mm_add_ps(tmp, tmp2); u = _mm_add_ps(tmp4, tmp3); _mm_prefetch((char*)&(g3[b000+bz10]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b001+bz11]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b002+bz12]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b003+bz13]),_MM_HINT_NTA); // q = g3[ b10 + bz0 ]; // v = rx1 * q[0] + ry0 * q[1] ; q0 = _mm_set_ps(g3[b100+bz00][0], g3[b101+bz01][0], g3[b102+bz02][0], g3[b103+bz03][0] ); q1 = _mm_set_ps(g3[b100+bz00][1], g3[b101+bz01][1], g3[b102+bz02][1], g3[b103+bz03][1] ); q2 = _mm_set_ps(g3[b100+bz00][2], g3[b101+bz01][2], g3[b102+bz02][2], g3[b103+bz03][2] ); tmp3 = _mm_mul_ps(q2, rz0); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx1); tmp4 = _mm_add_ps(tmp, tmp2); v = _mm_add_ps(tmp4, tmp3);// a = u + sx * (v-u); /* get value at distance sx between u & v */ tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); a = _mm_add_ps (u,tmp2); _mm_prefetch((char*)&(g3[b100+bz10]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b101+bz11]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b102+bz12]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b103+bz13]),_MM_HINT_NTA); // q = g3[ b01 ]; // u = rx0*q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b010+bz00][0], g3[b011+bz01][0], g3[b012+bz02][0], g3[b013+bz03][0] ); q1 = _mm_set_ps(g3[b010+bz00][1], g3[b011+bz01][1], g3[b012+bz02][1], g3[b013+bz03][1] ); q2 = _mm_set_ps(g3[b010+bz00][2], g3[b011+bz01][2], g3[b012+bz02][2], g3[b013+bz03][2] ); tmp3 = _mm_mul_ps(q2, rz0); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx0); tmp4 = _mm_add_ps(tmp, tmp2); u = _mm_add_ps(tmp4, tmp3); _mm_prefetch((char*)&(g3[b010+bz10]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b011+bz11]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b012+bz12]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b013+bz13]),_MM_HINT_NTA); // q = g3[ b11 ]; // v = rx1 * q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b110+bz00][0], g3[b111+bz01][0], g3[b112+bz02][0], g3[b113+bz03][0] ); q1 = _mm_set_ps(g3[b110+bz00][1], g3[b111+bz01][1], g3[b112+bz02][1], g3[b113+bz03][1] ); q2 = _mm_set_ps(g3[b110+bz00][2], g3[b111+bz01][2], g3[b112+bz02][2], g3[b113+bz03][2] ); tmp3 = _mm_mul_ps(q2, rz0); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx1); tmp4 = _mm_add_ps(tmp, tmp2); v = _mm_add_ps(tmp4, tmp3);// b = u + sx * (v-u); tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); b = _mm_add_ps (u,tmp2); // c = a + sy * (a-b); tmp = _mm_sub_ps (b,a); tmp2 = _mm_mul_ps (tmp,sy); c = _mm_add_ps (a,tmp2);/*NEW */ _mm_prefetch((char*)&(g3[b110+bz10]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b111+bz11]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b112+bz12]),_MM_HINT_NTA); _mm_prefetch((char*)&(g3[b113+bz13]),_MM_HINT_NTA);// q = g3[ b00 + bz1 ]; // u = rx0*q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b000+bz10][0], g3[b001+bz11][0], g3[b002+bz12][0], g3[b003+bz13][0] ); q1 = _mm_set_ps(g3[b000+bz10][1], g3[b001+bz11][1], g3[b002+bz12][1], g3[b003+bz13][1] ); q2 = _mm_set_ps(g3[b000+bz10][2], g3[b001+bz11][2], g3[b002+bz12][2], g3[b003+bz13][2] ); tmp3 = _mm_mul_ps(q2, rz1); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx0); tmp4 = _mm_add_ps(tmp, tmp2); u = _mm_add_ps(tmp4, tmp3); // q = g3[ b10 + bz1 ]; // v = rx1 * q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b100+bz10][0], g3[b101+bz11][0], g3[b102+bz12][0], g3[b103+bz13][0] ); q1 = _mm_set_ps(g3[b100+bz10][1], g3[b101+bz11][1], g3[b102+bz12][1], g3[b103+bz13][1] ); q2 = _mm_set_ps(g3[b100+bz10][2], g3[b101+bz11][2], g3[b102+bz12][2], g3[b103+bz13][2] ); tmp3 = _mm_mul_ps(q2, rz1); tmp = _mm_mul_ps(q1, ry0); tmp2 = _mm_mul_ps(q0, rx1); tmp4 = _mm_add_ps(tmp, tmp2); v = _mm_add_ps(tmp4, tmp3);// a = u + sx * (v-u); tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); a = _mm_add_ps (u,tmp2); // q = g3[ b01 + bz1 ]; // v = rx1 * q[0] + ry1 * q[1]; q0 = _mm_set_ps(g3[b010+bz10][0], g3[b011+bz11][0], g3[b012+bz12][0], g3[b013+bz13][0] ); q1 = _mm_set_ps(g3[b010+bz10][1], g3[b011+bz11][1], g3[b012+bz12][1], g3[b013+bz13][1] ); q2 = _mm_set_ps(g3[b010+bz10][2], g3[b011+bz11][2], g3[b012+bz12][2], g3[b013+bz13][2] ); tmp3 = _mm_mul_ps(q2, rz1); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx0); tmp4 = _mm_add_ps(tmp, tmp2); u = _mm_add_ps(tmp4, tmp3); q0 = _mm_set_ps(g3[b110+bz10][0], g3[b111+bz11][0], g3[b112+bz12][0], g3[b113+bz13][0] ); q1 = _mm_set_ps(g3[b110+bz10][1], g3[b111+bz11][1], g3[b112+bz12][1], g3[b113+bz13][1] ); q2 = _mm_set_ps(g3[b110+bz10][2], g3[b111+bz11][2], g3[b112+bz12][2], g3[b113+bz13][2] ); tmp3 = _mm_mul_ps(q2, rz1); tmp = _mm_mul_ps(q1, ry1); tmp2 = _mm_mul_ps(q0, rx1); tmp4 = _mm_add_ps(tmp, tmp2); v = _mm_add_ps(tmp4, tmp3); tmp = _mm_sub_ps (v,u); tmp2 = _mm_mul_ps (tmp,sx); b = _mm_add_ps (u,tmp2); tmp = _mm_sub_ps (b,a); tmp2 = _mm_mul_ps (tmp,sy); d = _mm_add_ps (a,tmp2); // result = 1.5 * (c + sz * (d-c)) /* interpolate in y */ why 1.5 ?? tmp = _mm_sub_ps(d,c); tmp2 = _mm_mul_ps(tmp,sz); tmp = _mm_add_ps(tmp2,c); tmp2 = _mm_set_ps1(1.5); a = _mm_mul_ps(tmp2,tmp); _mm_store_ps(result, a ); // aligned } #include "noisetab.c" void PInitNoise() { int i;//set up our two random number lookup tables //one for 2d noise the other for 3d... for (i= 0; i < B+B+2; ++i) { p[i]= p_precomputed[i]; g3[i][0] = g2[i][0] = g_precomputed[i][0]; g3[i][1] = g2[i][1] = g_precomputed[i][1]; g3[i][2] = g_precomputed[i][2]; } } void tnoise2 () {int x; Vector v;// some aligned storage for test reults _declspec (align(16)) float a[1000] ; _declspec (align(16)) float b[1000] ; v.x = (float)rand()/10000.0; v.y = (float)rand()/10000.0; v.z = (float)rand()/10000.0; //printf("%f %f %f :",v.x,v.y,v.z); for (x=0;x<1000;x++) Noise2(v, &(a[x])); for (x=0;x<1000;x+=4) Noise24(v,v,v,v, &(b[x])); //printf("%f %f %f \n",a[0]-b[0],a[456]-b[456],a[999]-b[999]); }void tnoise3 () {_declspec (align(16)) float a[1000] ; _declspec (align(16)) float b[1000] ;int x; Vector v; v.x = (float)rand()/10000.0; v.y = (float)rand()/10000.0; v.z = (float)rand()/10000.0;// printf("%f %f %f\t:",v.x,v.y,v.z); for (x=0;x<1000;x++) Noise3(v, &(a[x])); for (x=0;x<1000;x+=4) Noise34(v, v, v, v, &(b[x]));// printf("%f %f %f %f\n",a[0],b[0],a[999],b[999]); } //Almost there... int main(int argc, char** argv) { int x; PInitNoise(); for (x=0;x < 1000;x++) { tnoise2(); tnoise3(); } } ```

The zip file viewer built into the Developer Toolbox made use of the zlib library, as well as the zlibdll source additions.