some problem about sse

some problem about sse

I want to know why my SSE program is slower than my ANSI C program; I'm so puzzled.
The code is as follows:

#include <conio.h>
#include <math.h>
#include <stdio.h>
#include <time.h>
#include <xmmintrin.h>
/* 3-component float vector. The MSVC align(16) attribute requests 16-byte
   alignment; the array itself still holds only three floats. */
typedef _declspec(align(16)) float vec3_t[3];
/*
 * Normalize a 3-component vector in place using SSE intrinsics.
 *
 * vec: the three floats to scale by 1/sqrt(x*x + y*y + z*z); the array
 *      parameter decays to a float pointer, so the caller's data is modified.
 *
 * No zero-length guard is applied: an all-zero input produces NaN
 * components (0/0).
 *
 * Declared static inline because in C99/C11 a plain `inline` definition
 * provides no external definition, so a call the compiler chooses not to
 * inline would fail to link.
 */
static inline void vec_normalize_sse(vec3_t vec)
{
    /* vec3_t holds only three floats, so a full 16-byte load/store would
       touch 4 bytes past the array. Move x,y as a 64-bit pair and z as a
       scalar instead; the unused 4th lane is a well-defined zero. */
    __m128 xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)vec);
    __m128 z = _mm_load_ss(vec + 2);
    __m128 v = _mm_movelh_ps(xy, z);            /* (x, y, z, 0) */

    /* Squared length accumulated in lane 0 as (x*x + y*y) + z*z. */
    __m128 sq = _mm_mul_ps(v, v);
    __m128 yy = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(1, 1, 1, 1));
    __m128 zz = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 len = _mm_sqrt_ss(_mm_add_ss(_mm_add_ss(sq, yy), zz));

    /* Broadcast the length to every lane and divide all components at once. */
    len = _mm_shuffle_ps(len, len, _MM_SHUFFLE(0, 0, 0, 0));
    v = _mm_div_ps(v, len);

    /* Write back exactly three floats: low pair, then lane 2. */
    _mm_storel_pi((__m64 *)vec, v);
    _mm_store_ss(vec + 2, _mm_movehl_ps(v, v));
}
/*
 * Normalize a 3-component vector in place using plain scalar C.
 *
 * vec: the three floats to scale to unit length; the array parameter decays
 *      to a float pointer, so the caller's data is modified.
 *
 * The reciprocal of the length is computed once and multiplied into each
 * component. No zero-length guard is applied: an all-zero input produces
 * NaN components (0 * inf).
 *
 * Declared static inline because in C99/C11 a plain `inline` definition
 * provides no external definition, so a call the compiler chooses not to
 * inline would fail to link.
 */
static inline void vec_normalize_c(vec3_t vec)
{
    float len_sq = vec[0] * vec[0] + vec[1] * vec[1] + vec[2] * vec[2];
    /* sqrtf keeps the computation in single precision instead of
       round-tripping through double. */
    float inv_len = 1.0f / sqrtf(len_sq);

    vec[0] *= inv_len;
    vec[1] *= inv_len;
    vec[2] *= inv_len;
}
/*
 * Benchmark driver: runs vec_normalize_sse and then vec_normalize_c
 * `count` times each on the same starting vector and prints, for each, the
 * elapsed clock() ticks followed by the final vector components.
 *
 * Returns 0.
 */
int main(void)
{
    const int count = 1000000;
    vec3_t vec;
    clock_t start, end;   /* clock() returns clock_t, not int */
    int i;

    vec[0] = 1.0f;
    vec[1] = 2.0f;
    vec[2] = 3.0f;
    start = clock();
    for (i = 0; i < count; i++) {
        /* Perturb the vector so each call normalizes a changed input. */
        vec[0] += 0.1f;
        vec[1] += 0.1f;
        vec[2] += 0.1f;
        vec_normalize_sse(vec);
    }
    end = clock();
    /* Elapsed time is reported in raw clock ticks (CLOCKS_PER_SEC units). */
    printf("sse = %ld, %f, %f, %f\n", (long)(end - start), vec[0], vec[1], vec[2]);

    vec[0] = 1.0f;
    vec[1] = 2.0f;
    vec[2] = 3.0f;
    start = clock();
    for (i = 0; i < count; i++) {
        vec[0] += 0.1f;
        vec[1] += 0.1f;
        vec[2] += 0.1f;
        vec_normalize_c(vec);
    }
    end = clock();
    printf("c = %ld, %f, %f, %f\n", (long)(end - start), vec[0], vec[1], vec[2]);

    /* getch() is a non-standard console routine (MSVC declares it in
       <conio.h>); presumably here to wait for a key press before exiting. */
    getch();
    return 0;
}

1 post / 0 new
For more complete information about compiler optimizations, see our Optimization Notice.