Optimize SSE-Code?!

Optimize SSE-Code?!

Here is my own function for normalizing a 3D vector:

// Normalizes a 3-component vector using SSE (MSVC x86 inline assembly).
//
// Out    - receives the normalized x, y, z (three floats are written to Out.v).
// Vector - source vector; three floats are read from Vector.v.
// Returns a reference to Out.
//
// The reciprocal length comes from RSQRTPS refined by one Newton-Raphson
// step:  r' = 0.5 * r * (3 - a * r * r),  with a = x*x + y*y + z*z.
//
// NOTE(review): a zero-length input is not special-cased; a == 0 gives
// rsqrt == +Inf and the refinement then produces NaN in all components.
// Callers are presumably expected to pass non-zero vectors - confirm.
// NOTE(review): assumes Vector.v / Out.v are at least 3 contiguous floats.
// No alignment is required for them (movss / movlps are used), only for the
// two constant tables below (movaps).
c3DVector& OptimizedSSE_Vector3DNormalize(c3DVector &Out,const c3DVector &Vector)
{
    const float *input = Vector.v;

    // Written through by the asm block below, so it must not be pointer-to-const.
    float *result = Out.v;

    __declspec(align(16)) static const float Packedf0pt5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
    __declspec(align(16)) static const float Packedf3pt0[4] = {3.0f, 3.0f, 3.0f, 3.0f};

    __asm
    {
        // eax/ecx are scratch registers; using ebx here would force the
        // compiler to save/restore it in the prologue/epilogue.
        mov     eax, input
        mov     ecx, result

        // Build xmm0 = (x, y, z, z). The 4th lane duplicates z instead of
        // holding 0 so that every lane of the squared sum below is the real
        // squared length; a 0 lane would yield rsqrt(0) = Inf and then
        // 0 * Inf = NaN, raising the invalid-operation flag on every call.
        movss   xmm0, [eax+8]           // xmm0 = (z, 0, 0, 0)
        shufps  xmm0, xmm0, 0x0F        // lanes 0..3 <- src 3,3,0,0 : (0, 0, z, z)
        movlps  xmm0, [eax]             // low half <- x, y          : (x, y, z, z)

        movaps  xmm5, Packedf0pt5       // 0.5 in all lanes
        movaps  xmm6, Packedf3pt0       // 3.0 in all lanes

        movaps  xmm2, xmm0              // keep the original vector for the final scale
        mulps   xmm0, xmm0              // (x*x, y*y, z*z, z*z)

        // Horizontal sum via two rotated copies; lanes 0..2 use the same
        // operand order as (xx+yy)+zz / (yy+xx)+zz / (zz+xx)+yy.
        movaps  xmm3, xmm0
        movaps  xmm4, xmm0
        shufps  xmm3, xmm3, 0x41        // lanes <- src 1,0,0,1 : (y*y, x*x, x*x, y*y)
        shufps  xmm4, xmm4, 0x1A        // lanes <- src 2,2,1,0 : (z*z, z*z, y*y, x*x)
        addps   xmm0, xmm3
        addps   xmm0, xmm4              // a = x*x + y*y + z*z in every lane

        rsqrtps xmm1, xmm0              // r ~= 1/sqrt(a) (hardware approximation)

        // One Newton-Raphson refinement: r' = 0.5 * r * (3.0 - a*r*r)
        mulps   xmm0, xmm1              // a*r
        mulps   xmm0, xmm1              // a*r*r
        subps   xmm6, xmm0              // 3.0 - a*r*r
        mulps   xmm1, xmm6              // r * (3.0 - a*r*r)
        mulps   xmm1, xmm5              // 0.5 * r * (3.0 - a*r*r)

        mulps   xmm2, xmm1              // scale the original vector by 1/length

        movlps  [ecx], xmm2             // write x and y
        shufps  xmm2, xmm2, 0xFE        // lanes <- src 2,3,3,3 : z moves to lane 0
        movss   [ecx+8], xmm2           // write z
    }

    return Out;
}


I hope it is well documented, but it is not fast enough. How can I optimize this function? (I don't want to use a 4D vector.)

1 post / 0 new
For more complete information about compiler optimizations, see our Optimization Notice.