OK, here are two functions:

one is SSE (P3/MMX, et cetera), the other is SSE2.

The SSE2 version performs slower. Any advice as to why?

I've compiled with and without profile-guided compilation. The results are the same: SSE2 is slower.

SSE2 function:

/*
 * Sum of absolute differences (SAD) between two 16x16 blocks of bytes,
 * computed with SSE2.
 *
 * pSrc  - first byte of the source block; each row is read with an
 *         unaligned 16-byte load, so no alignment is required.
 * pDst  - first byte of the destination block; each row is read with an
 *         aligned 16-byte load, so pDst must be 16-byte aligned and w_Dst
 *         must keep every row aligned (i.e. be a multiple of 16).
 * w_Src - distance in bytes from one source row to the next.
 * w_Dst - distance in bytes from one destination row to the next.
 *
 * Returns the sum of |dst - src| over all 16 rows of 16 bytes.
 */
int SSE2_Copy16x16NA_E(BYTE* RESTRICT pSrc,BYTE* RESTRICT pDst,int w_Src,int w_Dst)
{
    __m128i acc = _mm_setzero_si128();
    int row;

    for (row = 0; row < 16; row++) {
        const __m128i src = _mm_loadu_si128((const __m128i *)pSrc);
        const __m128i dst = _mm_load_si128((const __m128i *)pDst);

        /*
         * _mm_sad_epu8 produces one partial sum per 64-bit half (columns
         * 0-7 in the low half, columns 8-15 in the high half).  Each is at
         * most 8 * 255 per row, so 16 rows cannot overflow a 32-bit lane.
         */
        acc = _mm_add_epi32(acc, _mm_sad_epu8(dst, src));

        pDst += w_Dst;
        pSrc += w_Src;
    }

    /*
     * Fold the high-half sum onto the low-half sum.  The shifted copy has
     * to be added to the unshifted accumulator; adding it to itself would
     * double the high half and discard the low half.
     */
    acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));

    /*
     * No _mm_empty() here: only XMM registers are used, so there is no
     * MMX/x87 state to reset and EMMS would be pure overhead.
     */
    return _mm_cvtsi128_si32(acc);
}

SSE/MMX function (for comparison):

/*
 * Sum of absolute differences (SAD) between two 16x16 blocks of bytes,
 * computed with 64-bit MMX registers and the SSE integer instruction
 * behind _mm_sad_pu8.
 *
 * pSrc  - first byte of the source block.
 * pDst  - first byte of the destination block.
 * w_Src - distance in bytes from one source row to the next.
 * w_Dst - distance in bytes from one destination row to the next.
 *
 * Each 16-byte row is processed as two 8-byte halves, accumulated
 * separately and combined at the end.
 *
 * Returns the sum of |dst - src| over all 16 rows of 16 bytes.
 */
int MMX_Copy16x16NA_E(BYTE* RESTRICT pSrc,BYTE* RESTRICT pDst,int w_Src,int w_Dst)
{
    /*
     * Use the intrinsic to zero the accumulators: initializing an __m64
     * from a plain integer is not accepted by every compiler.
     */
    __m64 sumLeft  = _mm_setzero_si64();   /* columns 0-7  */
    __m64 sumRight = _mm_setzero_si64();   /* columns 8-15 */
    int row;
    int result;

    for (row = 0; row < 16; row++) {
        const __m64 *srcRow = (const __m64 *)pSrc;
        const __m64 *dstRow = (const __m64 *)pDst;

        sumLeft  = _mm_add_pi32(sumLeft,  _mm_sad_pu8(dstRow[0], srcRow[0]));
        sumRight = _mm_add_pi32(sumRight, _mm_sad_pu8(dstRow[1], srcRow[1]));

        pDst += w_Dst;
        pSrc += w_Src;
    }

    /* Combine the two half-row totals and extract the low 32 bits. */
    result = _mm_cvtsi64_si32(_mm_add_pi32(sumRight, sumLeft));

    /* MMX registers alias the x87 stack; clear the state before returning. */
    _mm_empty();

    return result;
}

Thanks,

Alex Telitsine

Streambox Inc.