Hi:
I have write a stupid code to practiceSSE:
#include
#include
#include
#include /*SSE*/
#define MALLOC_ALIGN_16BYTE(_size) _aligned_malloc( _size, 16)
#define FREE_ALIGN_16BYTE(ptr) _aligned_free(ptr)
int main(void)
{
int inN;
short *input;
float *yIn;
inN = 10;
input = (short*)MALLOC_ALIGN_16BYTE(inN*sizeof(short));
yIn = (float*)MALLOC_ALIGN_16BYTE(inN*sizeof(float));
for(int i = 0; i< inN; i++){
input[i] = (short)i;
}/*for i*/
__m64 *pShort = (__m64*)input;
__m128 *pFloat = (__m128*)yIn;
int m = inN/4;
#if(1)
for(int i = 0; i< m; i++){
pFloat[i] = _mm_cvtpi16_ps(pShort[i]);
}/*for i*/
for(int i = m*4; i< inN;i++){
yIn[i] = (float)input[i];
}
for(int i = 0; i< inN; i++){
printf("i = %d, yout = %4.3f\\n", i,yIn[i] );
}/*for i*/
#else
for(int i = 0; i< inN;i++){
yIn[i] = (float)input[i];
}
for(int i = 0; i< inN; i++){
printf("i = %d, yout = %4.3f\\n", i,yIn[i] );
}/*for i*/
#endif
FREE_ALIGN_16BYTE(input);
FREE_ALIGN_16BYTE(yIn);
}/*main*/
the printf result is :
i = 0, yout = 0.000
i = 1, yout = 1.000
i = 2, yout = 2.000
i = 3, yout = 3.000
i = 4, yout = 4.000
i = 5, yout = 5.000
i = 6, yout = 6.000
i = 7, yout = 7.000
i = 8, yout = -1.#IO
i = 9, yout = 9.000
if I set inN = 12; the result is
i = 0, yout = -1.#IO
i = 1, yout = 1.000
i = 2, yout = 2.000
i = 3, yout = 3.000
i = 4, yout = 4.000
i = 5, yout = 5.000
i = 6, yout = 6.000
i = 7, yout = 7.000
i = 8, yout = 8.000
i = 9, yout = 9.000
i = 10, yout = 10.000
i = 11, yout = 11.000
They is both the same for VC8 or ICC10.1.
I do not know what mistake I make.... i am newbie in use SSE intrinsics.
could someone help me ?
thank you.


