Hello,
I am trying to improve the performance of a very simple program by using SSE2. The program snippet only needs to calculate the distance between all pairs of a set of N points (I know I could take advantage of symmetry, but for now I am focusing on low-level vectorization).
I am using intrinsics and have examined the generated assembler a bit (though I am far from being an expert), and it looks fine. However, the SSE2 version is only about 10% faster than the original one. What am I missing?
Thanks.
Program is the following:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <emmintrin.h>
/* GCC variable attribute requesting 16-byte alignment for the declared
   object; dist_sse applies it to the buffer it fills with _mm_store_ps. */
#define SSE2_ALIGNED __attribute__ ((aligned (16)))
/* print_y: print x, converted to int, followed by a space. */
#define print_y(x) printf("%d ",(int) x)
/* print_n: evaluate x and print nothing. */
#define print_n(x) x
/* Output mode used by dist and dist_sse. As written it selects print_n,
   so distances are computed but not printed. */
#define print print_n
/* Number of points; set from argv[2] in main. */
int D;
/* Point coordinates (X[k], Y[k]); both arrays are allocated and filled in main. */
float *X,*Y;
/*
 * Scalar reference version: squared Euclidean distance between points
 * i and j of the global X/Y arrays, rounded with rint() and handed to
 * the print macro.
 *
 * i, j: indices into X and Y; no bounds checking is done here.
 */
inline static void dist(int i,int j)
{
    const float dx = X[i] - X[j];
    const float dy = Y[i] - Y[j];

    /* The sum is formed directly in the rint() argument, without an
       intermediate float variable, so no extra rounding step is added. */
    print(rint(dx*dx + dy*dy));
}
inline static void dist_sse(int i)
{
float d[8] SSE2_ALIGNED;
int j;
__m128 xmm0 = _mm_set1_ps(X[i]);
__m128 xmm1 = xmm0;
__m128 xmm2 = _mm_set1_ps(Y[i]);
__m128 xmm3 = xmm2;
__m128 xmm4,xmm5,xmm6,xmm7;
for(j=0; j {
xmm4 =_mm_load_ps(X+j);
xmm5 =_mm_load_ps(X+j+4);
xmm6 =_mm_load_ps(Y+j);
xmm7 =_mm_load_ps(Y+j+4);
xmm4 = _mm_sub_ps(xmm0,xmm4);
xmm5 = _mm_sub_ps(xmm1,xmm5);
xmm6 = _mm_sub_ps(xmm2,xmm6);
xmm7 = _mm_sub_ps(xmm3,xmm7);
xmm4 = _mm_mul_ps(xmm4,xmm4);
xmm5 = _mm_mul_ps(xmm5,xmm5);
xmm6 = _mm_mul_ps(xmm6,xmm6);
xmm7 = _mm_mul_ps(xmm7,xmm7);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm5 = _mm_add_ps(xmm5,xmm7);
_mm_store_ps(d,xmm4);
_mm_store_ps(d+4,xmm5);
print(rint(d[0]));
print(rint(d[1]));
print(rint(d[2]));
print(rint(d[3]));
print(rint(d[4]));
print(rint(d[5]));
print(rint(d[6]));
print(rint(d[7]));
}
}
/*
 * Benchmark driver.
 *
 * argv[1] (opc): 0 selects the scalar path (malloc + dist for every pair),
 *                any other value selects the SSE path (_mm_malloc with
 *                16-byte alignment + dist_sse for every point).
 * argv[2] (D):   number of points; must be a multiple of 8.
 *
 * Returns 0 on success, 1 on wrong argument count, 2 when D is not a
 * multiple of 8, and 3 when the coordinate arrays cannot be allocated.
 */
int main(int argc, char * argv[])
{
    int i, j, opc;
    int status = 0;

    if ( argc != 3 )
    {
        fprintf(stderr," Usage: %s ",argv[0]);
        return 1;
    }

    opc = atoi(argv[1]);
    D = atoi(argv[2]);
    if ( D % 8 != 0 )
    {
        fprintf(stderr," Dimension %d must be multiple of 8: ",D);
        return 2;
    }

    if ( opc == 0 )
    {
        X = (float *) malloc(D * sizeof(float));
        Y = (float *) malloc(D * sizeof(float));
    }
    else
    {
        /* dist_sse uses aligned loads, so these buffers need 16-byte alignment. */
        X = (float *) _mm_malloc(D * sizeof(float), 16);
        Y = (float *) _mm_malloc(D * sizeof(float), 16);
    }

    /* Only treat NULL as an error when there is something to store: for
       D <= 0 no element is ever touched, and the program keeps returning 0. */
    if ( D > 0 && (X == NULL || Y == NULL) )
    {
        fprintf(stderr," Cannot allocate two arrays of %d floats ",D);
        status = 3;
    }
    else
    {
        for (i = 0; i < D; i++)
        {
            X[i] = i;
            Y[i] = D - i;
        }

        if ( opc == 0 )
        {
            for (i = 0; i < D; i++)
                for (j = 0; j < D; j++)
                    dist(i,j);
        }
        else
        {
            for (i = 0; i < D; i++)
                dist_sse(i);
        }
    }

    /* Release each buffer with the function matching its allocator. */
    if ( opc == 0 )
    {
        free(X);
        free(Y);
    }
    else
    {
        _mm_free(X);
        _mm_free(Y);
    }
    X = NULL;
    Y = NULL;

    return status;
}
I am compiling with:
CC = gcc
CFLAGS = -O3 -Wall -march=pentium-m -msse2
all: kk
And generated assembler is:
# Compiler output for main() with dist() and dist_sse() inlined.
# Overview of the labels:
#   L2        argument parsing and allocation dispatch
#   L83       opc == 0: plain malloc for X and Y
#   L84, L85  opc != 0: over-allocate-and-align sequence for X and Y
#             (presumably the inlined _mm_malloc)
#   L5/L19    fill loop  X[i] = i, Y[i] = D - i
#   L92/L91   scalar path: x87 arithmetic, one call to _rint per pair
#   L75/L76   SSE path: packed arithmetic for 8 pairs, then 8 calls to _rint
#   L82       "dimension must be a multiple of 8" error exit
# Both paths call _rint once per distance. In the SSE path the eight results
# are stored to the stack and pushed through the x87 unit one at a time.
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC0:
# Usage message.
# NOTE(review): the "12" runs look like octal \12 (newline) escapes whose
# backslashes were lost when the listing was pasted -- confirm against the
# real compiler output.
.ascii "12Usage: %s 1212"
.align 4
LC1:
# Dimension error message (same caveat about the "12" runs).
.ascii "12Dimension %d must be multiple of 8: 1212"
.text
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
# Prologue: save callee-saved registers, reserve locals, align %esp to 16.
pushl %ebp
# %eax = 16 before the __alloca call below (presumably its size argument).
movl $16, %eax
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $108, %esp
# %ebx = argv
movl 12(%ebp), %ebx
andl $-16, %esp
call __alloca
call ___main
# if (argc != 3) -> print usage and return 1
cmpl $3, 8(%ebp)
je L2
call ___getreent
# %esi = argv[0]
movl (%ebx), %esi
movl $LC0, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
# 12(%eax) of the ___getreent result becomes fprintf's stream argument
# (stderr in the C source).
movl 12(%eax), %edx
movl %edx, (%esp)
call _fprintf
movl $1, %eax
leal -12(%ebp), %esp
# Shared epilogue for the two error exits; return value is already in %eax.
L90:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
L2:
# opc = atoi(argv[1]), kept in %edi
movl 4(%ebx), %edi
movl %edi, (%esp)
call _atoi
movl %eax, %edi
# D = atoi(argv[2]), stored to _D and kept in %ecx
movl 8(%ebx), %eax
movl %eax, (%esp)
call _atoi
movl %eax, _D
# D % 8 != 0 is tested on the low three bits.
testb $7, %al
movl %eax, %ecx
jne L82
# opc == 0 -> plain malloc path
testl %edi, %edi
je L83
# Aligned allocation for X: a byte size of 0 yields NULL without calling malloc.
xorl %edx, %edx
sall $2, %eax
jne L84
L7:
# X = result (%edx)
movl %edx, _X
# Same sequence for Y.
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L85
L12:
# Y = result (%edx); %ebx also holds Y from here on.
movl %edx, _Y
movl %edx, %ebx
L5:
# Fill loop: for (i = 0; i < D; i++) { X[i] = i; Y[i] = D - i; }
# %edx = i, %ecx = D, %esi = X, %ebx = Y
xorl %edx, %edx
cmpl %ecx, %edx
jge L59
movl _X, %esi
.p2align 4,,15
L19:
movl %ecx, %eax
# (float) i
cvtsi2ss %edx, %xmm1
subl %edx, %eax
# (float) (D - i)
cvtsi2ss %eax, %xmm0
movss %xmm1, (%esi,%edx,4)
movss %xmm0, (%ebx,%edx,4)
incl %edx
cmpl %ecx, %edx
jl L19
L59:
# opc != 0 -> SSE path at L20
testl %edi, %edi
jne L20
# ---- Scalar path: for (i...) for (j...) dist(i, j) ----
# %esi = i, %ebx = j, %ecx = D
xorl %esi, %esi
cmpl %ecx, %esi
jge L30
.p2align 4,,15
L92:
xorl %ebx, %ebx
cmpl %ecx, %ebx
jge L63
.p2align 4,,15
L91:
# Inlined dist(i, j), computed on the x87 stack rather than with SSE.
# X and Y are reloaded from memory on every iteration.
movl _X, %edx
movl _Y, %edi
# push X[j], then Y[j]
flds (%edx,%ebx,4)
flds (%edi,%ebx,4)
fxch %st(1)
incl %ebx
# st(0) = X[i] - X[j]
fsubrs (%edx,%esi,4)
fxch %st(1)
# st(0) = Y[i] - Y[j]
fsubrs (%edi,%esi,4)
fxch %st(1)
# square both differences and add them
fmul %st(0), %st
fxch %st(1)
fmul %st(0), %st
faddp %st, %st(1)
# Store the sum as a double in the argument slot and call rint().
fstpl (%esp)
call _rint
# The result of rint() is discarded (print expands to print_n).
fstp %st(0)
# D is reloaded after the call.
movl _D, %ecx
cmpl %ecx, %ebx
jl L91
L63:
incl %esi
cmpl %ecx, %esi
jl L92
L30:
# Normal exit: return 0.
leal -12(%ebp), %esp
xorl %eax, %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
L83:
# opc == 0: X = malloc(D * 4); Y = malloc(D * 4). %eax still holds D here.
leal 0(,%eax,4), %eax
movl %eax, (%esp)
call _malloc
movl %eax, _X
movl _D, %esi
sall $2, %esi
movl %esi, (%esp)
call _malloc
movl %eax, _Y
movl _D, %ecx
# %ebx = Y, as expected by the fill loop at L5.
movl %eax, %ebx
jmp L5
L20:
# ---- SSE path: for (i...) dist_sse(i) ----
# %edi = i, %ecx = D, %ebx = Y
xorl %edi, %edi
cmpl %ecx, %edi
jge L30
.p2align 4,,15
L75:
# Broadcast X[i] and Y[i] to four lanes each. Both scalars take a round trip
# through integer registers and stack slots before reaching the xmm registers.
movl _X, %edx
movl (%edx,%edi,4), %eax
movl %eax, -60(%ebp)
movl (%ebx,%edi,4), %esi
movss -60(%ebp), %xmm2
movl %esi, -64(%ebp)
# %esi = j = 0
xorl %esi, %esi
shufps $0, %xmm2, %xmm2
movss -64(%ebp), %xmm4
cmpl %ecx, %esi
# The broadcast vectors are kept on the stack: -88(%ebp) holds X[i] x4 and
# -104(%ebp) holds Y[i] x4. They are reloaded inside the loop.
movaps %xmm2, -88(%ebp)
shufps $0, %xmm4, %xmm4
movaps %xmm4, -104(%ebp)
jl L76
jmp L66
.p2align 4,,7
L67:
# Loop back-edge: X and Y are reloaded after the calls to _rint.
movl _X, %edx
movl _Y, %ebx
L76:
# Inner loop body: 8 points per iteration.
movaps -88(%ebp), %xmm7
# %edx = &X[j], %ecx = &Y[j]
leal 0(,%esi,4), %ecx
leal (%ecx,%edx), %edx
movaps (%edx), %xmm5
addl %ebx, %ecx
# j += 8
addl $8, %esi
movaps (%ecx), %xmm6
movaps -104(%ebp), %xmm4
# Points j..j+3: (X[i]-X[j])^2 + (Y[i]-Y[j])^2 ends up in %xmm5.
subps %xmm5, %xmm7
movaps %xmm7, %xmm5
# X[j+4..j+7]
movaps 16(%edx), %xmm3
mulps %xmm5, %xmm5
subps %xmm6, %xmm4
# Y[j+4..j+7]
movaps 16(%ecx), %xmm1
movaps %xmm4, %xmm7
mulps %xmm4, %xmm7
movaps -88(%ebp), %xmm2
addps %xmm7, %xmm5
movaps -104(%ebp), %xmm0
# d[0..3] live at -56(%ebp) .. -44(%ebp)
movaps %xmm5, -56(%ebp)
# d[0] is loaded onto the x87 stack for the first rint() call.
flds -56(%ebp)
# Points j+4..j+7: the result ends up in %xmm3.
subps %xmm3, %xmm2
subps %xmm1, %xmm0
movaps %xmm2, %xmm3
movaps %xmm0, %xmm6
mulps %xmm2, %xmm3
mulps %xmm0, %xmm6
addps %xmm6, %xmm3
# d[4..7] live at -40(%ebp) .. -28(%ebp)
movaps %xmm3, -40(%ebp)
# Eight scalar calls follow, one per element of d. Each one loads a float,
# stores it as a double argument, calls rint(), and discards the result.
# rint(d[0])
fstpl (%esp)
call _rint
fstp %st(0)
# rint(d[1])
flds -52(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
# rint(d[2])
flds -48(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
# rint(d[3])
flds -44(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
# rint(d[4])
flds -40(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
# rint(d[5])
flds -36(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
# rint(d[6])
flds -32(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
# rint(d[7])
flds -28(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
# while (j < D)
movl _D, %ecx
cmpl %ecx, %esi
jl L67
L66:
# Next i; leave through L30 once i >= D.
incl %edi
cmpl %ecx, %edi
jge L30
movl _Y, %ebx
jmp L75
L85:
# Aligned allocation for Y: malloc(size + 16), round the pointer up to a
# 16-byte boundary, and stash the raw malloc pointer in the word just below
# the aligned address. %edx carries the result (NULL if malloc failed).
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L78
leal 16(%eax), %ecx
andl $-16, %ecx
movl %ecx, %edx
movl %eax, -4(%ecx)
L78:
movl _D, %ecx
jmp L12
L84:
# Aligned allocation for X; same sequence as L85, using %ebx as scratch.
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L77
leal 16(%eax), %ebx
andl $-16, %ebx
movl %ebx, %edx
movl %eax, -4(%ebx)
L77:
movl _D, %ecx
jmp L7
L82:
# D is not a multiple of 8: print the message and return 2 via L90.
call ___getreent
movl _D, %ecx
movl $LC1, %edx
movl %edx, 4(%esp)
movl %ecx, 8(%esp)
movl 12(%eax), %ebx
movl %ebx, (%esp)
call _fprintf
movl $2, %eax
leal -12(%ebp), %esp
jmp L90
# Storage for the C globals D, X and Y.
.comm _D, 16 # 4
.comm _X, 16 # 4
.comm _Y, 16 # 4
# External functions referenced above.
.def _atoi; .scl 3; .type 32; .endef
.def ___getreent; .scl 3; .type 32; .endef
.def _fprintf; .scl 3; .type 32; .endef
.def _rint; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef



