# (pain reloaded) Why performance isn't much better in Euclidean distance with SSE2?

## (pain reloaded) Why performance isn't much better in Euclidean distance with SSE2?

Hallo,

I am trying to improve the performance of a very simple program, by using SSE2. The program snippet only needs to calculate the distance of all pairs of a set of N points (I know I could take advantage of symmetry, but I am focusing now at low vectorization).

I am using intrinsics, have examined a bit the generated assembler (though I am far from being an expert) and it looks fine. However, the runtime is only 10% less than the original one. What am I missing?

Thanks.

Program is the following:

```#include #include #include #include #define SSE2_ALIGNED __attribute__ ((aligned (16)))#define print_y(x) printf("%d
",(int) x)#define print_n(x) x#define print print_nint D;float *X,*Y;inline static void dist(int i,int j){  float xd = X[i] - X[j];  float yd = Y[i] - Y[j];  print(rint(xd*xd + yd*yd));}inline static void dist_sse(int i){  float d[8] SSE2_ALIGNED;  int j;  __m128 xmm0 = _mm_set1_ps(X[i]);  __m128 xmm1 = xmm0;  __m128 xmm2 = _mm_set1_ps(Y[i]);  __m128 xmm3 = xmm2;  __m128 xmm4,xmm5,xmm6,xmm7;  for(j=0; j  {    xmm4 =_mm_load_ps(X+j);    xmm5 =_mm_load_ps(X+j+4);    xmm6 =_mm_load_ps(Y+j);    xmm7 =_mm_load_ps(Y+j+4);    xmm4 = _mm_sub_ps(xmm0,xmm4);    xmm5 = _mm_sub_ps(xmm1,xmm5);    xmm6 = _mm_sub_ps(xmm2,xmm6);    xmm7 = _mm_sub_ps(xmm3,xmm7);    xmm4 = _mm_mul_ps(xmm4,xmm4);    xmm5 = _mm_mul_ps(xmm5,xmm5);    xmm6 = _mm_mul_ps(xmm6,xmm6);    xmm7 = _mm_mul_ps(xmm7,xmm7);    xmm4 = _mm_add_ps(xmm4,xmm6);    xmm5 = _mm_add_ps(xmm5,xmm7);    _mm_store_ps(d,xmm4);    _mm_store_ps(d+4,xmm5);    print(rint(d[0]));    print(rint(d[1]));    print(rint(d[2]));    print(rint(d[3]));    print(rint(d[4]));    print(rint(d[5]));    print(rint(d[6]));    print(rint(d[7]));  }}int main(int argc, char * argv[]){  int i,j,opc;  if ( argc != 3 )  {    fprintf(stderr,"
Usage: %s

",argv[0]);    return 1;   }  opc = atoi(argv[1]);  D = atoi(argv[2]);  if ( D %8 != 0 )  {    fprintf(stderr,"
Dimension %d must be multiple of 8:

",D);    return 2;   }  if ( opc == 0 )  {    X = (float *) malloc(D * sizeof(float));    Y = (float *) malloc(D * sizeof(float));  }  else  {    X = (float *) _mm_malloc(D * sizeof(float), 16);    Y = (float *) _mm_malloc(D * sizeof(float), 16);  }  for(i=0;i  {    X[i] = i;    Y[i] = D - i;  }  if ( opc == 0 )    for(i=0;i      for(j=0;j        dist(i,j);  else    for(i=0;i      dist_sse(i);  return 0;}```

I am compiling with:

```makefile
CC = gcc
CFLAGS = -O3 -Wall -march=pentium-m -msse2
all: kk
```

And generated assembler is:

```	.file	"kk.c"	.def	___main;	.scl	2;	.type	32;	.endef	.section .rdata,"dr"LC0:	.ascii "12Usage: %s  1212"	.align 4LC1:	.ascii "12Dimension %d must be multiple of 8: 1212"	.text	.p2align 4,,15.globl _main	.def	_main;	.scl	2;	.type	32;	.endef_main:	pushl	%ebp	movl	\$16, %ea
x	movl	%esp, %ebp	pushl	%edi	pushl	%esi	pushl	%ebx	subl	\$108, %esp	movl	12(%ebp), %ebx	andl	\$-16, %esp	call	__alloca	call	___main	cmpl	\$3, 8(%ebp)	je	L2	call	___getreent	movl	(%ebx), %esi	movl	\$LC0, %ecx	movl	%ecx, 4(%esp)	movl	%esi, 8(%esp)	movl	12(%eax), %edx	movl	%edx, (%esp)	call	_fprintf	movl	\$1, %eax	leal	-12(%ebp), %espL90:	popl	%ebx	popl	%esi	popl	%edi	popl	%ebp	retL2:	movl	4(%ebx), %edi	movl	%edi, (%esp)	call	_atoi	movl	%eax, %edi	movl	8(%ebx), %eax	movl	%eax, (%esp)	call	_atoi	movl	%eax, _D	testb	\$7, %al	movl	%eax, %ecx	jne	L82	testl	%edi, %edi	je	L83	xorl	%edx, %edx	sall	\$2, %eax	jne	L84L7:	movl	%edx, _X	movl	%ecx, %eax	xorl	%edx, %edx	sall	\$2, %eax	jne	L85L12:	movl	%edx, _Y	movl	%edx, %ebxL5:	xorl	%edx, %edx	cmpl	%ecx, %edx	jge	L59	movl	_X, %esi	.p2align 4,,15L19:	movl	%ecx, %eax	cvtsi2ss	%edx, %xmm1	subl	%edx, %eax	cvtsi2ss	%eax, %xmm0	movss	%xmm1, (%esi,%edx,4)	movss	%xmm0, (%ebx,%edx,4)	incl	%edx	cmpl	%ecx, %edx	jl	L19L59:	testl	%edi, %edi	jne	L20	xorl	%esi, %esi	cmpl	%ecx, %esi	jge	L30	.p2align 4,,15L92:	xorl	%ebx, %ebx	cmpl	%ecx, %ebx	jge	L63	.p2align 4,,15L91:	movl	_X, %edx	movl	_Y, %edi	flds	(%edx,%ebx,4)	flds	(%edi,%ebx,4)	fxch	%st(1)	incl	%ebx	fsubrs	(%edx,%esi,4)	fxch	%st(1)	fsubrs	(%edi,%esi,4)	fxch	%st(1)	fmul	%st(0), %st	fxch	%st(1)	fmul	%st(0), %st	faddp	%st, %st(1)	fstpl	(%esp)	call	_rint	fstp	%st(0)	movl	_D, %ecx	cmpl	%ecx, %ebx	jl	L91L63:	incl	%esi	cmpl	%ecx, %esi	jl	L92L30:	leal	-12(%ebp), %esp	xorl	%eax, %eax	popl	%ebx	popl	%esi	popl	%edi	popl	%ebp	retL83:	leal	0(,%eax,4), %eax	movl	%eax, (%esp)	call	_malloc	movl	%eax, _X	movl	_D, %esi	sall	\$2, %esi	movl	%esi, (%esp)	call	_malloc	movl	%eax, _Y	movl	_D, %ecx	movl	%eax, %ebx	jmp	L5L20:	xorl	%edi, %edi	cmpl	%ecx, %edi	jge	L30	.p2align 4,,15L75:	movl	_X, %edx	movl	(%edx,%edi,4), %eax	movl	%eax, -60(%ebp)	movl	(%ebx,%edi,4), %esi	movss	-60(%ebp), %xmm2	movl	%esi, -64(%ebp)	xorl	%esi, %esi	shufps	\$0, %xmm2, %xmm2	movss	-64(%ebp), %xmm4	cmpl	%ecx, %esi	movaps	%xmm2, -88(%ebp)	
shufps	\$0, %xmm4, %xmm4	movaps	%xmm4, -104(%ebp)	jl	L76	jmp	L66	.p2align 4,,7L67:	movl	_X, %edx	movl	_Y, %ebxL76:	movaps	-88(%ebp), %xmm7	leal	0(,%esi,4), %ecx	leal	(%ecx,%edx), %edx	movaps	(%edx), %xmm5	addl	%ebx, %ecx	addl	\$8, %esi	movaps	(%ecx), %xmm6	movaps	-104(%ebp), %xmm4	subps	%xmm5, %xmm7	movaps	%xmm7, %xmm5	movaps	16(%edx), %xmm3	mulps	%xmm5, %xmm5	subps	%xmm6, %xmm4	movaps	16(%ecx), %xmm1	movaps	%xmm4, %xmm7	mulps	%xmm4, %xmm7	movaps	-88(%ebp), %xmm2	addps	%xmm7, %xmm5	movaps	-104(%ebp), %xmm0	movaps	%xmm5, -56(%ebp)	flds	-56(%ebp)	subps	%xmm3, %xmm2	subps	%xmm1, %xmm0	movaps	%xmm2, %xmm3	movaps	%xmm0, %xmm6	mulps	%xmm2, %xmm3	mulps	%xmm0, %xmm6	addps	%xmm6, %xmm3	movaps	%xmm3, -40(%ebp)	fstpl	(%esp)	call	_rint	fstp	%st(0)	flds	-52(%ebp)	fstpl	(%e
sp)	call	_rint	fstp	%st(0)	flds	-48(%ebp)	fstpl	(%esp)	call	_rint	fstp	%st(0)	flds	-44(%ebp)	fstpl	(%esp)	call	_rint	fstp	%st(0)	flds	-40(%ebp)	fstpl	(%esp)	call	_rint	fstp	%st(0)	flds	-36(%ebp)	fstpl	(%esp)	call	_rint	fstp	%st(0)	flds	-32(%ebp)	fstpl	(%esp)	call	_rint	fstp	%st(0)	flds	-28(%ebp)	fstpl	(%esp)	call	_rint	fstp	%st(0)	movl	_D, %ecx	cmpl	%ecx, %esi	jl	L67L66:	incl	%edi	cmpl	%ecx, %edi	jge	L30	movl	_Y, %ebx	jmp	L75L85:	addl	\$16, %eax	movl	%eax, (%esp)	call	_malloc	testl	%eax, %eax	movl	%eax, %edx	je	L78	leal	16(%eax), %ecx	andl	\$-16, %ecx	movl	%ecx, %edx	movl	%eax, -4(%ecx)L78:	movl	_D, %ecx	jmp	L12L84:	addl	\$16, %eax	movl	%eax, (%esp)	call	_malloc	testl	%eax, %eax	movl	%eax, %edx	je	L77	leal	16(%eax), %ebx	andl	\$-16, %ebx	movl	%ebx, %edx	movl	%eax, -4(%ebx)L77:	movl	_D, %ecx	jmp	L7L82:	call	___getreent	movl	_D, %ecx	movl	\$LC1, %edx	movl	%edx, 4(%esp)	movl	%ecx, 8(%esp)	movl	12(%eax), %ebx	movl	%ebx, (%esp)	call	_fprintf	movl	\$2, %eax	leal	-12(%ebp), %esp	jmp	L90	.comm	_D, 16	 # 4	.comm	_X, 16	 # 4	.comm	_Y, 16	 # 4	.def	_atoi;	.scl	3;	.type	32;	.endef	.def	___getreent;	.scl	3;	.type	32;	.endef	.def	_fprintf;	.scl	3;	.type	32;	.endef	.def	_rint;	.scl	3;	.type	32;	.endef	.def	_malloc;	.scl	3;	.type	32;	.endef```

10 posts / 0 new
For more complete information about compiler optimizations, see our Optimization Notice.

http://gcc.gnu.org/ml/gcc-help/2008-04/msg00073.html
It was never clear why a major speedup was expected with printf() in the inner loop, nor whether a vectorizing compiler such as g++ 4.3 or icpc was tried (with printf removed so as to attempt auto-vectorization).
C99 math functions (e.g. rint) presumably are available in g++, with -std=gnu99.

Will read post ... but meanwhile can tell that the printf is conditionally removed from code. I used only for debugging the correctness of vectorized version, of course for the real timing, i removed.

thanks.

LOL

The post you mentioned is just the one I made first, before coming here to the Intel forums !!!

With "extensively covered" you mean the topic was left unanswered? ;-) Cause that's what happened. I applied all the suggestions I got, and still no significant improvement. The modified version is the one I posted here, at Intel.

There you have the assembler generated by GCC ... there we can see the SSE2 instructions. So, what is wrong with this picture?!

I would expect that, if there is one place on earth with people being experts in Intel assembler, that place must be here ... hehe. So, please check the assembler and tell me what is wrong.

Thanks.

People on the gcc-help list did their best to help you define what you wanted to do. If you wanted only to make a non-vector cross between a macro and inline function of a style understood only by gcc, with parallel SSE inside that function, perhaps what you showed was what you wanted.
Otherwise, you could have posted a shorter example, which could be compiled by standard compilers, illustrating your interest.
The rint() function is not recognized as a vectorizable function by either gcc or icc. If you meant it as a substitute for sqrtf(), the latter can be vectorized in line by icc, but apparently not yet by gcc.

Certainly I appreciate all the advice the gcc-help community provided me. Indeed, I followed all their advice.

I was not asking for a way to achieve the vectorization ... I already did it. Neither was I looking to automatic vectorization ... I manually code it. We can see the assembler there, generated directly from intrinsics functions ... it includes SSE2 of course (thus, this is not a problem of whether the compiler automatically vectorized or not).

What I am asking for, kindly of course hehe, is for advice about why my manually coded vector version is not improving significantly the performance (it offers a gain of 10% in runtime, which is very poor considering I am vectorizing the whole thing ... I would expect a gain of 75%)

Thanks.

Hi,

Try changing the allocation scheme to allocating one aligned block for all the data instead of many allocations. This will improve data locality in the caches and might gain performance.

The code here uses rint(). This part is not parallelized and is similar for the two versions. I dont know what it does so its impossible to estimate its duration relative to the other operations. It might be another reason for the low speedup.

Regards

Just to add a few more comments to explain the likely reasons for your observation of lack of speed up.

In addition to what my colleagues have pointed out, on calling an external function like rint. The pitfalls in the code you've shown have significant overhead in terms calling scalar external functions. For the purpose of rounding, using another integer conversion technique might make more sense than throwing a bunch of rint's at the end of each hand-vectorized SSE loop. Secondly, depending on the parameter "D" you use when testing, it is possible in some portion of your loop iteration the rounding of floating-point to integer may experience exceptions. That can have different amount of delays between x87 code and SSE code.

I did a quick test by simplifying your code somewhat to use the intrinsic of cvtps2pi for rounding. I also modified the scalar c code into two versions to compare the overhead of using rint vs. a simple type cast conversion.

Using a fixed value of D= 1024 (10^6 scalar loops, each loop has 2 mul, 1 add, 2 subtract), and compiled with simply /O2 on an ICC and MSC,

the scalar distance calculation with rint took ~ 40 M cycles

a modified scalar calculation with type cast conversion took ~ 17 M cycles

the modified SSE code with _mm_cvtps_pi32 took ~ 5 M cycles.

There certainly will be variances with different compilers, using an external function like rint vs. type cast convert vs. intrinsic convert. The value of D you choose and the method of timing measurement will make further variance on your measured speedup.

For your reference the modifications I made is based on accumulating the results of each evaluation of

int dist(int i,int j)
{ float xd = X[i] - X[j];
float yd = Y[i] - Y[j];
int z = rint(xd*xd + yd*yd);
return z;
}

int distB(int i,int j) // compare overhead of rint
{ float xd = X[i] - X[j];
float yd = Y[i] - Y[j];
return (int) (xd*xd + yd*yd);
}

The SSE version includes replacing the rint with _mm_cvtps_pi32 and a bunch of _mm_add_pi32, so that the different loop structure of vectorized code have the same amount of add relative to the double-nested loop of scalar code and accumulated result.

Jalo,

You may be looking at an old version of the program. The initial posts I put on the gcc-help list showed an AOS (Array of Structures). Due to suggestions, I changed that to a SOA (Structure of Arrays). I now have a few aligned arrays with all the data.

I am going to post new version, which included latests suggestions I received here. You may wanna take a look at it.

Thanks.

Jallo,

It seems that the rint function introduced a lot of noise ;-) It was not an essential part of my problem, so I removed it for the sake of this proof of concept. I can see now a 30% reduction in execution time. But still, I would expect more, given SSE2 is performing 4 operations at a time, right?

Unless maybe, I am losing something ... as usual ;-|

Thanks for your attention (below c and assembler)

PS: I don't know why, but when I used an if to repeat the code for calling one function or the other, instead of a function pointer, the performance gain was less than 30%. I know that using a function pointer may prevent inlining the functions, so I tried to avoid its usage ... with the surprise that the SSE2 gain was less! That's the reason I kept that part. Although this may be an interesting point on its own, I think that's another topic, not related to this serial vs vectorized debate.

#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>

int D;
float *X,*Y,*Z;

inline static void dist(int i)
{
float xd,yd;
int j;
for(j=0; j {
xd = X[i] - X[j];
yd = Y[i] - Y[j];
Z[j] = xd*xd + yd*yd;
}
}

inline static void dist_sse(int i)
{
int j;
__m128 xmm0 = _mm_set1_ps(X[i]);
__m128 xmm1 = xmm0;
__m128 xmm2 = _mm_set1_ps(Y[i]);
__m128 xmm3 = xmm2;
__m128 xmm4,xmm5,xmm6,xmm7;
for(j=0; j {
xmm4 = _mm_sub_ps(xmm0,xmm4);
xmm5 = _mm_sub_ps(xmm1,xmm5);
xmm6 = _mm_sub_ps(xmm2,xmm6);
xmm7 = _mm_sub_ps(xmm3,xmm7);
xmm4 = _mm_mul_ps(xmm4,xmm4);
xmm5 = _mm_mul_ps(xmm5,xmm5);
xmm6 = _mm_mul_ps(xmm6,xmm6);
xmm7 = _mm_mul_ps(xmm7,xmm7);
_mm_store_ps(Z+j,xmm4);
_mm_store_ps(Z+j+4,xmm5);
}
}

int main(int argc, char * argv[])
{
int i,j,opc,debug;
void (*opc_func)(int);

if ( argc != 4 )
{
fprintf(stderr,"
Usage: %s

",argv[0]);
return 1;
}

opc = atoi(argv[1]);
D = atoi(argv[2]);
debug = atoi(argv[3]);

if ( D %8 != 0 )
{
fprintf(stderr,"
Dimension %d must be multiple of 8:

",D);
return 2;
}

if ( opc == 0 )
{
X = (float *) malloc(D * sizeof(float));
Y = (float *) malloc(D * sizeof(float));
Z = (float *) malloc(D * sizeof(float));
}
else
{
X = (float *) _mm_malloc(D * sizeof(float), 16);
Y = (float *) _mm_malloc(D * sizeof(float), 16);
Z = (float *) _mm_malloc(D * sizeof(float), 16);
}

for(i=0;i {
X[i] = i;
Y[i] = D - i;
}

opc_func = opc == 0? dist : dist_sse;
for(i=0;i {
opc_func(i);
if ( debug )
{
for(j=0; j printf("%f
",Z[j]);
}
}

return 0;
}

.file "kk.c"
.text
.p2align 4,,15
.def _dist; .scl 3; .type 32; .endef
_dist:
pushl %ebp
xorl %eax, %eax
movl %esp, %ebp
pushl %edi
movl 8(%ebp), %edi
pushl %esi
movl _D, %esi
pushl %ebx
cmpl %esi, %eax
jge L7
movl _X, %ecx
movl _Y, %edx
movl _Z, %ebx
.p2align 4,,15
L5:
flds (%ecx,%eax,4)
flds (%edx,%eax,4)
fxch %st(1)
fsubrs (%ecx,%edi,4)
fxch %st(1)
fsubrs (%edx,%edi,4)
fxch %st(1)
fmul %st(0), %st
fxch %st(1)
fmul %st(0), %st
fstps (%ebx,%eax,4)
incl %eax
cmpl %esi, %eax
jl L5
L7:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,15
.def _dist_sse; .scl 3; .type 32; .endef
_dist_sse:
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %ebx
subl \$8, %esp
movl 8(%ebp), %esi
movl _X, %ebx
movl _Y, %ecx
movl (%ebx,%esi,4), %edx
movl %edx, -12(%ebp)
movl (%ecx,%esi,4), %eax
xorl %esi, %esi
movss -12(%ebp), %xmm5
movl %eax, -16(%ebp)
movss -16(%ebp), %xmm4
cmpl _D, %esi
shufps \$0, %xmm5, %xmm5
shufps \$0, %xmm4, %xmm4
jl L36
jmp L34
.p2align 4,,7
L35:
movl _X, %ebx
movl _Y, %ecx
L36:
leal 0(,%esi,4), %edx
movaps %xmm5, %xmm1
leal (%edx,%ebx), %eax
movaps (%eax), %xmm2
leal (%edx,%ecx), %ebx
movaps %xmm5, %xmm6
movaps (%ebx), %xmm0
movaps 16(%eax), %xmm3
subps %xmm2, %xmm1
movl _Z, %eax
movaps 16(%ebx), %xmm7
movaps %xmm1, %xmm2
movaps %xmm4, %xmm1
subps %xmm0, %xmm1
movl %edx, %ebx
movaps %xmm1, %xmm0
mulps %xmm2, %xmm2
mulps %xmm0, %xmm0
subps %xmm3, %xmm6
movaps %xmm6, %xmm3
movaps %xmm2, (%ebx)
movaps %xmm4, %xmm6
subps %xmm7, %xmm6
movl _Z, %ecx
movaps %xmm6, %xmm1
mulps %xmm3, %xmm3
mulps %xmm6, %xmm1
movaps %xmm3, 16(%edx)
cmpl _D, %esi
jl L35
L34:
popl %ebx
popl %esi
popl %ebp
ret
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
.align 4
LC1:
.ascii "12Usage: %s 1212"
LC3:
.ascii "%f12"
.align 4
LC2:
.ascii "12Dimension %d must be multiple of 8: 1212"
.text
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
pushl %ebp
movl \$16, %eax
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl \$28, %esp
movl 12(%ebp), %ebx
andl \$-16, %esp
call __alloca
call ___main
cmpl \$4, 8(%ebp)
je L38
call ___getreent
movl (%ebx), %esi
movl \$LC1, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
movl 12(%eax), %edx
movl %edx, (%esp)
call _fprintf
movl \$1, %eax
leal -12(%ebp), %esp
L99:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,7
L38:
movl 4(%ebx), %edx
movl %edx, (%esp)
call _atoi
movl %eax, -16(%ebp)
movl 8(%ebx), %eax
movl %eax, (%esp)
call _atoi
movl %eax, _D
movl 12(%ebx), %edi
movl %edi, (%esp)
call _atoi
movl %eax, -20(%ebp)
movl _D, %ecx
testb \$7, %cl
jne L89
movl -16(%ebp), %edi
testl %edi, %edi
je L90
movl %ecx, %eax
xorl %edx, %edx
sall \$2, %eax
jne L91
L43:
movl %edx, _X
movl %ecx, %eax
xorl %edx, %edx
sall \$2, %eax
jne L92
L48:
movl %edx, _Y
movl %ecx, %eax
xorl %edx, %edx
sall \$2, %eax
jne L93
L53:
movl %edx, _Z
L41:
xorl %edx, %edx
cmpl %ecx, %edx
jge L73
movl _X, %esi
movl _Y, %ebx
.p2align 4,,15
L60:
movl %ecx, %eax
cvtsi2ss %edx, %xmm1
subl %edx, %eax
cvtsi2ss %eax, %xmm0
movss %xmm1, (%esi,%edx,4)
movss %xmm0, (%ebx,%edx,4)
incl %edx
cmpl %ecx, %edx
jl L60
L73:
movl -16(%ebp), %ebx
movl \$_dist, %edi
movl \$_dist_sse, %edx
testl %ebx, %ebx
cmovne %edx, %edi
xorl %esi, %esi
cmpl %ecx, %esi
jge L75
.p2align 4,,15
L101:
movl %esi, (%esp)
call *%edi
movl -20(%ebp), %ecx
testl %ecx, %ecx
je L77
movl _D, %eax
xorl %ebx, %ebx
cmpl %eax, %ebx
jge L65
.p2align 4,,15
L100:
movl _Z, %eax
flds (%eax,%ebx,4)
incl %ebx
movl \$LC3, (%esp)
fstpl 4(%esp)
call _printf
movl _D, %eax
cmpl %eax, %ebx
jl L100
L65:
incl %esi
cmpl %eax, %esi
L102:
jl L101
L75:
leal -12(%ebp), %esp
xorl %eax, %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
L90:
leal 0(,%ecx,4), %ebx
movl %ebx, (%esp)
call _malloc
movl %eax, _X
movl _D, %edx
sall \$2, %edx
movl %edx, (%esp)
call _malloc
movl %eax, _Y
movl _D, %eax
sall \$2, %eax
movl %eax, (%esp)
call _malloc
movl %eax, _Z
movl _D, %ecx
jmp L41
.p2align 4,,7
L77:
movl _D, %eax
incl %esi
cmpl %eax, %esi
jmp L102
L93:
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L86
leal 16(%eax), %edi
andl \$-16, %edi
movl %edi, %edx
movl %eax, -4(%edi)
L86:
movl _D, %ecx
jmp L53
L92:
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L85
leal 16(%eax), %esi
andl \$-16, %esi
movl %esi, %edx
movl %eax, -4(%esi)
L85:
movl _D, %ecx
jmp L48
L91:
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L84
leal 16(%eax), %ecx
andl \$-16, %ecx
movl %ecx, %edx
movl %eax, -4(%ecx)
L84:
movl _D, %ecx
jmp L43
L89:
call ___getreent
movl _D, %esi
movl \$LC2, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
movl 12(%eax), %ebx
movl %ebx, (%esp)
call _fprintf
movl \$2, %eax
leal -12(%ebp), %esp
jmp L99
.comm _D, 16 # 4
.comm _X, 16 # 4
.comm _Y, 16
# 4
.comm _Z, 16 # 4
.def _printf; .scl 3; .type 32; .endef
.def _atoi; .scl 3; .type 32; .endef
.def ___getreent; .scl 3; .type 32; .endef
.def _fprintf; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef