I wrote a program for solving lower triangular systems of equations. I tested the serial version and the AVX version; the results are:
Matrix Order : 10000x4
Matrix Band : 5000x4
Data Layout : 4x4 columned
Data Type : double
CPU : Corei7 2600
memory size : 16G
platform : VS2010
serial version: 0.357240s ( 100 cycles averaged )
SIMD (AVX) version : 0.338360s ( 100 cycles averaged )
My question is: the AVX version gives almost no speedup over the serial version. Why?
This is my code:
/*
 * Solve the 4x4 lower-triangular system L * y = x in place by forward
 * substitution (column-oriented: each solved unknown is immediately
 * eliminated from the remaining right-hand-side entries).
 *
 * x : in  - right-hand side (4 doubles)
 *     out - solution y (4 doubles)
 * L : 16 doubles; element (row r, column c) is read from L[c*4 + r].
 *     Only the lower triangle (including the diagonal) is read; the
 *     strictly upper entries are never touched.
 *
 * The diagonal entries are used as divisors and are not checked; a zero
 * diagonal yields inf/NaN results.
 *
 * Declared `static inline`: a plain `inline` definition in C99/C11 does not
 * emit an external symbol, so any call the compiler chooses not to inline
 * would fail to link.
 */
static inline void solveL_pivot( double* x, const double* L )
{
    double e0 = x[ 0 ];
    double e1 = x[ 1 ];
    double e2 = x[ 2 ];
    double e3 = x[ 3 ];

    /* column 0: solve for e0, then eliminate it from rows 1..3 */
    e0 /= L[ 0 ];
    e1 -= e0 * L[ 1 ];
    e2 -= e0 * L[ 2 ];
    e3 -= e0 * L[ 3 ];

    /* column 1: solve for e1, then eliminate it from rows 2..3 */
    e1 /= L[ 5 ];
    e2 -= e1 * L[ 6 ];
    e3 -= e1 * L[ 7 ];

    /* column 2: solve for e2, then eliminate it from row 3 */
    e2 /= L[ 10 ];
    e3 -= e2 * L[ 11 ];

    /* column 3: only the diagonal remains */
    e3 /= L[ 15 ];

    x[ 0 ] = e0;
    x[ 1 ] = e1;
    x[ 2 ] = e2;
    x[ 3 ] = e3;
}
/*
 * Banded lower-triangular solve built on solveL_pivot(), AVX variant.
 *
 * NOTE(review): the text of this function is damaged. The two `for` lines
 * below have lost their loop conditions and part of their bodies (the text
 * between a `<` and a later `>` seems to have been stripped), and the
 * braces no longer balance. The lines from the second damaged `for` down to
 * the end look like the tail of a separate scalar routine that calls
 * solveL_update(), which is not defined in the visible text. Recover the
 * original source before attempting to compile or modify this.
 *
 * o     : in/out vector; the visible tail advances it 4 doubles per step.
 * c     : matrix data; the visible tail advances it `stride` doubles per step.
 * order : problem size - presumably counted in 4x4 blocks, TODO confirm.
 * band  : band width   - presumably counted in 4x4 blocks, TODO confirm.
 */
void vsolveL_band( double* o, const double* c, unsigned int order, unsigned int band )
{
double* xproxy;
double* Lproxy;
/* NOTE(review): only ymm0..ymm3 are used in the text that survived. */
__m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
double* x=o;
/* Drops the const qualifier from c; nothing visible writes through L. */
double* L=( double* )c;
/* band * 16 doubles, i.e. `band` blocks of 16 doubles each. */
unsigned int const stride=band<<4;
unsigned int const d=order-band;
unsigned int n=band-1;
/* NOTE(review): damaged line - loop condition and start of body are missing. */
for( unsigned int i=0; id ){ --n; }
xproxy=x;
Lproxy=( double* )L;
/* Solve the current 4x4 diagonal block in place. */
solveL_pivot( xproxy, Lproxy );
/* Replicate each of the four solved values across all lanes of a register. */
ymm0=_mm256_broadcast_sd( &xproxy[ 0 ] );
ymm1=_mm256_broadcast_sd( &xproxy[ 1 ] );
ymm2=_mm256_broadcast_sd( &xproxy[ 2 ] );
ymm3=_mm256_broadcast_sd( &xproxy[ 3 ] );
/* NOTE(review): damaged line - loop condition and start of body are missing. */
for( unsigned int k=0; kd ){ --n; }
solveL_pivot( o, c );
/* c+16 skips the 16 doubles of the diagonal block just solved. */
solveL_update( o, c+16, n );
o+=4; c+=stride;
}
solveL_pivot( o, c );
}


