# 超标量编程 101（矩阵相乘）第 3 部分（共 5 部分）

Cilk++ 方法使用常见的“分治方法”（或区块方法）将矩阵分割成较小的工作集，恰好适合线程可用的高速缓存。这是一个很好的起点。接下来我们需要做的是，如何在系统处理器内的高速缓存之间协调相关活动。

```cpp
// divide the iteration space across each multi-core processor
// L3$ selects one thread slot per L3 cache
//		.OR.
// within processor (Socket) for processors without L3 cache
// NOTE(review): the original text contained markdown escapes ("\$")
// leaked into the identifiers; restored to plain "$" here.
parallel_for( OneEach_L3$, intptr_t(0), size,
[&](intptr_t iBeginL3, intptr_t iEndL3)
{
	// divide our L3 iteration space by L2 within this thread's L3
	parallel_for( OneEach_Within_L3$ + L2$, iBeginL3, iEndL3,
	[&](intptr_t iBeginL2, intptr_t iEndL2)
	{
		// Here we are running as the Master thread of a
		// 2 team member team (or 1 in the event of older
		// processor)
		//
		// Now bring in our other team member(s)
		parallel_distribute( L2$,
		[&](intptr_t iTMinL2, intptr_t nTMinL2)
		{
			// ... (Do Work)
		} // [&](intptr_t iTMinL2, intptr_t nTMinL2)
		); // parallel_distribute( L2$,
	} // [&](intptr_t iBeginL2, intptr_t iEndL2)
	); // parallel_for( OneEach_Within_L3$ + L2$, iBeginL3, iEndL3,
} // [&](intptr_t iBeginL3, intptr_t iEndL3)
); // parallel_for( OneEach_L3$, intptr_t(0), size,
```

parallel_for( OneEach_M0\$, intptr_t(0), size,
...

```cpp
// compute DOT product of two vectors, returns result as double
double DOT(double v1[], double v2[], intptr_t size)
{
	// Scalar reference implementation: accumulate the sum of
	// element-wise products of v1 and v2.
	double sum = 0.0;
	for (intptr_t k = 0; k < size; ++k)
		sum += v1[k] * v2[k];
	return sum;
}

// SSE2 dot product of two vectors, returns result as double.
// NOTE(review): v1/v2 are loaded through __m128d* — assumes both are
// 16-byte aligned; confirm callers allocate aligned storage.
double xmmDOT(double v1[], double v2[], intptr_t size)
{
	// __declspec(align(16)) not working reliably for me:
	// over-allocate and pick the 16-byte-aligned pair inside temp[]
	double temp[4];
	intptr_t alignedTemp = (((intptr_t)&temp[0]) & 8) >> 3;

	__m128d	_temp = _mm_set_pd(0.0, 0.0);
	__m128d *_v1 = (__m128d *)v1;
	__m128d *_v2 = (__m128d *)v2;

	intptr_t	halfSize = size / 2;

	// BUG FIX: the loop body was missing — the multiply-accumulate
	// of two doubles per iteration is restored here.
	for(intptr_t i = 0; i < halfSize; i++)
	{
		_temp = _mm_add_pd(_temp, _mm_mul_pd(_v1[i], _v2[i]));
	}
	// TODO: rework to drop the temp[4] array (e.g. via scalar extract)
	_mm_store_pd( &temp[alignedTemp], _temp);
	// fold in the odd tail element, if any
	if(size & 1)
		temp[alignedTemp] += v1[size-1] * v2[size-1];

	// combine the two partial sums held in the SSE lanes
	return temp[alignedTemp] + temp[alignedTemp+1];
}

// compute two DOT products at once
// effectively
// r[0] = DOT(v1, v2, size);
// r[1] = DOT(v1, v3, size);
// except running both results at the same time
void DOTDOT(double v1[], double v2[], double v3[], double r[2],  intptr_t size)
{
	// Accumulate both dot products in a single pass over v1,
	// so v1[i] is read once for the two products.
	double	temp[2];
	temp[0] = 0.0;
	temp[1] = 0.0;
	// FIX: loop counter widened from int to intptr_t, matching the
	// size parameter and the other DOT routines (no overflow for
	// sizes above INT_MAX on 64-bit builds).
	for(intptr_t i = 0; i < size; ++i)
	{
		temp[0] += v1[i] * v2[i];
		temp[1] += v1[i] * v3[i];
	} // for(intptr_t i = 0; i < size; ++i)
	r[0] = temp[0];
	r[1] = temp[1];
}

// compute two DOT products at once
// effectively
// r[0] = DOT(v1, v2, size);
// r[1] = DOT(v1, v3, size);
// except running both results at the same time
// SSE2 version of DOTDOT: r[0] = v1·v2, r[1] = v1·v3 in one pass.
// NOTE(review): v1/v2/v3 are loaded through __m128d* — assumes all
// three are 16-byte aligned; confirm callers.
void xmmDOTDOT(double v1[], double v2[], double v3[], double r[2],  intptr_t size)
{
	// __declspec(align(16)) not working reliably for me:
	// over-allocate and pick the 16-byte-aligned slots inside temp[]
	double	temp[6];
	intptr_t alignedTemp = (((intptr_t)&temp[0]) & 8) >> 3;
	__m128d	_temp0 = _mm_set_pd(0.0, 0.0);
	__m128d	_temp1 = _mm_set_pd(0.0, 0.0);
	__m128d *_v1 = (__m128d *)v1;
	__m128d *_v2 = (__m128d *)v2;
	__m128d *_v3 = (__m128d *)v3;

	intptr_t	halfSize = size / 2;

	// BUG FIX: the loop body was missing — both multiply-accumulates
	// (two doubles per product per iteration) are restored here.
	for(intptr_t i = 0; i < halfSize; i++)
	{
		_temp0 = _mm_add_pd(_temp0, _mm_mul_pd(_v1[i], _v2[i]));
		_temp1 = _mm_add_pd(_temp1, _mm_mul_pd(_v1[i], _v3[i]));
	}
	// alignedTemp is 16-byte aligned, so alignedTemp+2 is as well
	_mm_store_pd( &temp[alignedTemp], _temp0);
	_mm_store_pd( &temp[alignedTemp+2], _temp1);
	// fold in the odd tail element, if any
	if(size & 1)
	{
		temp[alignedTemp] += v1[size-1] * v2[size-1];
		temp[alignedTemp+2] += v1[size-1] * v3[size-1];
	}

	// combine the lane-wise partial sums of each product
	r[0] =  temp[alignedTemp] + temp[alignedTemp+1];
	r[1] =  temp[alignedTemp+2] + temp[alignedTemp+3];
}

// Runtime dispatch flag read by doDOT/doDOTDOT below:
// true  -> use the SSE2 (xmm*) implementations
// false -> use the plain scalar implementations
bool UseXMM = false;

double doDOT(double v1[], double v2[], intptr_t size)
{
	// Select the SSE2 path when enabled, otherwise the scalar path.
	return UseXMM ? xmmDOT(v1, v2, size) : DOT(v1, v2, size);
}

void doDOTDOT(double v1[], double v2[], double v3[], double r[2],  intptr_t size)
{
	// Select the SSE2 path when enabled, otherwise the scalar path.
	if(!UseXMM)
	{
		DOTDOT(v1, v2, v3, r, size);
		return;
	}
	xmmDOTDOT(v1, v2, v3, r, size);
}
```

15% 的差距通常微不足道；然而请注意，并行双精度（parallel doubles）方法和 Cilk++ 方法的曲线似乎都在 1024 这一规模处出现了下降。对于大型矩阵，尤其是在多插槽系统上，应当运行更多测试。当我有机会收集到此类数据时，我会公布该性能测试的最新结果。

Jim Dempsey