Challenge
Rearrange (deswizzle) data from SoA (Structure of Arrays) format to AoS (Array of Structures) format. In the deswizzle operation, data held as separate xxxx, yyyy, zzzz arrays is rearranged and stored in memory as interleaved xyz structures (the code below also carries a fourth, w, component).
Solution
Use the unpcklps/unpckhps instructions to regenerate the xyxy layout and then store each half (xy) into its corresponding memory location using movlps/movhps followed by another movlps/movhps to store the z component. The following code illustrates the deswizzle function:
// Convert four vertices from SoA layout (xxxx yyyy zzzz wwww) to AoS layout
// (xyzw xyzw xyzw xyzw).
//   in  : read with four movaps loads at byte offsets 0, 16, 32, 48
//         (movaps requires the address to be 16-byte aligned).
//   out : written as four 16-byte vertices; each vertex receives its xy pair
//         at +0 and its zw pair at +8 through 8-byte movlps/movhps stores.
// NOTE(review): `in` and `out` are also x86 instruction mnemonics; some inline
// assemblers reject C symbols that collide with reserved words -- confirm
// with the target toolchain.
void deswizzle_asm(Vertex_soa *in, Vertex_aos *out)
{
    __asm {
        mov      eax, in             // eax = source (SoA) base address
        mov      ecx, out            // ecx = destination (AoS) base address

        // Load every component row before any store is issued.
        movaps   xmm0, [eax]         // xmm0 = x1 x2 x3 x4
        movaps   xmm1, [eax+16]      // xmm1 = y1 y2 y3 y4
        movaps   xmm2, [eax+32]      // xmm2 = z1 z2 z3 z4
        movaps   xmm3, [eax+48]      // xmm3 = w1 w2 w3 w4

        // The unpack instructions overwrite their destination, so keep a
        // copy of x and z for the high-half interleave.
        movaps   xmm4, xmm0          // xmm4 = x1 x2 x3 x4
        movaps   xmm5, xmm2          // xmm5 = z1 z2 z3 z4

        // Interleave into (x,y) pairs and (z,w) pairs.
        unpcklps xmm0, xmm1          // xmm0 = x1 y1 x2 y2
        unpckhps xmm4, xmm1          // xmm4 = x3 y3 x4 y4
        unpcklps xmm2, xmm3          // xmm2 = z1 w1 z2 w2
        unpckhps xmm5, xmm3          // xmm5 = z3 w3 z4 w4

        // Write each vertex as two 8-byte halves: xy first, then zw.
        movlps   [ecx],    xmm0      // v1 = x1 y1 -- --
        movlps   [ecx+8],  xmm2      // v1 = x1 y1 z1 w1
        movhps   [ecx+16], xmm0      // v2 = x2 y2 -- --
        movhps   [ecx+24], xmm2      // v2 = x2 y2 z2 w2
        movlps   [ecx+32], xmm4      // v3 = x3 y3 -- --
        movlps   [ecx+40], xmm5      // v3 = x3 y3 z3 w3
        movhps   [ecx+48], xmm4      // v4 = x4 y4 -- --
        movhps   [ecx+56], xmm5      // v4 = x4 y4 z4 w4
    }
}
|
You may have to swizzle data in the registers, but not in memory. This occurs when two different functions need to process the data in different layouts. In lighting, for example, data comes as rrrr gggg bbbb aaaa, and you must deswizzle it into rgba before converting it into integers. In this case, use the movlhps/movhlps instructions to do the first part of the deswizzle, followed by shuffle (shufps) instructions.
The following code deswizzles data using the movlhps and shuffle instructions:
// Deswizzle four RGBA colors from SoA layout (rrrr gggg bbbb aaaa) to AoS
// layout (rgba rgba rgba rgba), doing the rearrangement in registers.
//   in  : read with four movaps loads at byte offsets 0, 16, 32, 48
//         (movaps requires the address to be 16-byte aligned).
//   out : written with four movaps stores at byte offsets 0, 16, 32, 48.
// Register comments list elements from lowest to highest.
// NOTE(review): `in` and `out` are also x86 instruction mnemonics; some inline
// assemblers reject C symbols that collide with reserved words -- confirm
// with the target toolchain.
void deswizzle_rgb(Vertex_soa *in, Vertex_aos *out)
{
    //---deswizzle rgb---
    __asm {
        mov     ecx, in              // load structure addresses
        mov     edx, out
        movaps  xmm1, [ecx]          // xmm1 = r1 r2 r3 r4
        movaps  xmm2, [ecx+16]       // xmm2 = g1 g2 g3 g4
        movaps  xmm3, [ecx+32]       // xmm3 = b1 b2 b3 b4
        movaps  xmm4, [ecx+48]       // xmm4 = a1 a2 a3 a4

        // Step 1: movlhps/movhlps pair up halves so that each register holds
        // two channels of two colors.
        movaps  xmm7, xmm4           // xmm7 = a1 a2 a3 a4
        movhlps xmm7, xmm3           // xmm7 = b3 b4 a3 a4
        movaps  xmm6, xmm2           // xmm6 = g1 g2 g3 g4 (saved: xmm2 is overwritten next)
        movlhps xmm3, xmm4           // xmm3 = b1 b2 a1 a2
        movhlps xmm2, xmm1           // xmm2 = r3 r4 g3 g4
        movlhps xmm1, xmm6           // xmm1 = r1 r2 g1 g2
        movaps  xmm6, xmm2           // xmm6 = r3 r4 g3 g4
        movaps  xmm5, xmm1           // xmm5 = r1 r2 g1 g2

        // Step 2: shufps takes its two low result elements from the
        // destination and its two high ones from the source.
        //   0x88 selects elements 0 and 2 of each operand (first color of the pair)
        //   0xDD selects elements 1 and 3 of each operand (second color of the pair)
        shufps  xmm1, xmm3, 0x88     // xmm1 = r1 g1 b1 a1
        shufps  xmm5, xmm3, 0xDD     // xmm5 = r2 g2 b2 a2
        shufps  xmm6, xmm7, 0x88     // xmm6 = r3 g3 b3 a3
        shufps  xmm2, xmm7, 0xDD     // xmm2 = r4 g4 b4 a4

        movaps  [edx],    xmm1       // v1 = r1 g1 b1 a1
        movaps  [edx+16], xmm5       // v2 = r2 g2 b2 a2
        movaps  [edx+32], xmm6       // v3 = r3 g3 b3 a3
        movaps  [edx+48], xmm2       // v4 = r4 g4 b4 a4
    }
}
|
The following code deswizzles data as 64-bit integer SIMD data:
// Deswizzle four (u, v) pairs of 32-bit values from SoA layout to AoS layout
// using 64-bit MMX registers.
//   in  : u1 u2 at +0, u3 u4 at +8, v1 v2 at +16, v3 v4 at +24 (8-byte loads).
//   out : u1 v1 at +0, u2 v2 at +8, u3 v3 at +16, u4 v4 at +24 (8-byte stores).
// Register comments list the low dword first.
// The MMX state is released with emms before returning, so x87
// floating-point code that runs afterwards is not affected.
// NOTE(review): `in` and `out` are also x86 instruction mnemonics; some inline
// assemblers reject C symbols that collide with reserved words -- confirm
// with the target toolchain.
void mmx_deswizzle(IVertex_soa *in, IVertex_aos *out)
{
    __asm {
        mov       ebx, in
        mov       edx, out

        // First two pairs.
        movq      mm0, [ebx]         // mm0 = u1 u2
        movq      mm1, [ebx+16]      // mm1 = v1 v2
        movq      mm2, mm0           // mm2 = u1 u2 (copy: the unpack overwrites mm0)
        punpckhdq mm0, mm1           // mm0 = u2 v2 (high dwords interleaved)
        punpckldq mm2, mm1           // mm2 = u1 v1 (low dwords interleaved)
        movq      [edx], mm2         // store u1 v1
        movq      [edx+8], mm0       // store u2 v2

        // Last two pairs.
        movq      mm4, [ebx+8]       // mm4 = u3 u4
        movq      mm5, [ebx+24]      // mm5 = v3 v4
        movq      mm6, mm4           // mm6 = u3 u4 (copy: the unpack overwrites mm4)
        punpckhdq mm4, mm5           // mm4 = u4 v4 (high dwords interleaved)
        punpckldq mm6, mm5           // mm6 = u3 v3 (low dwords interleaved)
        movq      [edx+16], mm6      // store u3 v3
        movq      [edx+24], mm4      // store u4 v4

        emms                         // mark the x87 register stack empty again
    }
}
|
Source
Intel® 64 and IA-32 Architectures Optimization Reference Manual