Intel ARCHITECTURE IA-32 User Manual
Page 282

IA-32 Intel® Architecture Optimization
5-20
// START HORIZONTAL ADD
// Sums the four floats of each of four vectors A, B, C, D and leaves the
// four totals packed in xmm6.
//   In  (as implied by the lane comments; the loads are outside this excerpt):
//         xmm0 = A1,A2,A3,A4    xmm1 = B1,B2,B3,B4
//         xmm2 = C1,C2,C3,C4    xmm3 = D1,D2,D3,D4
//   Out:  xmm6 = sum(A),sum(B),sum(C),sum(D)
//   Clobbers: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 (xmm0 is preserved)
// Lane lists in the comments are written lowest element first.

// Step 1: fold each vector in half, pairing A with B and C with D.
movaps  xmm5, xmm0          // xmm5= A1,A2,A3,A4
movlhps xmm5, xmm1          // xmm5= A1,A2,B1,B2
movhlps xmm1, xmm0          // xmm1= A3,A4,B3,B4
addps   xmm5, xmm1          // xmm5= A1+A3,A2+A4,B1+B3,B2+B4

movaps  xmm4, xmm2          // xmm4= C1,C2,C3,C4 (copy: xmm2 is overwritten next)
movlhps xmm2, xmm3          // xmm2= C1,C2,D1,D2
movhlps xmm3, xmm4          // xmm3= C3,C4,D3,D4
addps   xmm3, xmm2          // xmm3= C1+C3,C2+C4,D1+D3,D2+D4

// Step 2: gather the even and the odd partial sums into separate registers.
// shufps takes result lanes 0-1 from its destination and lanes 2-3 from its
// source, so the A/B partial sums must be the destination and the C/D
// partial sums the source in both shuffles.
movaps  xmm6, xmm5          // xmm6= A1+A3,A2+A4,B1+B3,B2+B4
shufps  xmm6, xmm3, 0x88    // xmm6= A1+A3,B1+B3,C1+C3,D1+D3 (lanes 0,2 of each)
shufps  xmm5, xmm3, 0xDD    // xmm5= A2+A4,B2+B4,C2+C4,D2+D4 (lanes 1,3 of each)

// Step 3: one vertical add completes all four horizontal sums.
addps   xmm6, xmm5          // xmm6= sum(A),sum(B),sum(C),sum(D)
// END HORIZONTAL ADD
movaps [edx], xmm6
}
}
Example 5-9
Horizontal Add Using movhlps/movlhps (continued)