/*
 * Multiply two 4x4 float matrices with SSE and store the product in r.
 *
 * Viewing each matrix as four consecutive groups of four floats, the
 * routine computes, for every group i in 0..3:
 *
 *     r[4*i .. 4*i+3] = b[4*i+0] * a[0..3]
 *                     + b[4*i+1] * a[4..7]
 *                     + b[4*i+2] * a[8..11]
 *                     + b[4*i+3] * a[12..15]
 *
 * That is r = b * a when the arrays are read as row-major matrices, or
 * equivalently r = a * b when they are read as column-major matrices.
 *
 * Parameters:
 *   r  destination, 16 floats
 *   a  first input, 16 floats
 *   b  second input, 16 floats
 *
 * Requirements:
 *   - All three pointers must be 16-byte aligned: every memory access uses
 *     movaps, which faults on an unaligned address.
 *   - r may be exactly the same array as a or as b (a is loaded completely
 *     before the first store, and each group of b is read before the
 *     matching group of r is written).  Partially overlapping arrays are
 *     not supported.
 *   - Requires an x86 target with SSE and a compiler that understands
 *     GCC-style extended asm.
 *
 * Original author's note: about two times faster than pure C code
 * (not re-measured).
 */

/*
 * Assembly for one group of four results.  `off` is the byte offset of the
 * group as a string literal ("0x00", "0x10", ...).  The four floats of b at
 * that offset are each broadcast across a register, multiplied by the
 * matching group of a (held in xmm0..xmm3), summed, and stored to r at the
 * same offset.
 */
#define MATRIX44_SSE_ROW(off)              \
    "movaps " off "(%2),%%xmm4\n\t"        \
    "movaps %%xmm4,%%xmm5\n\t"             \
    "movaps %%xmm4,%%xmm6\n\t"             \
    "movaps %%xmm4,%%xmm7\n\t"             \
    "shufps $0x00,%%xmm4,%%xmm4\n\t"       \
    "shufps $0x55,%%xmm5,%%xmm5\n\t"       \
    "shufps $0xAA,%%xmm6,%%xmm6\n\t"       \
    "shufps $0xFF,%%xmm7,%%xmm7\n\t"       \
    "mulps %%xmm0,%%xmm4\n\t"              \
    "mulps %%xmm1,%%xmm5\n\t"              \
    "mulps %%xmm2,%%xmm6\n\t"              \
    "mulps %%xmm3,%%xmm7\n\t"              \
    "addps %%xmm5,%%xmm4\n\t"              \
    "addps %%xmm6,%%xmm4\n\t"              \
    "addps %%xmm7,%%xmm4\n\t"              \
    "movaps %%xmm4," off "(%0)\n\t"

void matrix44_mult_SSE(float* r, const float* a, const float* b)
{
    /*
     * __asm__ (rather than asm) keeps this compiling under strict ISO
     * modes such as -std=c11.
     */
    __asm__ __volatile__(
        /* Keep all of a resident in xmm0..xmm3 for the whole routine. */
        "movaps 0x00(%1),%%xmm0\n\t"
        "movaps 0x10(%1),%%xmm1\n\t"
        "movaps 0x20(%1),%%xmm2\n\t"
        "movaps 0x30(%1),%%xmm3\n\t"

        MATRIX44_SSE_ROW("0x00")    /* r[0..3]   */
        MATRIX44_SSE_ROW("0x10")    /* r[4..7]   */
        MATRIX44_SSE_ROW("0x20")    /* r[8..11]  */
        MATRIX44_SSE_ROW("0x30")    /* r[12..15] */
        : /* no register outputs; the result is written through r */
        : "r" (r), "r" (a), "r" (b)
        : "xmm0", "xmm1", "xmm2", "xmm3",
          "xmm4", "xmm5", "xmm6", "xmm7",
          /*
           * The asm reads *a and *b and writes *r behind the compiler's
           * back.  Without this clobber the compiler may cache or reorder
           * those memory accesses around the asm when the call is inlined.
           */
          "memory"
    );
}

#undef MATRIX44_SSE_ROW