I'm posting both my C code and the ASM code; both assume the following type definitions:
Code:
/* Four packed floats, 16-byte aligned so they can be accessed with aligned SSE moves (movaps). */
typedef float vector4[4] __attribute__ ((aligned (16)));
/* A vector3 is stored as a vector4; a real typedef (rather than a macro) keeps the alias scoped and visible to the debugger. */
typedef vector4 vector3;
Benchmarking with the rdtsc instruction gives me 92 clock cycles for the C version
and 52 clock cycles for the asm version, both inlined (the C one by hand).
Code:
/*
 * Cross product, stored in place: dest = src x dest.
 *
 *   x = src.y * dest.z - src.z * dest.y
 *   y = src.z * dest.x - src.x * dest.z
 *   z = src.x * dest.y - src.y * dest.x
 *
 * src  - left-hand operand (read only)
 * dest - right-hand operand on entry; receives the result via vector3_copy()
 */
void vector3_cross ( vector3 src, vector3 dest )
{
	/*
	 * The result is built in a temporary because every component reads
	 * lanes of dest that writing the result directly would overwrite.
	 */
	vector3 buf = {
		src[1]*dest[2] - src[2]*dest[1],
		src[2]*dest[0] - src[0]*dest[2],
		src[0]*dest[1] - src[1]*dest[0]
	};
	vector3_copy(buf, dest);
}
Code:
/* -1.0f with static storage, so inline asm can take it as an "m" (memory) operand. */
static float neg_one = -1.0f;
...
/*
 * SSE cross product, stored in place: dest = src x dest.
 *
 * src  - left-hand operand (read only)
 * dest - right-hand operand on entry, result on return
 *
 * Both pointers must be 16-byte aligned: movaps faults on unaligned memory.
 * All four lanes are loaded and stored; lane 3 of the result is
 * src[3]*dest[3] - src[3]*dest[3].
 */
QMATH void vector3_cross ( vector3 src, vector3 dest )
{
	/*
	 * shufps immediate, when source and destination are the same register:
	 * two bits per result lane, result lane 0 in the lowest bits.
	 *
	 *   imm = lane0 | lane1 << 2 | lane2 << 4 | lane3 << 6
	 *
	 *   with x = 0 ... w = 3:
	 *
	 *   y z x w  ->  1 | 2 << 2 | 0 << 4 | 3 << 6  =  0xc9
	 *   z x y w  ->  2 | 0 << 2 | 1 << 4 | 3 << 6  =  0xd2
	 *
	 * Using these two rotations,
	 *
	 *   src.yzx * dest.zxy  -  src.zxy * dest.yzx
	 *
	 * yields every lane of the cross product with the correct sign,
	 * so no shuffle/negate/shuffle fix-up is needed afterwards.
	 */
	__asm__ volatile (
		"movaps %[src], %%xmm0\n\t"
		"movaps %[dest], %%xmm2\n\t"
		"movaps %[src], %%xmm1\n\t"
		"movaps %[dest], %%xmm3\n\t"
		"shufps $0xc9, %%xmm0, %%xmm0\n\t"	/* xmm0 = src.yzxw  */
		"shufps $0xd2, %%xmm2, %%xmm2\n\t"	/* xmm2 = dest.zxyw */
		"shufps $0xd2, %%xmm1, %%xmm1\n\t"	/* xmm1 = src.zxyw  */
		"shufps $0xc9, %%xmm3, %%xmm3\n\t"	/* xmm3 = dest.yzxw */
		"mulps %%xmm0, %%xmm2\n\t"		/* xmm2 = src.yzx * dest.zxy */
		"mulps %%xmm1, %%xmm3\n\t"		/* xmm3 = src.zxy * dest.yzx */
		"subps %%xmm3, %%xmm2\n\t"		/* xmm2 = src x dest */
		"movaps %%xmm2, %[dest]\n\t"
		/*
		 * The operands are cast to float[4] so the compiler knows the
		 * asm reads and writes all 16 bytes; a plain *dest / *src would
		 * describe only the first float.
		 */
		: [dest] "+m" (*(float (*)[4]) dest)
		: [src] "m" (*(const float (*)[4]) src)
		: "xmm0", "xmm1", "xmm2", "xmm3"
	);
}
Bookmarks