typedef struct
{
    u32 x;
    u32 y;
    u32 z;
} V3;

typedef union
{
    struct
    {
        u32 x;
        u32 y;
        u32 z;
        u32 padding;
    } components;

    __m128i sse;
} V3U;
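To make the union trick concrete, here is a small, self-contained check (my own sketch, not part of the benchmark code). It assumes only <emmintrin.h> and stdint typedefs, and verifies that V3U is exactly 16 bytes and that the component view and the SSE view alias the same memory.

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

typedef uint32_t u32;

// same layout as the V3U above
typedef union
{
    struct { u32 x; u32 y; u32 z; u32 padding; } components;
    __m128i sse;
} V3U;

int main(void)
{
    V3U v = {0};
    v.components.x = 1;
    v.components.y = 2;
    v.components.z = 3;

    // read the same 16 bytes back through the SSE view
    u32 lanes[4];
    _mm_storeu_si128((__m128i *)lanes, v.sse);

    printf("sizeof(V3U) = %zu\n", sizeof(V3U));                              // 16
    printf("lanes: %u %u %u %u\n", lanes[0], lanes[1], lanes[2], lanes[3]);  // 1 2 3 0

    return 0;
}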
V3 *AddVectors(V3 *va, V3 *vb, u32 size)
{
    V3 *result = calloc(size, sizeof(V3));

    for(u32 i = 0; i < size; i++)
    {
        result[i].x = va[i].x + vb[i].x;
        result[i].y = va[i].y + vb[i].y;
        result[i].z = va[i].z + vb[i].z;
    }

    return result;
}

V3U *AddVectors_128(V3U *va, V3U *vb, u32 size)
{
    V3U *result = calloc(size, sizeof(V3U));

    for(u32 i = 0; i < size; i++)
    {
        result[i].sse = _mm_add_epi32(va[i].sse, vb[i].sse);
    }

    return result;
}
inline void StartCounter()
{
    QueryPerformanceFrequency(&Frequency);
    QueryPerformanceCounter(&StartingTime);
}

inline long long GetCounter()
{
    QueryPerformanceCounter(&EndingTime);

    ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart;
    ElapsedMicroseconds.QuadPart *= 1000000;
    ElapsedMicroseconds.QuadPart /= Frequency.QuadPart;

    return ElapsedMicroseconds.QuadPart;
}
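For reference, this is roughly how the two counters wrap a measurement. The harness below is an assumption for illustration (the globals, BenchmarkAdd and the call to AddVectors_128 are mine), not the exact benchmark code.

#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

// globals used by StartCounter() / GetCounter()
static LARGE_INTEGER Frequency, StartingTime, EndingTime, ElapsedMicroseconds;

// StartCounter(), GetCounter(), V3U and AddVectors_128() as defined above

void BenchmarkAdd(V3U *va, V3U *vb, u32 size)
{
    StartCounter();
    V3U *sum = AddVectors_128(va, vb, size);
    long long us = GetCounter();

    printf("AddVectors_128 over %u vectors: %lld us\n", size, us);
    free(sum);
}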
V3U *AddVectors_128(V3U *va, V3U *vb, u32 size)
{
    V3U *result = calloc(size, sizeof(V3U));

    for(u32 i = 0; i < size; i+=4)
    {
        result[i].sse   = _mm_add_epi32(va[i].sse,   vb[i].sse);
        result[i+1].sse = _mm_add_epi32(va[i+1].sse, vb[i+1].sse);
        result[i+2].sse = _mm_add_epi32(va[i+2].sse, vb[i+2].sse);
        result[i+3].sse = _mm_add_epi32(va[i+3].sse, vb[i+3].sse);
    }

    return result;
}
V3U *AddVectors_128(V3U *va, V3U *vb, u32 size)
{
    V3U *result = calloc(size, sizeof(V3U));

    for(u32 i = 0; i < size; i+=4)
    {
        (*result++).sse = _mm_add_epi32((*va++).sse, (*vb++).sse);
        (*result++).sse = _mm_add_epi32((*va++).sse, (*vb++).sse);
        (*result++).sse = _mm_add_epi32((*va++).sse, (*vb++).sse);
        (*result++).sse = _mm_add_epi32((*va++).sse, (*vb++).sse);
    }

    // note: result has been advanced through the loop, so the pointer
    // returned here no longer points to the start of the buffer
    return result;
}
V3U *AddVectors_128(V3U *va, V3U *vb, u32 size)
{
    V3U *result = calloc(size, sizeof(V3U));

    V3U *pva = va;
    V3U *pvb = vb;
    V3U *pvc = result;

    for(u32 i = 0; i < size; i+=4)
    {
        (*pvc++).sse = _mm_add_epi32((*pva++).sse, (*pvb++).sse);
        (*pvc++).sse = _mm_add_epi32((*pva++).sse, (*pvb++).sse);
        (*pvc++).sse = _mm_add_epi32((*pva++).sse, (*pvb++).sse);
        (*pvc++).sse = _mm_add_epi32((*pva++).sse, (*pvb++).sse);
    }

    return result;
}
0000000000000040: F3 0F 6F 0B          movdqu      xmm1,xmmword ptr [rbx]
0000000000000044: 4D 8D 49 40          lea         r9,[r9+40h]
0000000000000048: 66 0F FE 0F          paddd       xmm1,xmmword ptr [rdi]
000000000000004C: 48 8D 5B 40          lea         rbx,[rbx+40h]
0000000000000050: F3 41 0F 7F 49 C0    movdqu      xmmword ptr [r9-40h],xmm1
0000000000000056: 48 8D 7F 40          lea         rdi,[rdi+40h]
000000000000005A: F3 0F 6F 4B D0       movdqu      xmm1,xmmword ptr [rbx-30h]
000000000000005F: 66 0F FE 4F D0       paddd       xmm1,xmmword ptr [rdi-30h]
0000000000000064: F3 41 0F 7F 49 D0    movdqu      xmmword ptr [r9-30h],xmm1
000000000000006A: F3 0F 6F 4B E0       movdqu      xmm1,xmmword ptr [rbx-20h]
000000000000006F: 66 0F FE 4F E0       paddd       xmm1,xmmword ptr [rdi-20h]
0000000000000074: F3 41 0F 7F 49 E0    movdqu      xmmword ptr [r9-20h],xmm1
000000000000007A: F3 0F 6F 47 F0       movdqu      xmm0,xmmword ptr [rdi-10h]
000000000000007F: F3 0F 6F 4B F0       movdqu      xmm1,xmmword ptr [rbx-10h]
0000000000000084: 66 0F FE C8          paddd       xmm1,xmm0
0000000000000088: F3 41 0F 7F 49 F0    movdqu      xmmword ptr [r9-10h],xmm1
V3 *NormalizeVectors(V3 *va, u32 size)
{
    // assumes the float variant of V3 (f32 x, y, z plus a padding member),
    // not the u32 V3 defined earlier
    V3 *result = calloc(size, sizeof(V3));

    for(u32 i = 0; i < size; i+=4)
    {
        f32 l = (f32)sqrt(va[i].x * va[i].x + va[i].y * va[i].y + va[i].z * va[i].z);
        result[i].x = va[i].x / l;
        result[i].y = va[i].y / l;
        result[i].z = va[i].z / l;
        result[i].padding = va[i].padding / l; // for keeping the functions comparable

        l = (f32)sqrt(va[i+1].x * va[i+1].x + va[i+1].y * va[i+1].y + va[i+1].z * va[i+1].z);
        result[i+1].x = va[i+1].x / l;
        result[i+1].y = va[i+1].y / l;
        result[i+1].z = va[i+1].z / l;
        result[i+1].padding = va[i+1].padding / l;

        l = (f32)sqrt(va[i+2].x * va[i+2].x + va[i+2].y * va[i+2].y + va[i+2].z * va[i+2].z);
        result[i+2].x = va[i+2].x / l;
        result[i+2].y = va[i+2].y / l;
        result[i+2].z = va[i+2].z / l;
        result[i+2].padding = va[i+2].padding / l;

        l = (f32)sqrt(va[i+3].x * va[i+3].x + va[i+3].y * va[i+3].y + va[i+3].z * va[i+3].z);
        result[i+3].x = va[i+3].x / l;
        result[i+3].y = va[i+3].y / l;
        result[i+3].z = va[i+3].z / l;
        result[i+3].padding = va[i+3].padding / l;
    }

    return result;
}
V3U *NormalizeVectors_128(V3U *va, u32 size)
{
    // assumes the float variant of V3U (f32 components and an __m128 sse member);
    // the .m128_f32[] element access is MSVC-specific
    V3U *result = calloc(size, sizeof(V3U));

    __m128 lqs_128;
    __m128 lq_128;
    __m128 l1_128;
    __m128 l2_128;
    __m128 l3_128;
    __m128 l4_128;

    for(u32 i = 0; i < size; i+=4)
    {
        lqs_128 = _mm_set_ps(va[i].components.x   * va[i].components.x   + va[i].components.y   * va[i].components.y   + va[i].components.z   * va[i].components.z,
                             va[i+1].components.x * va[i+1].components.x + va[i+1].components.y * va[i+1].components.y + va[i+1].components.z * va[i+1].components.z,
                             va[i+2].components.x * va[i+2].components.x + va[i+2].components.y * va[i+2].components.y + va[i+2].components.z * va[i+2].components.z,
                             va[i+3].components.x * va[i+3].components.x + va[i+3].components.y * va[i+3].components.y + va[i+3].components.z * va[i+3].components.z);

        lq_128 = _mm_sqrt_ps(lqs_128);

        l1_128 = _mm_set_ps1(lq_128.m128_f32[3]);
        l2_128 = _mm_set_ps1(lq_128.m128_f32[2]);
        l3_128 = _mm_set_ps1(lq_128.m128_f32[1]);
        l4_128 = _mm_set_ps1(lq_128.m128_f32[0]);

        result[i].sse   = _mm_div_ps(va[i].sse,   l1_128);
        result[i+1].sse = _mm_div_ps(va[i+1].sse, l2_128);
        result[i+2].sse = _mm_div_ps(va[i+2].sse, l3_128);
        result[i+3].sse = _mm_div_ps(va[i+3].sse, l4_128);
    }

    return result;
}
__m128 rsqrt = _mm_rsqrt_ps(dot);
__m128 rsqrt = _mm_rsqrt_ps(dot);

// one Newton-Raphson iteration: rsqrt = 0.5 * rsqrt * (3 - dot * rsqrt * rsqrt)
__m128 tmp = _mm_mul_ps(_mm_mul_ps(dot, rsqrt), rsqrt);
rsqrt = _mm_mul_ps(_mm_mul_ps(_mm_set_ps1(0.5f), rsqrt), _mm_sub_ps(_mm_set_ps1(3.0f), tmp));
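Dropping that refinement into the normalization loop replaces the square root and the four divides with multiplies. The function below is my own sketch of what that might look like, under the same float-V3U assumption as before; it is not the article's code.

V3U *NormalizeVectors_128_rsqrt(V3U *va, u32 size)
{
    V3U *result = calloc(size, sizeof(V3U));

    for(u32 i = 0; i < size; i+=4)
    {
        __m128 dot = _mm_set_ps(va[i].components.x   * va[i].components.x   + va[i].components.y   * va[i].components.y   + va[i].components.z   * va[i].components.z,
                                va[i+1].components.x * va[i+1].components.x + va[i+1].components.y * va[i+1].components.y + va[i+1].components.z * va[i+1].components.z,
                                va[i+2].components.x * va[i+2].components.x + va[i+2].components.y * va[i+2].components.y + va[i+2].components.z * va[i+2].components.z,
                                va[i+3].components.x * va[i+3].components.x + va[i+3].components.y * va[i+3].components.y + va[i+3].components.z * va[i+3].components.z);

        // estimate 1/sqrt(dot) and refine it with one Newton-Raphson step
        __m128 rsqrt = _mm_rsqrt_ps(dot);
        __m128 tmp = _mm_mul_ps(_mm_mul_ps(dot, rsqrt), rsqrt);
        rsqrt = _mm_mul_ps(_mm_mul_ps(_mm_set_ps1(0.5f), rsqrt), _mm_sub_ps(_mm_set_ps1(3.0f), tmp));

        // multiply by the broadcast reciprocal length instead of dividing
        result[i].sse   = _mm_mul_ps(va[i].sse,   _mm_set_ps1(rsqrt.m128_f32[3]));
        result[i+1].sse = _mm_mul_ps(va[i+1].sse, _mm_set_ps1(rsqrt.m128_f32[2]));
        result[i+2].sse = _mm_mul_ps(va[i+2].sse, _mm_set_ps1(rsqrt.m128_f32[1]));
        result[i+3].sse = _mm_mul_ps(va[i+3].sse, _mm_set_ps1(rsqrt.m128_f32[0]));
    }

    return result;
}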
__m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm8)
...
DEFINE SELECT4(src, control) {
    CASE(control[1:0]) OF
    0:  tmp[31:0] := src[31:0]
    1:  tmp[31:0] := src[63:32]
    2:  tmp[31:0] := src[95:64]
    3:  tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
dst[31:0]   := SELECT4(a[127:0], imm8[1:0])
dst[63:32]  := SELECT4(a[127:0], imm8[3:2])
dst[95:64]  := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])
c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(x, y, z, w));
c[0] = a[w];
c[1] = a[z];
c[2] = b[y];
c[3] = b[x];
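A concrete example with made-up values makes the reversed lane order easier to see; note that the low two lanes come from a and the high two from b.

#include <stdio.h>
#include <xmmintrin.h>

int main(void)
{
    // a = (1, 2, 3, 4), b = (5, 6, 7, 8), indexed from element 0 upward
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);

    // c[0] = a[0], c[1] = a[1], c[2] = b[2], c[3] = b[3]
    __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));

    float out[4];
    _mm_storeu_ps(out, c);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // prints: 1 2 7 8

    return 0;
}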
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst.