#include <immintrin.h>
void DrawTexture_Optimized(ui32* colorBufferData, v2i colorBufferSize, i32 colorBufferPitch, Quadf targetRect_screenCoords, RenderEntry_Texture image, Rectf clipRect)
{
v2f origin = targetRect_screenCoords.bottomLeft;
v2f targetRectXAxis = targetRect_screenCoords.bottomRight - targetRect_screenCoords.bottomLeft;
v2f targetRectYAxis = targetRect_screenCoords.topLeft - targetRect_screenCoords.bottomLeft;
i32 widthMax = (i32)clipRect.max.x;
i32 heightMax = (i32)clipRect.max.y;
i32 xMin = widthMax;
i32 xMax = (i32)clipRect.min.x;
i32 yMin = heightMax;
i32 yMax = (i32)clipRect.min.y;
{ //Optimization to avoid iterating over every pixel on the screen - HH ep 92
Array<v2f, 4> vecs = { origin, origin + targetRectXAxis, origin + targetRectXAxis + targetRectYAxis, origin + targetRectYAxis };
for (i32 vecIndex = 0; vecIndex < vecs.Size(); ++vecIndex)
{
v2f testVec = vecs.At(vecIndex);
i32 flooredX = FloorF32ToI32(testVec.x);
i32 ceiledX = CeilF32ToI32(testVec.x);
i32 flooredY = FloorF32ToI32(testVec.y);
i32 ceiledY = CeilF32ToI32(testVec.y);
if (xMin > flooredX)
xMin = flooredX;
if (yMin > flooredY)
yMin = flooredY;
if (xMax < ceiledX)
xMax = ceiledX;
if (yMax < ceiledY)
yMax = ceiledY;
}
if (xMin < (i32)clipRect.min.x)
xMin = (i32)clipRect.min.x;
if (yMin < (i32)clipRect.min.y)
yMin = (i32)clipRect.min.y;
if (xMax > widthMax)
xMax = widthMax;
if (yMax > heightMax)
yMax = heightMax;
};
i32 simdWidth_inPixels = 8;
//Align xMin to an 8-pixel boundary so every SIMD iteration starts on a full 8-wide lane
if ((xMin % simdWidth_inPixels) != 0)
xMin = (i32)RoundDown((sizet)xMin, simdWidth_inPixels);
//Precalculations for the inner loop
f32 invertedXAxisSqd = 1.0f / MagnitudeSqd(targetRectXAxis);
f32 invertedYAxisSqd = 1.0f / MagnitudeSqd(targetRectYAxis);
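//Shrink the sample range so the +1 texel fetch in the bilinear sample below stays in bounds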
i32 imageWidth = image.size.width - 3;
i32 imageHeight = image.size.height - 3;
v2f normalizedXAxis = invertedXAxisSqd * targetRectXAxis;
v2f normalizedYAxis = invertedYAxisSqd * targetRectYAxis;
i32 sizeOfPixel_inBytes = 4;
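//Point currentRow at the first pixel of the clipped bounding region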
ui8* currentRow = (ui8*)colorBufferData + (i32)xMin * sizeOfPixel_inBytes + (i32)yMin * colorBufferPitch;
for (i32 screenY = yMin; screenY < yMax; ++screenY)
{
ui32* destPixel = (ui32*)currentRow;
for (i32 screenX = xMin; screenX < xMax; screenX += simdWidth_inPixels)
{
//Initial setup variables for SIMD code
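//(these broadcasts are loop-invariant; a compiler will typically hoist them out of the pixel loops)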
__m256 one = _mm256_set1_ps(1.0f);
__m256 zero = _mm256_set1_ps(0.0f);
__m256 imgWidth = _mm256_set1_ps((f32)imageWidth);
__m256 imgHeight = _mm256_set1_ps((f32)imageHeight);
__m256 normalizedXAxis_x = _mm256_set1_ps(normalizedXAxis.x);
__m256 normalizedXAxis_y = _mm256_set1_ps(normalizedXAxis.y);
__m256 normalizedYAxis_x = _mm256_set1_ps(normalizedYAxis.x);
__m256 normalizedYAxis_y = _mm256_set1_ps(normalizedYAxis.y);
__m256 targetRectOrigin_x = _mm256_set1_ps(origin.x);
__m256 targetRectOrigin_y = _mm256_set1_ps(origin.y);
__m256 screenPixelCoords_x = _mm256_set_ps((f32)(screenX + 7), (f32)(screenX + 6), (f32)(screenX + 5), (f32)(screenX + 4), (f32)(screenX + 3), (f32)(screenX + 2), (f32)(screenX + 1), (f32)(screenX + 0));
__m256 screenPixelCoords_y = _mm256_set1_ps((f32)screenY);
__m256 uvRangeForTexture_u = _mm256_set1_ps(image.uvBounds.At(1).u - image.uvBounds.At(0).u);
__m256 uvRangeForTexture_v = _mm256_set1_ps(image.uvBounds.At(1).v - image.uvBounds.At(0).v);
__m256 minUVBounds_u = _mm256_set1_ps(image.uvBounds.At(0).u);
__m256 minUVBounds_v = _mm256_set1_ps(image.uvBounds.At(0).v);
//Compute normalized coordinates (UVs) in order to find the correct texel position below
__m256 dXs = _mm256_sub_ps(screenPixelCoords_x, targetRectOrigin_x);
__m256 dYs = _mm256_sub_ps(screenPixelCoords_y, targetRectOrigin_y);
__m256 Us = _mm256_add_ps(_mm256_mul_ps(dXs, normalizedXAxis_x), _mm256_mul_ps(dYs, normalizedXAxis_y));
__m256 Vs = _mm256_add_ps(_mm256_mul_ps(dXs, normalizedYAxis_x), _mm256_mul_ps(dYs, normalizedYAxis_y));
/* clang-format off */
//Using a mask to determine which colors the final 8-wide destination pixel buffer should accept
//(background texels or image texels). This replaces the need for a conditional
__m256i writeMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(Us, zero, _CMP_GE_OQ),
_mm256_cmp_ps(Us, one, _CMP_LE_OQ)),
_mm256_and_ps(_mm256_cmp_ps(Vs, zero, _CMP_GE_OQ),
_mm256_cmp_ps(Vs, one, _CMP_LE_OQ))));
/* clang-format on */
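//Start with a fully-enabled clip mask; lanes that extend past the screen edge are zeroed below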
__m256i clipMask = _mm256_set1_epi32(0xFFFFFFFF);
//See how far the final 8-pixel-wide dest write extends past the max boundary of the screen
//region (if at all) and zero out the overflowing lanes of the clip mask
if (screenX > ((i32)widthMax - simdWidth_inPixels))
{
i32 diff = (i32)widthMax - (i32)screenX;
i32 amountOfScreenOverflow = simdWidth_inPixels - diff;
i32 index { 7 };
while (amountOfScreenOverflow)
{
clipMask.m256i_u32[index] = 0;
index -= 1;
--amountOfScreenOverflow;
};
};
//Clamp UVs to prevent reading texture memory out of bounds
Us = _mm256_min_ps(_mm256_max_ps(Us, zero), one);
Vs = _mm256_min_ps(_mm256_max_ps(Vs, zero), one);
__m256 textureUs = _mm256_add_ps(minUVBounds_u, _mm256_mul_ps(uvRangeForTexture_u, Us));
__m256 textureVs = _mm256_add_ps(minUVBounds_v, _mm256_mul_ps(uvRangeForTexture_v, Vs));
__m256 texelCoords_x = _mm256_mul_ps(textureUs, imgWidth);
__m256 texelCoords_y = _mm256_mul_ps(textureVs, imgHeight);
__m256i sampleTexelAs {}, sampleTexelBs {}, sampleTexelCs {}, sampleTexelDs {};
for (i32 index {}; index < 8; ++index)
{
BGZ_ASSERT((texelCoords_x.m256_f32[index] >= 0) && (texelCoords_x.m256_f32[index] <= (i32)image.size.width), "x coord is out of range!");
BGZ_ASSERT((texelCoords_y.m256_f32[index] >= 0) && (texelCoords_y.m256_f32[index] <= (i32)image.size.height), "y coord is out of range!");
//Fetch the 2x2 texel quad for bilinear sampling, starting from the computed texel pointer
ui8* texelPtr = ((ui8*)image.colorData) + ((ui32)texelCoords_y.m256_f32[index] * image.pitch_pxls) + ((ui32)texelCoords_x.m256_f32[index] * sizeof(ui32)); //note: pitch_pxls is applied as a byte stride here
sampleTexelAs.m256i_u32[index] = *(ui32*)(texelPtr);
sampleTexelBs.m256i_u32[index] = *(ui32*)(texelPtr + sizeof(ui32));
sampleTexelCs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls);
sampleTexelDs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls + sizeof(ui32));
};
#if __AVX2__
//Unpack 4 sample texels to prepare for bilinear blend
__m256i maskFF = _mm256_set1_epi32(0xFF);
__m256 texelA_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelAs, maskFF));
__m256 texelA_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 8), maskFF));
__m256 texelA_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 16), maskFF));
__m256 texelA_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 24), maskFF));
__m256 texelB_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelBs, maskFF));
__m256 texelB_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 8), maskFF));
__m256 texelB_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 16), maskFF));
__m256 texelB_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 24), maskFF));
__m256 texelC_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelCs, maskFF));
__m256 texelC_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 8), maskFF));
__m256 texelC_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 16), maskFF));
__m256 texelC_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 24), maskFF));
__m256 texelD_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelDs, maskFF));
__m256 texelD_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 8), maskFF));
__m256 texelD_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 16), maskFF));
__m256 texelD_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 24), maskFF));
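//NOTE: this aligned load assumes the dest row start is 32-byte aligned (xMin is rounded down to an 8-pixel boundary); use _mm256_loadu_si256 if the buffer cannot guarantee this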
__m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
__m256 backgroundColors_b = _mm256_cvtepi32_ps(_mm256_and_si256(backGroundPixels, maskFF));
__m256 backgroundColors_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 8), maskFF));
__m256 backgroundColors_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 16), maskFF));
__m256 backgroundColors_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 24), maskFF));
#elif __AVX__
//Unpack 4 sample texels to prepare for bilinear blend
__m256i* ptrToSampleTexelAs = &sampleTexelAs;
__m256 texelA_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 28), (f32) * ((ui8*)ptrToSampleTexelAs + 24), (f32) * ((ui8*)ptrToSampleTexelAs + 20), (f32) * ((ui8*)ptrToSampleTexelAs + 16), (f32) * ((ui8*)ptrToSampleTexelAs + 12), (f32) * ((ui8*)ptrToSampleTexelAs + 8), (f32) * ((ui8*)ptrToSampleTexelAs + 4), (f32) * ((ui8*)ptrToSampleTexelAs + 0));
__m256 texelA_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 29), (f32) * ((ui8*)ptrToSampleTexelAs + 25), (f32) * ((ui8*)ptrToSampleTexelAs + 21), (f32) * ((ui8*)ptrToSampleTexelAs + 17), (f32) * ((ui8*)ptrToSampleTexelAs + 13), (f32) * ((ui8*)ptrToSampleTexelAs + 9), (f32) * ((ui8*)ptrToSampleTexelAs + 5), (f32) * ((ui8*)ptrToSampleTexelAs + 1));
__m256 texelA_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 30), (f32) * ((ui8*)ptrToSampleTexelAs + 26), (f32) * ((ui8*)ptrToSampleTexelAs + 22), (f32) * ((ui8*)ptrToSampleTexelAs + 18), (f32) * ((ui8*)ptrToSampleTexelAs + 14), (f32) * ((ui8*)ptrToSampleTexelAs + 10), (f32) * ((ui8*)ptrToSampleTexelAs + 6), (f32) * ((ui8*)ptrToSampleTexelAs + 2));
__m256 texelA_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 31), (f32) * ((ui8*)ptrToSampleTexelAs + 27), (f32) * ((ui8*)ptrToSampleTexelAs + 23), (f32) * ((ui8*)ptrToSampleTexelAs + 19), (f32) * ((ui8*)ptrToSampleTexelAs + 15), (f32) * ((ui8*)ptrToSampleTexelAs + 11), (f32) * ((ui8*)ptrToSampleTexelAs + 7), (f32) * ((ui8*)ptrToSampleTexelAs + 3));
__m256i* ptrToSampleTexelBs = &sampleTexelBs;
__m256 texelB_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 28), (f32) * ((ui8*)ptrToSampleTexelBs + 24), (f32) * ((ui8*)ptrToSampleTexelBs + 20), (f32) * ((ui8*)ptrToSampleTexelBs + 16), (f32) * ((ui8*)ptrToSampleTexelBs + 12), (f32) * ((ui8*)ptrToSampleTexelBs + 8), (f32) * ((ui8*)ptrToSampleTexelBs + 4), (f32) * ((ui8*)ptrToSampleTexelBs + 0));
__m256 texelB_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 29), (f32) * ((ui8*)ptrToSampleTexelBs + 25), (f32) * ((ui8*)ptrToSampleTexelBs + 21), (f32) * ((ui8*)ptrToSampleTexelBs + 17), (f32) * ((ui8*)ptrToSampleTexelBs + 13), (f32) * ((ui8*)ptrToSampleTexelBs + 9), (f32) * ((ui8*)ptrToSampleTexelBs + 5), (f32) * ((ui8*)ptrToSampleTexelBs + 1));
__m256 texelB_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 30), (f32) * ((ui8*)ptrToSampleTexelBs + 26), (f32) * ((ui8*)ptrToSampleTexelBs + 22), (f32) * ((ui8*)ptrToSampleTexelBs + 18), (f32) * ((ui8*)ptrToSampleTexelBs + 14), (f32) * ((ui8*)ptrToSampleTexelBs + 10), (f32) * ((ui8*)ptrToSampleTexelBs + 6), (f32) * ((ui8*)ptrToSampleTexelBs + 2));
__m256 texelB_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 31), (f32) * ((ui8*)ptrToSampleTexelBs + 27), (f32) * ((ui8*)ptrToSampleTexelBs + 23), (f32) * ((ui8*)ptrToSampleTexelBs + 19), (f32) * ((ui8*)ptrToSampleTexelBs + 15), (f32) * ((ui8*)ptrToSampleTexelBs + 11), (f32) * ((ui8*)ptrToSampleTexelBs + 7), (f32) * ((ui8*)ptrToSampleTexelBs + 3));
__m256i* ptrToSampleTexelCs = &sampleTexelCs;
__m256 texelC_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 28), (f32) * ((ui8*)ptrToSampleTexelCs + 24), (f32) * ((ui8*)ptrToSampleTexelCs + 20), (f32) * ((ui8*)ptrToSampleTexelCs + 16), (f32) * ((ui8*)ptrToSampleTexelCs + 12), (f32) * ((ui8*)ptrToSampleTexelCs + 8), (f32) * ((ui8*)ptrToSampleTexelCs + 4), (f32) * ((ui8*)ptrToSampleTexelCs + 0));
__m256 texelC_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 29), (f32) * ((ui8*)ptrToSampleTexelCs + 25), (f32) * ((ui8*)ptrToSampleTexelCs + 21), (f32) * ((ui8*)ptrToSampleTexelCs + 17), (f32) * ((ui8*)ptrToSampleTexelCs + 13), (f32) * ((ui8*)ptrToSampleTexelCs + 9), (f32) * ((ui8*)ptrToSampleTexelCs + 5), (f32) * ((ui8*)ptrToSampleTexelCs + 1));
__m256 texelC_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 30), (f32) * ((ui8*)ptrToSampleTexelCs + 26), (f32) * ((ui8*)ptrToSampleTexelCs + 22), (f32) * ((ui8*)ptrToSampleTexelCs + 18), (f32) * ((ui8*)ptrToSampleTexelCs + 14), (f32) * ((ui8*)ptrToSampleTexelCs + 10), (f32) * ((ui8*)ptrToSampleTexelCs + 6), (f32) * ((ui8*)ptrToSampleTexelCs + 2));
__m256 texelC_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 31), (f32) * ((ui8*)ptrToSampleTexelCs + 27), (f32) * ((ui8*)ptrToSampleTexelCs + 23), (f32) * ((ui8*)ptrToSampleTexelCs + 19), (f32) * ((ui8*)ptrToSampleTexelCs + 15), (f32) * ((ui8*)ptrToSampleTexelCs + 11), (f32) * ((ui8*)ptrToSampleTexelCs + 7), (f32) * ((ui8*)ptrToSampleTexelCs + 3));
__m256i* ptrToSampleTexelDs = &sampleTexelDs;
__m256 texelD_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 28), (f32) * ((ui8*)ptrToSampleTexelDs + 24), (f32) * ((ui8*)ptrToSampleTexelDs + 20), (f32) * ((ui8*)ptrToSampleTexelDs + 16), (f32) * ((ui8*)ptrToSampleTexelDs + 12), (f32) * ((ui8*)ptrToSampleTexelDs + 8), (f32) * ((ui8*)ptrToSampleTexelDs + 4), (f32) * ((ui8*)ptrToSampleTexelDs + 0));
__m256 texelD_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 29), (f32) * ((ui8*)ptrToSampleTexelDs + 25), (f32) * ((ui8*)ptrToSampleTexelDs + 21), (f32) * ((ui8*)ptrToSampleTexelDs + 17), (f32) * ((ui8*)ptrToSampleTexelDs + 13), (f32) * ((ui8*)ptrToSampleTexelDs + 9), (f32) * ((ui8*)ptrToSampleTexelDs + 5), (f32) * ((ui8*)ptrToSampleTexelDs + 1));
__m256 texelD_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 30), (f32) * ((ui8*)ptrToSampleTexelDs + 26), (f32) * ((ui8*)ptrToSampleTexelDs + 22), (f32) * ((ui8*)ptrToSampleTexelDs + 18), (f32) * ((ui8*)ptrToSampleTexelDs + 14), (f32) * ((ui8*)ptrToSampleTexelDs + 10), (f32) * ((ui8*)ptrToSampleTexelDs + 6), (f32) * ((ui8*)ptrToSampleTexelDs + 2));
__m256 texelD_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 31), (f32) * ((ui8*)ptrToSampleTexelDs + 27), (f32) * ((ui8*)ptrToSampleTexelDs + 23), (f32) * ((ui8*)ptrToSampleTexelDs + 19), (f32) * ((ui8*)ptrToSampleTexelDs + 15), (f32) * ((ui8*)ptrToSampleTexelDs + 11), (f32) * ((ui8*)ptrToSampleTexelDs + 7), (f32) * ((ui8*)ptrToSampleTexelDs + 3));
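//NOTE: the same 32-byte alignment assumption as the AVX2 path applies to this load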
__m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
__m256i* ptrToBackgroundPixels = &backGroundPixels;
__m256 backgroundColors_b = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 28), (f32) * ((ui8*)ptrToBackgroundPixels + 24), (f32) * ((ui8*)ptrToBackgroundPixels + 20), (f32) * ((ui8*)ptrToBackgroundPixels + 16), (f32) * ((ui8*)ptrToBackgroundPixels + 12), (f32) * ((ui8*)ptrToBackgroundPixels + 8), (f32) * ((ui8*)ptrToBackgroundPixels + 4), (f32) * ((ui8*)ptrToBackgroundPixels + 0));
__m256 backgroundColors_g = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 29), (f32) * ((ui8*)ptrToBackgroundPixels + 25), (f32) * ((ui8*)ptrToBackgroundPixels + 21), (f32) * ((ui8*)ptrToBackgroundPixels + 17), (f32) * ((ui8*)ptrToBackgroundPixels + 13), (f32) * ((ui8*)ptrToBackgroundPixels + 9), (f32) * ((ui8*)ptrToBackgroundPixels + 5), (f32) * ((ui8*)ptrToBackgroundPixels + 1));
__m256 backgroundColors_r = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 30), (f32) * ((ui8*)ptrToBackgroundPixels + 26), (f32) * ((ui8*)ptrToBackgroundPixels + 22), (f32) * ((ui8*)ptrToBackgroundPixels + 18), (f32) * ((ui8*)ptrToBackgroundPixels + 14), (f32) * ((ui8*)ptrToBackgroundPixels + 10), (f32) * ((ui8*)ptrToBackgroundPixels + 6), (f32) * ((ui8*)ptrToBackgroundPixels + 2));
__m256 backgroundColors_a = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 31), (f32) * ((ui8*)ptrToBackgroundPixels + 27), (f32) * ((ui8*)ptrToBackgroundPixels + 23), (f32) * ((ui8*)ptrToBackgroundPixels + 19), (f32) * ((ui8*)ptrToBackgroundPixels + 15), (f32) * ((ui8*)ptrToBackgroundPixels + 11), (f32) * ((ui8*)ptrToBackgroundPixels + 7), (f32) * ((ui8*)ptrToBackgroundPixels + 3));
#endif
//Bilinear blend
__m256 percentToLerpInX = _mm256_sub_ps(texelCoords_x, _mm256_floor_ps(texelCoords_x));
__m256 percentToLerpInY = _mm256_sub_ps(texelCoords_y, _mm256_floor_ps(texelCoords_y));
__m256 oneMinusXLerp = _mm256_sub_ps(one, percentToLerpInX);
__m256 oneMinusYLerp = _mm256_sub_ps(one, percentToLerpInY);
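//Corner weights for the 2x2 quad: A=(1-y)(1-x), B=(1-y)x, C=y(1-x), D=yx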
__m256 coefficient1 = _mm256_mul_ps(oneMinusYLerp, oneMinusXLerp);
__m256 coefficient2 = _mm256_mul_ps(oneMinusYLerp, percentToLerpInX);
__m256 coefficient3 = _mm256_mul_ps(percentToLerpInY, oneMinusXLerp);
__m256 coefficient4 = _mm256_mul_ps(percentToLerpInY, percentToLerpInX);
__m256 newBlendedTexel_r = _mm256_add_ps(
_mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_r), _mm256_mul_ps(coefficient2, texelB_r)),
_mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_r), _mm256_mul_ps(coefficient4, texelD_r)));
__m256 newBlendedTexel_g = _mm256_add_ps(
_mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_g), _mm256_mul_ps(coefficient2, texelB_g)),
_mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_g), _mm256_mul_ps(coefficient4, texelD_g)));
__m256 newBlendedTexel_b = _mm256_add_ps(
_mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_b), _mm256_mul_ps(coefficient2, texelB_b)),
_mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_b), _mm256_mul_ps(coefficient4, texelD_b)));
__m256 newBlendedTexel_a = _mm256_add_ps(
_mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_a), _mm256_mul_ps(coefficient2, texelB_a)),
_mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_a), _mm256_mul_ps(coefficient4, texelD_a)));
//Linear blend (with premultiplied alpha)
__m256 maxColorValue = _mm256_set1_ps(255.0f);
__m256 alphaBlend = _mm256_div_ps(newBlendedTexel_a, maxColorValue);
__m256 oneMinusAlphaBlend = _mm256_sub_ps(one, alphaBlend);
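//Texel colors are premultiplied by alpha, so the source term is added directly: dest*(1 - srcA) + src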
__m256 finalBlendedColor_r = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_r), newBlendedTexel_r);
__m256 finalBlendedColor_g = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_g), newBlendedTexel_g);
__m256 finalBlendedColor_b = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_b), newBlendedTexel_b);
__m256 finalBlendedColor_a = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_a), newBlendedTexel_a);
#if __AVX2__
{ //Convert and Pack into dest pixels to write out
__m256i finalBlendedColori_r = _mm256_cvtps_epi32(finalBlendedColor_r);
__m256i finalBlendedColori_g = _mm256_cvtps_epi32(finalBlendedColor_g);
__m256i finalBlendedColori_b = _mm256_cvtps_epi32(finalBlendedColor_b);
__m256i finalBlendedColori_a = _mm256_cvtps_epi32(finalBlendedColor_a);
//Move pixels (through bitwise shifts and ORs) from separate R, G, B, A planes into the expected packed BGRA format
__m256i out = _mm256_or_si256(_mm256_or_si256(_mm256_or_si256(_mm256_slli_epi32(finalBlendedColori_r, 16), _mm256_slli_epi32(finalBlendedColori_g, 8)), finalBlendedColori_b), _mm256_slli_epi32(finalBlendedColori_a, 24));
//Use the write mask to fill the 8-wide pixel lane correctly (selecting either the texel color or
//the background color per lane)
__m256i maskedOut = _mm256_or_si256(_mm256_and_si256(writeMask, out),
_mm256_andnot_si256(writeMask, backGroundPixels));
maskedOut = _mm256_or_si256(_mm256_and_si256(clipMask, maskedOut),
_mm256_andnot_si256(clipMask, *(__m256i*)destPixel));
*(__m256i*)destPixel = maskedOut;
};
#elif __AVX__
{ //Convert and Pack into dest pixels to write out
__m256i finalBlendedColori_r = _mm256_cvtps_epi32(finalBlendedColor_r);
__m256i finalBlendedColori_g = _mm256_cvtps_epi32(finalBlendedColor_g);
__m256i finalBlendedColori_b = _mm256_cvtps_epi32(finalBlendedColor_b);
__m256i finalBlendedColori_a = _mm256_cvtps_epi32(finalBlendedColor_a);
__m256i backgroundColorsi_r = _mm256_cvtps_epi32(backgroundColors_r);
__m256i backgroundColorsi_g = _mm256_cvtps_epi32(backgroundColors_g);
__m256i backgroundColorsi_b = _mm256_cvtps_epi32(backgroundColors_b);
__m256i backgroundColorsi_a = _mm256_cvtps_epi32(backgroundColors_a);
//AVX (without AVX2) lacks 256-bit integer shift/bitwise operations, so extract 128-bit halves
//from the 256-bit values and use the SSE2 integer operations on those
__m128i pixelSet1_r = _mm256_extractf128_si256(finalBlendedColori_r, 0);
__m128i pixelSet2_r = _mm256_extractf128_si256(finalBlendedColori_r, 1);
__m128i pixelSet1_g = _mm256_extractf128_si256(finalBlendedColori_g, 0);
__m128i pixelSet2_g = _mm256_extractf128_si256(finalBlendedColori_g, 1);
__m128i pixelSet1_b = _mm256_extractf128_si256(finalBlendedColori_b, 0);
__m128i pixelSet2_b = _mm256_extractf128_si256(finalBlendedColori_b, 1);
__m128i pixelSet1_a = _mm256_extractf128_si256(finalBlendedColori_a, 0);
__m128i pixelSet2_a = _mm256_extractf128_si256(finalBlendedColori_a, 1);
__m128i backgroundPixelSet1_r = _mm256_extractf128_si256(backgroundColorsi_r, 0);
__m128i backgroundPixelSet2_r = _mm256_extractf128_si256(backgroundColorsi_r, 1);
__m128i backgroundPixelSet1_g = _mm256_extractf128_si256(backgroundColorsi_g, 0);
__m128i backgroundPixelSet2_g = _mm256_extractf128_si256(backgroundColorsi_g, 1);
__m128i backgroundPixelSet1_b = _mm256_extractf128_si256(backgroundColorsi_b, 0);
__m128i backgroundPixelSet2_b = _mm256_extractf128_si256(backgroundColorsi_b, 1);
__m128i backgroundPixelSet1_a = _mm256_extractf128_si256(backgroundColorsi_a, 0);
__m128i backgroundPixelSet2_a = _mm256_extractf128_si256(backgroundColorsi_a, 1);
__m128i writeMaskSet1 = _mm256_extractf128_si256(writeMask, 0);
__m128i writeMaskSet2 = _mm256_extractf128_si256(writeMask, 1);
__m128i clipMaskSet1 = _mm256_extractf128_si256(clipMask, 0);
__m128i clipMaskSet2 = _mm256_extractf128_si256(clipMask, 1);
//Move pixels (through bitwise shifts and ORs) from separate R, G, B, A planes into the expected packed BGRA format
__m128i pixels1Through4 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(pixelSet1_r, 16), _mm_slli_epi32(pixelSet1_g, 8)), pixelSet1_b), _mm_slli_epi32(pixelSet1_a, 24));
__m128i pixels5Through8 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(pixelSet2_r, 16), _mm_slli_epi32(pixelSet2_g, 8)), pixelSet2_b), _mm_slli_epi32(pixelSet2_a, 24));
__m128i backgroundPixels1Through4 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(backgroundPixelSet1_r, 16), _mm_slli_epi32(backgroundPixelSet1_g, 8)), backgroundPixelSet1_b), _mm_slli_epi32(backgroundPixelSet1_a, 24));
__m128i backgroundPixels5Through8 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(backgroundPixelSet2_r, 16), _mm_slli_epi32(backgroundPixelSet2_g, 8)), backgroundPixelSet2_b), _mm_slli_epi32(backgroundPixelSet2_a, 24));
//Use the write mask to fill the 8-wide pixel lane correctly (selecting either the texel color or
//the background color per lane)
__m128i maskedOutSet1 = _mm_or_si128(_mm_and_si128(writeMaskSet1, pixels1Through4),
_mm_andnot_si128(writeMaskSet1, backgroundPixels1Through4));
__m128i maskedOutSet2 = _mm_or_si128(_mm_and_si128(writeMaskSet2, pixels5Through8),
_mm_andnot_si128(writeMaskSet2, backgroundPixels5Through8));
maskedOutSet1 = _mm_or_si128(_mm_and_si128(clipMaskSet1, maskedOutSet1),
_mm_andnot_si128(clipMaskSet1, *(__m128i*)destPixel));
maskedOutSet2 = _mm_or_si128(_mm_and_si128(clipMaskSet2, maskedOutSet2),
_mm_andnot_si128(clipMaskSet2, _mm256_extractf128_si256(*(__m256i*)destPixel, 1)));
//Pack 128 bit pixel values back into 256 bit values to write out
__m256i maskedOut = _mm256_castsi128_si256(maskedOutSet1);
maskedOut = _mm256_insertf128_si256(maskedOut, maskedOutSet2, 1);
*(__m256i*)destPixel = maskedOut;
};
#endif
destPixel += 8;
};
currentRow += colorBufferPitch;
};
};