Further optimizing the software renderer

So I recently went through the optimization of the software renderer, finishing the multi-threaded portion. It seems Casey moves on to other things from here. I was wondering what kinds of things I should do moving forward to continue optimizing the software renderer. I think Casey discussed some ways to avoid packing and unpacking pixels in the way we are currently rendering things, by supplying pixel data in a more SIMD-friendly format (RRRR GGGG BBBB instead of BGRA BGRA ..., etc.), though I'm not really sure how that would help (I have tried to think of some ways to do this and haven't come up with any good solutions yet). My current software rendering routine looks like this:

#include <immintrin.h>
void DrawTexture_Optimized(ui32* colorBufferData, v2i colorBufferSize, i32 colorBufferPitch, Quadf targetRect_screenCoords, RenderEntry_Texture image, Rectf clipRect)
{
    v2f origin = targetRect_screenCoords.bottomLeft;
    v2f targetRectXAxis = targetRect_screenCoords.bottomRight - targetRect_screenCoords.bottomLeft;
    v2f targetRectYAxis = targetRect_screenCoords.topLeft - targetRect_screenCoords.bottomLeft;

    i32 widthMax = (i32)clipRect.max.x;
    i32 heightMax = (i32)clipRect.max.y;

    i32 xMin = widthMax;
    i32 xMax = (i32)clipRect.min.x;
    i32 yMin = heightMax;
    i32 yMax = (i32)clipRect.min.y;

    { //Optimization to avoid iterating over every pixel on the screen - HH ep 92
        Array<v2f, 4> vecs = { origin, origin + targetRectXAxis, origin + targetRectXAxis + targetRectYAxis, origin + targetRectYAxis };
        for (i32 vecIndex = 0; vecIndex < vecs.Size(); ++vecIndex)
        {
            v2f testVec = vecs.At(vecIndex);
            i32 flooredX = FloorF32ToI32(testVec.x);
            i32 ceiledX = CeilF32ToI32(testVec.x);
            i32 flooredY = FloorF32ToI32(testVec.y);
            i32 ceiledY = CeilF32ToI32(testVec.y);

            if (xMin > flooredX)
                xMin = flooredX;
            if (yMin > flooredY)
                yMin = flooredY;
            if (xMax < ceiledX)
                xMax = ceiledX;
            if (yMax < ceiledY)
                yMax = ceiledY;
        }

        if (xMin < (i32)clipRect.min.x)
            xMin = (i32)clipRect.min.x;
        if (yMin < clipRect.min.y)
            yMin = (i32)clipRect.min.y;
        if (xMax > widthMax)
            xMax = widthMax;
        if (yMax > heightMax)
            yMax = heightMax;
    };

    i32 simdWidth_inBytes = 8;

    //Align xMin down to an 8-pixel boundary
    if ((xMin % simdWidth_inBytes) != 0)
        xMin = (i32)RoundDown((sizet)xMin, simdWidth_inBytes);

    //Pre-calculations for optimization
    f32 invertedXAxisSqd = 1.0f / MagnitudeSqd(targetRectXAxis);
    f32 invertedYAxisSqd = 1.0f / MagnitudeSqd(targetRectYAxis);
    i32 imageWidth = image.size.width - 3;
    i32 imageHeight = image.size.height - 3;
    v2f normalizedXAxis = invertedXAxisSqd * targetRectXAxis;
    v2f normalizedYAxis = invertedYAxisSqd * targetRectYAxis;

    i32 sizeOfPixel_inBytes = 4;
    ui8* currentRow = (ui8*)colorBufferData + (i32)xMin * sizeOfPixel_inBytes + (i32)yMin * colorBufferPitch;
    for (i32 screenY = yMin; screenY < yMax; ++screenY)
    {
        ui32* destPixel = (ui32*)currentRow;
        for (i32 screenX = xMin; screenX < xMax; screenX += simdWidth_inBytes)
        {
            //Initial setup variables for SIMD code
            __m256 one = _mm256_set1_ps(1.0f);
            __m256 zero = _mm256_set1_ps(0.0f);
            __m256 imgWidth = _mm256_set1_ps((f32)imageWidth);
            __m256 imgHeight = _mm256_set1_ps((f32)imageHeight);
            __m256 normalizedXAxis_x = _mm256_set1_ps(normalizedXAxis.x);
            __m256 normalizedXAxis_y = _mm256_set1_ps(normalizedXAxis.y);
            __m256 normalizedYAxis_x = _mm256_set1_ps(normalizedYAxis.x);
            __m256 normalizedYAxis_y = _mm256_set1_ps(normalizedYAxis.y);
            __m256 targetRectOrigin_x = _mm256_set1_ps(origin.x);
            __m256 targetRectOrigin_y = _mm256_set1_ps(origin.y);

            __m256 screenPixelCoords_x = _mm256_set_ps((f32)(screenX + 7), (f32)(screenX + 6), (f32)(screenX + 5), (f32)(screenX + 4), (f32)(screenX + 3), (f32)(screenX + 2), (f32)(screenX + 1), (f32)(screenX + 0));
            __m256 screenPixelCoords_y = _mm256_set1_ps((f32)screenY);

            __m256 uvRangeForTexture_u = _mm256_set1_ps(image.uvBounds.At(1).u - image.uvBounds.At(0).u);
            __m256 uvRangeForTexture_v = _mm256_set1_ps(image.uvBounds.At(1).v - image.uvBounds.At(0).v);

            __m256 minUVBounds_u = _mm256_set1_ps(image.uvBounds.At(0).u);
            __m256 minUVBounds_v = _mm256_set1_ps(image.uvBounds.At(0).v);

            //Gather normalized coordinates (uv's) in order to find the correct texel position below
            __m256 dXs = _mm256_sub_ps(screenPixelCoords_x, targetRectOrigin_x);
            __m256 dYs = _mm256_sub_ps(screenPixelCoords_y, targetRectOrigin_y);
            __m256 Us = _mm256_add_ps(_mm256_mul_ps(dXs, normalizedXAxis_x), _mm256_mul_ps(dYs, normalizedXAxis_y));
            __m256 Vs = _mm256_add_ps(_mm256_mul_ps(dXs, normalizedYAxis_x), _mm256_mul_ps(dYs, normalizedYAxis_y));

            /* clang-format off */
            //Using a mask to determine which colors the final 8-wide destination pixel buffer should accept
            //(background texels or image texels). This replaces the need for a conditional
            __m256i writeMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(Us, zero, _CMP_GE_OQ),
                                                                  _mm256_cmp_ps(Us, one, _CMP_LE_OQ)),
                                                                  _mm256_and_ps(_mm256_cmp_ps(Vs, zero, _CMP_GE_OQ),
                                                                  _mm256_cmp_ps(Vs, one, _CMP_LE_OQ))));
            /* clang-format on */

            __m256i clipMask = _mm256_set1_epi32(0xFFFFFFFF);

            //See how much the final 8-wide dest pixel buffer will extend past the max boundary of the screen region (if at all)
            //and adjust it
            if (screenX > ((i32)widthMax - simdWidth_inBytes))
            {
                i32 diff = (i32)widthMax - (i32)screenX;
                i32 amountOfScreenOverflow = simdWidth_inBytes - diff;

                i32 index { 7 };
                while (amountOfScreenOverflow)
                {
                    clipMask.m256i_u32[index] = 0;
                    index -= 1;
                    --amountOfScreenOverflow;
                };
            };

            //Clamp UVs to prevent accessing memory that is invalid
            Us = _mm256_min_ps(_mm256_max_ps(Us, zero), one);
            Vs = _mm256_min_ps(_mm256_max_ps(Vs, zero), one);

            __m256 textureUs = _mm256_add_ps(minUVBounds_u, _mm256_mul_ps(uvRangeForTexture_u, Us));
            __m256 textureVs = _mm256_add_ps(minUVBounds_v, _mm256_mul_ps(uvRangeForTexture_v, Vs));

            __m256 texelCoords_x = _mm256_mul_ps(textureUs, imgWidth);
            __m256 texelCoords_y = _mm256_mul_ps(textureVs, imgHeight);

            __m256i sampleTexelAs {}, sampleTexelBs {}, sampleTexelCs {}, sampleTexelDs {};
            for (i32 index {}; index < 8; ++index)
            {
                BGZ_ASSERT((texelCoords_x.m256_f32[index] >= 0) && (texelCoords_x.m256_f32[index] <= (i32)image.size.width), "x coord is out of range!: ");
                BGZ_ASSERT((texelCoords_y.m256_f32[index] >= 0) && (texelCoords_y.m256_f32[index] <= (i32)image.size.height), "y coord is out of range!");

                //Gather 4 texels (in a square pattern) from certain texel Ptr
                ui8* texelPtr = ((ui8*)image.colorData) + ((ui32)texelCoords_y.m256_f32[index] * image.pitch_pxls) + ((ui32)texelCoords_x.m256_f32[index] * sizeof(ui32)); //size of pixel
                sampleTexelAs.m256i_u32[index] = *(ui32*)(texelPtr);
                sampleTexelBs.m256i_u32[index] = *(ui32*)(texelPtr + sizeof(ui32));
                sampleTexelCs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls);
                sampleTexelDs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls + sizeof(ui32));
            };

#if __AVX2__
            //Unpack 4 sample texels to prepare for bilinear blend
            __m256i maskFF = _mm256_set1_epi32(0xFF);
            __m256 texelA_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelAs, maskFF));
            __m256 texelA_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 8), maskFF));
            __m256 texelA_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 16), maskFF));
            __m256 texelA_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 24), maskFF));
            __m256 texelB_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelBs, maskFF));
            __m256 texelB_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 8), maskFF));
            __m256 texelB_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 16), maskFF));
            __m256 texelB_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 24), maskFF));
            __m256 texelC_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelCs, maskFF));
            __m256 texelC_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 8), maskFF));
            __m256 texelC_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 16), maskFF));
            __m256 texelC_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 24), maskFF));
            __m256 texelD_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelDs, maskFF));
            __m256 texelD_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 8), maskFF));
            __m256 texelD_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 16), maskFF));
            __m256 texelD_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 24), maskFF));
            __m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
            __m256 backgroundColors_b = _mm256_cvtepi32_ps(_mm256_and_si256(backGroundPixels, maskFF));
            __m256 backgroundColors_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 8), maskFF));
            __m256 backgroundColors_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 16), maskFF));
            __m256 backgroundColors_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 24), maskFF));

#elif __AVX__
            //Unpack 4 sample texels to prepare for bilinear blend
            __m256i* ptrToSampleTexelAs = &sampleTexelAs;
            __m256 texelA_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 28), (f32) * ((ui8*)ptrToSampleTexelAs + 24), (f32) * ((ui8*)ptrToSampleTexelAs + 20), (f32) * ((ui8*)ptrToSampleTexelAs + 16), (f32) * ((ui8*)ptrToSampleTexelAs + 12), (f32) * ((ui8*)ptrToSampleTexelAs + 8), (f32) * ((ui8*)ptrToSampleTexelAs + 4), (f32) * ((ui8*)ptrToSampleTexelAs + 0));
            __m256 texelA_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 29), (f32) * ((ui8*)ptrToSampleTexelAs + 25), (f32) * ((ui8*)ptrToSampleTexelAs + 21), (f32) * ((ui8*)ptrToSampleTexelAs + 17), (f32) * ((ui8*)ptrToSampleTexelAs + 13), (f32) * ((ui8*)ptrToSampleTexelAs + 9), (f32) * ((ui8*)ptrToSampleTexelAs + 5), (f32) * ((ui8*)ptrToSampleTexelAs + 1));
            __m256 texelA_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 30), (f32) * ((ui8*)ptrToSampleTexelAs + 26), (f32) * ((ui8*)ptrToSampleTexelAs + 22), (f32) * ((ui8*)ptrToSampleTexelAs + 18), (f32) * ((ui8*)ptrToSampleTexelAs + 14), (f32) * ((ui8*)ptrToSampleTexelAs + 10), (f32) * ((ui8*)ptrToSampleTexelAs + 6), (f32) * ((ui8*)ptrToSampleTexelAs + 2));
            __m256 texelA_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 31), (f32) * ((ui8*)ptrToSampleTexelAs + 27), (f32) * ((ui8*)ptrToSampleTexelAs + 23), (f32) * ((ui8*)ptrToSampleTexelAs + 19), (f32) * ((ui8*)ptrToSampleTexelAs + 15), (f32) * ((ui8*)ptrToSampleTexelAs + 11), (f32) * ((ui8*)ptrToSampleTexelAs + 7), (f32) * ((ui8*)ptrToSampleTexelAs + 3));

            __m256i* ptrToSampleTexelBs = &sampleTexelBs;
            __m256 texelB_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 28), (f32) * ((ui8*)ptrToSampleTexelBs + 24), (f32) * ((ui8*)ptrToSampleTexelBs + 20), (f32) * ((ui8*)ptrToSampleTexelBs + 16), (f32) * ((ui8*)ptrToSampleTexelBs + 12), (f32) * ((ui8*)ptrToSampleTexelBs + 8), (f32) * ((ui8*)ptrToSampleTexelBs + 4), (f32) * ((ui8*)ptrToSampleTexelBs + 0));
            __m256 texelB_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 29), (f32) * ((ui8*)ptrToSampleTexelBs + 25), (f32) * ((ui8*)ptrToSampleTexelBs + 21), (f32) * ((ui8*)ptrToSampleTexelBs + 17), (f32) * ((ui8*)ptrToSampleTexelBs + 13), (f32) * ((ui8*)ptrToSampleTexelBs + 9), (f32) * ((ui8*)ptrToSampleTexelBs + 5), (f32) * ((ui8*)ptrToSampleTexelBs + 1));
            __m256 texelB_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 30), (f32) * ((ui8*)ptrToSampleTexelBs + 26), (f32) * ((ui8*)ptrToSampleTexelBs + 22), (f32) * ((ui8*)ptrToSampleTexelBs + 18), (f32) * ((ui8*)ptrToSampleTexelBs + 14), (f32) * ((ui8*)ptrToSampleTexelBs + 10), (f32) * ((ui8*)ptrToSampleTexelBs + 6), (f32) * ((ui8*)ptrToSampleTexelBs + 2));
            __m256 texelB_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 31), (f32) * ((ui8*)ptrToSampleTexelBs + 27), (f32) * ((ui8*)ptrToSampleTexelBs + 23), (f32) * ((ui8*)ptrToSampleTexelBs + 19), (f32) * ((ui8*)ptrToSampleTexelBs + 15), (f32) * ((ui8*)ptrToSampleTexelBs + 11), (f32) * ((ui8*)ptrToSampleTexelBs + 7), (f32) * ((ui8*)ptrToSampleTexelBs + 3));

            __m256i* ptrToSampleTexelCs = &sampleTexelCs;
            __m256 texelC_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 28), (f32) * ((ui8*)ptrToSampleTexelCs + 24), (f32) * ((ui8*)ptrToSampleTexelCs + 20), (f32) * ((ui8*)ptrToSampleTexelCs + 16), (f32) * ((ui8*)ptrToSampleTexelCs + 12), (f32) * ((ui8*)ptrToSampleTexelCs + 8), (f32) * ((ui8*)ptrToSampleTexelCs + 4), (f32) * ((ui8*)ptrToSampleTexelCs + 0));
            __m256 texelC_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 29), (f32) * ((ui8*)ptrToSampleTexelCs + 25), (f32) * ((ui8*)ptrToSampleTexelCs + 21), (f32) * ((ui8*)ptrToSampleTexelCs + 17), (f32) * ((ui8*)ptrToSampleTexelCs + 13), (f32) * ((ui8*)ptrToSampleTexelCs + 9), (f32) * ((ui8*)ptrToSampleTexelCs + 5), (f32) * ((ui8*)ptrToSampleTexelCs + 1));
            __m256 texelC_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 30), (f32) * ((ui8*)ptrToSampleTexelCs + 26), (f32) * ((ui8*)ptrToSampleTexelCs + 22), (f32) * ((ui8*)ptrToSampleTexelCs + 18), (f32) * ((ui8*)ptrToSampleTexelCs + 14), (f32) * ((ui8*)ptrToSampleTexelCs + 10), (f32) * ((ui8*)ptrToSampleTexelCs + 6), (f32) * ((ui8*)ptrToSampleTexelCs + 2));
            __m256 texelC_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 31), (f32) * ((ui8*)ptrToSampleTexelCs + 27), (f32) * ((ui8*)ptrToSampleTexelCs + 23), (f32) * ((ui8*)ptrToSampleTexelCs + 19), (f32) * ((ui8*)ptrToSampleTexelCs + 15), (f32) * ((ui8*)ptrToSampleTexelCs + 11), (f32) * ((ui8*)ptrToSampleTexelCs + 7), (f32) * ((ui8*)ptrToSampleTexelCs + 3));

            __m256i* ptrToSampleTexelDs = &sampleTexelDs;
            __m256 texelD_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 28), (f32) * ((ui8*)ptrToSampleTexelDs + 24), (f32) * ((ui8*)ptrToSampleTexelDs + 20), (f32) * ((ui8*)ptrToSampleTexelDs + 16), (f32) * ((ui8*)ptrToSampleTexelDs + 12), (f32) * ((ui8*)ptrToSampleTexelDs + 8), (f32) * ((ui8*)ptrToSampleTexelDs + 4), (f32) * ((ui8*)ptrToSampleTexelDs + 0));
            __m256 texelD_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 29), (f32) * ((ui8*)ptrToSampleTexelDs + 25), (f32) * ((ui8*)ptrToSampleTexelDs + 21), (f32) * ((ui8*)ptrToSampleTexelDs + 17), (f32) * ((ui8*)ptrToSampleTexelDs + 13), (f32) * ((ui8*)ptrToSampleTexelDs + 9), (f32) * ((ui8*)ptrToSampleTexelDs + 5), (f32) * ((ui8*)ptrToSampleTexelDs + 1));
            __m256 texelD_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 30), (f32) * ((ui8*)ptrToSampleTexelDs + 26), (f32) * ((ui8*)ptrToSampleTexelDs + 22), (f32) * ((ui8*)ptrToSampleTexelDs + 18), (f32) * ((ui8*)ptrToSampleTexelDs + 14), (f32) * ((ui8*)ptrToSampleTexelDs + 10), (f32) * ((ui8*)ptrToSampleTexelDs + 6), (f32) * ((ui8*)ptrToSampleTexelDs + 2));
            __m256 texelD_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 31), (f32) * ((ui8*)ptrToSampleTexelDs + 27), (f32) * ((ui8*)ptrToSampleTexelDs + 23), (f32) * ((ui8*)ptrToSampleTexelDs + 19), (f32) * ((ui8*)ptrToSampleTexelDs + 15), (f32) * ((ui8*)ptrToSampleTexelDs + 11), (f32) * ((ui8*)ptrToSampleTexelDs + 7), (f32) * ((ui8*)ptrToSampleTexelDs + 3));

            __m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
            __m256i* ptrToBackgroundPixels = &backGroundPixels;
            __m256 backgroundColors_b = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 28), (f32) * ((ui8*)ptrToBackgroundPixels + 24), (f32) * ((ui8*)ptrToBackgroundPixels + 20), (f32) * ((ui8*)ptrToBackgroundPixels + 16), (f32) * ((ui8*)ptrToBackgroundPixels + 12), (f32) * ((ui8*)ptrToBackgroundPixels + 8), (f32) * ((ui8*)ptrToBackgroundPixels + 4), (f32) * ((ui8*)ptrToBackgroundPixels + 0));
            __m256 backgroundColors_g = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 29), (f32) * ((ui8*)ptrToBackgroundPixels + 25), (f32) * ((ui8*)ptrToBackgroundPixels + 21), (f32) * ((ui8*)ptrToBackgroundPixels + 17), (f32) * ((ui8*)ptrToBackgroundPixels + 13), (f32) * ((ui8*)ptrToBackgroundPixels + 9), (f32) * ((ui8*)ptrToBackgroundPixels + 5), (f32) * ((ui8*)ptrToBackgroundPixels + 1));
            __m256 backgroundColors_r = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 30), (f32) * ((ui8*)ptrToBackgroundPixels + 26), (f32) * ((ui8*)ptrToBackgroundPixels + 22), (f32) * ((ui8*)ptrToBackgroundPixels + 18), (f32) * ((ui8*)ptrToBackgroundPixels + 14), (f32) * ((ui8*)ptrToBackgroundPixels + 10), (f32) * ((ui8*)ptrToBackgroundPixels + 6), (f32) * ((ui8*)ptrToBackgroundPixels + 2));
            __m256 backgroundColors_a = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 31), (f32) * ((ui8*)ptrToBackgroundPixels + 27), (f32) * ((ui8*)ptrToBackgroundPixels + 23), (f32) * ((ui8*)ptrToBackgroundPixels + 19), (f32) * ((ui8*)ptrToBackgroundPixels + 15), (f32) * ((ui8*)ptrToBackgroundPixels + 11), (f32) * ((ui8*)ptrToBackgroundPixels + 7), (f32) * ((ui8*)ptrToBackgroundPixels + 3));
#endif

            //Bilinear blend
            __m256 percentToLerpInX = _mm256_sub_ps(texelCoords_x, _mm256_floor_ps(texelCoords_x));
            __m256 percentToLerpInY = _mm256_sub_ps(texelCoords_y, _mm256_floor_ps(texelCoords_y));
            __m256 oneMinusXLerp = _mm256_sub_ps(one, percentToLerpInX);
            __m256 oneMinusYLerp = _mm256_sub_ps(one, percentToLerpInY);
            __m256 coefficient1 = _mm256_mul_ps(oneMinusYLerp, oneMinusXLerp);
            __m256 coefficient2 = _mm256_mul_ps(oneMinusYLerp, percentToLerpInX);
            __m256 coefficient3 = _mm256_mul_ps(percentToLerpInY, oneMinusXLerp);
            __m256 coefficient4 = _mm256_mul_ps(percentToLerpInY, percentToLerpInX);
            __m256 newBlendedTexel_r = _mm256_add_ps(
                _mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_r), _mm256_mul_ps(coefficient2, texelB_r)),
                _mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_r), _mm256_mul_ps(coefficient4, texelD_r)));
            __m256 newBlendedTexel_g = _mm256_add_ps(
                _mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_g), _mm256_mul_ps(coefficient2, texelB_g)),
                _mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_g), _mm256_mul_ps(coefficient4, texelD_g)));
            __m256 newBlendedTexel_b = _mm256_add_ps(
                _mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_b), _mm256_mul_ps(coefficient2, texelB_b)),
                _mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_b), _mm256_mul_ps(coefficient4, texelD_b)));
            __m256 newBlendedTexel_a = _mm256_add_ps(
                _mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_a), _mm256_mul_ps(coefficient2, texelB_a)),
                _mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_a), _mm256_mul_ps(coefficient4, texelD_a)));

            //Linear blend (w/ pre multiplied alpha)
            __m256 maxColorValue = _mm256_set1_ps(255.0f);
            __m256 alphaBlend = _mm256_div_ps(newBlendedTexel_a, maxColorValue);
            __m256 oneMinusAlphaBlend = _mm256_sub_ps(one, alphaBlend);
            __m256 finalBlendedColor_r = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_r), newBlendedTexel_r);
            __m256 finalBlendedColor_g = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_g), newBlendedTexel_g);
            __m256 finalBlendedColor_b = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_b), newBlendedTexel_b);
            __m256 finalBlendedColor_a = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_a), newBlendedTexel_a);

#if __AVX2__
            { //Convert and Pack into dest pixels to write out
                __m256i finalBlendedColori_r = _mm256_cvtps_epi32(finalBlendedColor_r);
                __m256i finalBlendedColori_g = _mm256_cvtps_epi32(finalBlendedColor_g);
                __m256i finalBlendedColori_b = _mm256_cvtps_epi32(finalBlendedColor_b);
                __m256i finalBlendedColori_a = _mm256_cvtps_epi32(finalBlendedColor_a);

                //Move pixels (through bitwise operations and shifting) from RRRR GGGG etc. format to the expected BGRA format
                __m256i out = _mm256_or_si256(_mm256_or_si256(_mm256_or_si256(_mm256_slli_epi32(finalBlendedColori_r, 16), _mm256_slli_epi32(finalBlendedColori_g, 8)), finalBlendedColori_b), _mm256_slli_epi32(finalBlendedColori_a, 24));

                //Use write mask in order to correctly fill 8 wide pixel lane (properly writing either the texel color or
                //the background color)
                __m256i maskedOut = _mm256_or_si256(_mm256_and_si256(writeMask, out),
                    _mm256_andnot_si256(writeMask, backGroundPixels));

                maskedOut = _mm256_or_si256(_mm256_and_si256(clipMask, maskedOut),
                    _mm256_andnot_si256(clipMask, *(__m256i*)destPixel));

                *(__m256i*)destPixel = maskedOut;
            };

#elif __AVX__
            { //Convert and Pack into dest pixels to write out
                __m256i finalBlendedColori_r = _mm256_cvtps_epi32(finalBlendedColor_r);
                __m256i finalBlendedColori_g = _mm256_cvtps_epi32(finalBlendedColor_g);
                __m256i finalBlendedColori_b = _mm256_cvtps_epi32(finalBlendedColor_b);
                __m256i finalBlendedColori_a = _mm256_cvtps_epi32(finalBlendedColor_a);

                __m256i backgroundColorsi_r = _mm256_cvtps_epi32(backgroundColors_r);
                __m256i backgroundColorsi_g = _mm256_cvtps_epi32(backgroundColors_g);
                __m256i backgroundColorsi_b = _mm256_cvtps_epi32(backgroundColors_b);
                __m256i backgroundColorsi_a = _mm256_cvtps_epi32(backgroundColors_a);

                //Since AVX doesn't have certain bitwise operations I need to extract 128 bit values from
                //256 bit ones and then use the available bitwise operations on those
                __m128i pixelSet1_r = _mm256_extractf128_si256(finalBlendedColori_r, 0);
                __m128i pixelSet2_r = _mm256_extractf128_si256(finalBlendedColori_r, 1);
                __m128i pixelSet1_g = _mm256_extractf128_si256(finalBlendedColori_g, 0);
                __m128i pixelSet2_g = _mm256_extractf128_si256(finalBlendedColori_g, 1);
                __m128i pixelSet1_b = _mm256_extractf128_si256(finalBlendedColori_b, 0);
                __m128i pixelSet2_b = _mm256_extractf128_si256(finalBlendedColori_b, 1);
                __m128i pixelSet1_a = _mm256_extractf128_si256(finalBlendedColori_a, 0);
                __m128i pixelSet2_a = _mm256_extractf128_si256(finalBlendedColori_a, 1);
                __m128i backgroundPixelSet1_r = _mm256_extractf128_si256(backgroundColorsi_r, 0);
                __m128i backgroundPixelSet2_r = _mm256_extractf128_si256(backgroundColorsi_r, 1);
                __m128i backgroundPixelSet1_g = _mm256_extractf128_si256(backgroundColorsi_g, 0);
                __m128i backgroundPixelSet2_g = _mm256_extractf128_si256(backgroundColorsi_g, 1);
                __m128i backgroundPixelSet1_b = _mm256_extractf128_si256(backgroundColorsi_b, 0);
                __m128i backgroundPixelSet2_b = _mm256_extractf128_si256(backgroundColorsi_b, 1);
                __m128i backgroundPixelSet1_a = _mm256_extractf128_si256(backgroundColorsi_a, 0);
                __m128i backgroundPixelSet2_a = _mm256_extractf128_si256(backgroundColorsi_a, 1);
                __m128i writeMaskSet1 = _mm256_extractf128_si256(writeMask, 0);
                __m128i writeMaskSet2 = _mm256_extractf128_si256(writeMask, 1);
                __m128i clipMaskSet1 = _mm256_extractf128_si256(clipMask, 0);
                __m128i clipMaskSet2 = _mm256_extractf128_si256(clipMask, 1);

                //Move pixels (through bitwise operations and shifting) from RRRR GGGG ... format to expected BGRA format
                __m128i pixels1Through4 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(pixelSet1_r, 16), _mm_slli_epi32(pixelSet1_g, 8)), pixelSet1_b), _mm_slli_epi32(pixelSet1_a, 24));
                __m128i pixels5Through8 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(pixelSet2_r, 16), _mm_slli_epi32(pixelSet2_g, 8)), pixelSet2_b), _mm_slli_epi32(pixelSet2_a, 24));
                __m128i backgroundPixels1Through4 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(backgroundPixelSet1_r, 16), _mm_slli_epi32(backgroundPixelSet1_g, 8)), backgroundPixelSet1_b), _mm_slli_epi32(backgroundPixelSet1_a, 24));
                __m128i backgroundPixels5Through8 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(backgroundPixelSet2_r, 16), _mm_slli_epi32(backgroundPixelSet2_g, 8)), backgroundPixelSet2_b), _mm_slli_epi32(backgroundPixelSet2_a, 24));

                //Use write mask in order to correctly fill 8 wide pixel lane (properly writing either the texel color or
                //the background color)
                __m128i maskedOutSet1 = _mm_or_si128(_mm_and_si128(writeMaskSet1, pixels1Through4),
                    _mm_andnot_si128(writeMaskSet1, backgroundPixels1Through4));

                __m128i maskedOutSet2 = _mm_or_si128(_mm_and_si128(writeMaskSet2, pixels5Through8),
                    _mm_andnot_si128(writeMaskSet2, backgroundPixels5Through8));

                maskedOutSet1 = _mm_or_si128(_mm_and_si128(clipMaskSet1, maskedOutSet1),
                    _mm_andnot_si128(clipMaskSet1, *(__m128i*)destPixel));

                maskedOutSet2 = _mm_or_si128(_mm_and_si128(clipMaskSet2, maskedOutSet2),
                    _mm_andnot_si128(clipMaskSet2, _mm256_extractf128_si256(*(__m256i*)destPixel, 1)));

                //Pack 128 bit pixel values back into 256 bit values to write out
                __m256i maskedOut = _mm256_castsi128_si256(maskedOutSet1);
                maskedOut = _mm256_insertf128_si256(maskedOut, maskedOutSet2, 1);

                *(__m256i*)destPixel = maskedOut;
            };

#endif

            destPixel += 8;
        };

        currentRow += colorBufferPitch;
    };
};

boagz57: "though I'm not really sure how that would help"

It will help avoid doing all those shifts and ANDs to extract the r/g/b values, and also avoid the shifts and ORs needed to pack them for storing.
With one load you'll load all reds (or greens/blues). All you need to do is expand byte to float, or pack it back for writing to memory.
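
(A minimal sketch of the difference, not code from this thread. It assumes a hypothetical planar layout where a separate redPlane array holds all of the red bytes contiguously; bgraPixels and texelIndex are likewise made-up names for illustration.)

// Interleaved BGRA: getting 8 reds costs a shift and a mask, per channel, per sample.
__m256i pixels = _mm256_loadu_si256((__m256i const*)bgraPixels);
__m256 reds_interleaved = _mm256_cvtepi32_ps(
    _mm256_and_si256(_mm256_srli_epi32(pixels, 16), _mm256_set1_epi32(0xFF)));

// Planar layout: the 8 reds are already adjacent, so one small load plus a
// widening conversion (_mm256_cvtepu8_epi32, AVX2) is all that's needed.
__m128i red8 = _mm_loadl_epi64((__m128i const*)(redPlane + texelIndex));
__m256 reds_planar = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(red8));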

Another optimization possibility is to avoid floats. Do everything with 8-bit or 16-bit integers for the intermediate results (using extra registers). That should allow you to process 256/8 = 32 pixels in one iteration, not just 8.
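
(For example, the linear blend could stay entirely in 16-bit integer math, 16 channel values per register. A minimal sketch, assuming src and dst are __m256i registers holding 16-bit-widened channel values and t holds per-lane blend factors in the 0..256 range; the names are illustrative, not from the code above.)

// blended = (src*t + dst*(256 - t)) >> 8, all in 16-bit lanes.
// Each product stays <= 255*256, so the low 16 bits returned by mullo are exact.
__m256i invT    = _mm256_sub_epi16(_mm256_set1_epi16(256), t);
__m256i blended = _mm256_srli_epi16(
    _mm256_add_epi16(_mm256_mullo_epi16(src, t),
                     _mm256_mullo_epi16(dst, invT)), 8);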

Another common approach in CPU rasterizers (and even in GPU implementations) is to store texture memory in tiled order. Instead of storing the image line by line, store it as 4x4 chunks (or 8x8, or 16x8; you need to try and measure what's better). This way, loading pixels for bilinear blending gives better cache locality.
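
(A minimal sketch of the addressing for a 4x4 tiled layout, reusing the types from the code above; TiledTexelPtr and tilesPerRow are made-up names, and the actual tile size is something you'd want to measure.)

//Tiles are stored row-major, and texels inside each 4x4 tile are also row-major.
static ui32* TiledTexelPtr(ui32* texels, i32 tilesPerRow, i32 x, i32 y)
{
    i32 tileX = x >> 2, tileY = y >> 2; //which 4x4 tile the texel lives in
    i32 inX = x & 3, inY = y & 3;       //position inside that tile
    i32 tileIndex = tileY * tilesPerRow + tileX;
    return texels + tileIndex * 16 + inY * 4 + inX; //16 texels per tile
}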

Find some articles about how Larrabee did rasterization; they may have some good ideas to apply to your case.
Awesome, thanks for those suggestions. I am going to try to dive deep into this problem and see what I can do. I will probably post back in this thread at some point if I have another question, or I might just show my current progress.
So I've been working with my current software renderer and I'm trying to see if I can optimize things using the first method suggested by Martins, where I send pixel colors to the renderer in the format BBBB BBBB GGGG GGGG RRRR RRRR AAAA AAAA (assuming 8-wide SIMD instructions) instead of BGRA BGRA BGRA etc. I have done a preliminary pass and have been able to get things running again with the above format, but with incorrect colors being output to the screen.

Original bottlenecked code:

//Clamp UVs to prevent accessing memory that is invalid
            Us = _mm256_min_ps(_mm256_max_ps(Us, zero), one);
            Vs = _mm256_min_ps(_mm256_max_ps(Vs, zero), one);

            __m256 textureUs = _mm256_add_ps(minUVBounds_u, _mm256_mul_ps(uvRangeForTexture_u, Us));
            __m256 textureVs = _mm256_add_ps(minUVBounds_v, _mm256_mul_ps(uvRangeForTexture_v, Vs));

            __m256 texelCoords_x = _mm256_mul_ps(textureUs, imgWidth);
            __m256 texelCoords_y = _mm256_mul_ps(textureVs, imgHeight);

            __m256i sampleTexelAs {}, sampleTexelBs {}, sampleTexelCs {}, sampleTexelDs {};
            for (i32 index {}; index < 8; ++index)
            {
                BGZ_ASSERT((texelCoords_x.m256_f32[index] >= 0) && (texelCoords_x.m256_f32[index] <= (i32)image.size.width), "x coord is out of range!: ");
                BGZ_ASSERT((texelCoords_y.m256_f32[index] >= 0) && (texelCoords_y.m256_f32[index] <= (i32)image.size.height), "y coord is out of range!");

                //Gather 4 texels (in a square pattern) from certain texel Ptr
                ui8* texelPtr = ((ui8*)image.colorData) + ((ui32)texelCoords_y.m256_f32[index] * image.pitch_pxls) + ((ui32)texelCoords_x.m256_f32[index] * sizeof(ui32)); //size of pixel
                sampleTexelAs.m256i_u32[index] = *(ui32*)(texelPtr);
                sampleTexelBs.m256i_u32[index] = *(ui32*)(texelPtr + sizeof(ui32));
                sampleTexelCs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls);
                sampleTexelDs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls + sizeof(ui32));
            };

#if __AVX2__
            //Unpack 4 sample texels to prepare for bilinear blend. Have to do a bunch of bit shifting/masking here
            __m256i maskFF = _mm256_set1_epi32(0xFF);
            __m256 texelA_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelAs, maskFF));
            __m256 texelA_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 8), maskFF));
            __m256 texelA_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 16), maskFF));
            __m256 texelA_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 24), maskFF));
            __m256 texelB_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelBs, maskFF));
            __m256 texelB_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 8), maskFF));
            __m256 texelB_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 16), maskFF));
            __m256 texelB_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 24), maskFF));
            __m256 texelC_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelCs, maskFF));
            __m256 texelC_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 8), maskFF));
            __m256 texelC_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 16), maskFF));
            __m256 texelC_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 24), maskFF));
            __m256 texelD_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelDs, maskFF));
            __m256 texelD_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 8), maskFF));
            __m256 texelD_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 16), maskFF));
            __m256 texelD_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 24), maskFF));
            __m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
            __m256 backgroundColors_b = _mm256_cvtepi32_ps(_mm256_and_si256(backGroundPixels, maskFF));
            __m256 backgroundColors_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 8), maskFF));
            __m256 backgroundColors_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 16), maskFF));
            __m256 backgroundColors_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 24), maskFF));


New, optimized code:

//Clamp UVs to prevent accessing memory that is invalid
            Us = _mm256_min_ps(_mm256_max_ps(Us, zero), one);
            Vs = _mm256_min_ps(_mm256_max_ps(Vs, zero), one);

            __m256 textureUs = _mm256_add_ps(minUVBounds_u, _mm256_mul_ps(uvRangeForTexture_u, Us));
            __m256 textureVs = _mm256_add_ps(minUVBounds_v, _mm256_mul_ps(uvRangeForTexture_v, Vs));

            __m256 texelCoords_x = _mm256_mul_ps(textureUs, imgWidth);
            __m256 texelCoords_y = _mm256_mul_ps(textureVs, imgHeight);

            BGZ_ASSERT((texelCoords_x.m256_f32[0] >= 0) && (texelCoords_x.m256_f32[0] <= (i32)image.size.width), "x coord is out of range!: ");
            BGZ_ASSERT((texelCoords_y.m256_f32[0] >= 0) && (texelCoords_y.m256_f32[0] <= (i32)image.size.height), "y coord is out of range!");

            //Gather 4 texels (in a square pattern) from certain texel Ptr
            ui8* texelPtr = ((ui8*)image.colorData) + ((ui32)texelCoords_y.m256_f32[0] * image.pitch_pxls) + ((ui32)texelCoords_x.m256_f32[0] * sizeof(ui32)); //size of pixel
            __m256i* texelA = (__m256i*)(texelPtr);
            __m256i* texelB = (__m256i*)(texelPtr + sizeof(__m256i));
            __m256i* texelC = (__m256i*)(texelPtr + image.pitch_pxls);
            __m256i* texelD = (__m256i*)(texelPtr + image.pitch_pxls + sizeof(__m256i));

            __m256i sampleTexelAs_b = _mm256_set_epi32((ui32)(*((ui8*)texelA + 0)), (ui32)(*((ui8*)texelA + 1)), (ui32)(*((ui8*)texelA + 2)), (ui32)(*((ui8*)texelA + 3)), (ui32)(*((ui8*)texelA + 4)), (ui32)(*((ui8*)texelA + 5)), (ui32)(*((ui8*)texelA + 6)), (ui32)(*((ui8*)texelA + 7)));
            __m256i sampleTexelBs_b = _mm256_set_epi32((ui32)(*((ui8*)texelB + 0)), (ui32)(*((ui8*)texelB + 1)), (ui32)(*((ui8*)texelB + 2)), (ui32)(*((ui8*)texelB + 3)), (ui32)(*((ui8*)texelB + 4)), (ui32)(*((ui8*)texelB + 5)), (ui32)(*((ui8*)texelB + 6)), (ui32)(*((ui8*)texelB + 7)));
            __m256i sampleTexelCs_b = _mm256_set_epi32((ui32)(*((ui8*)texelC + 0)), (ui32)(*((ui8*)texelC + 1)), (ui32)(*((ui8*)texelC + 2)), (ui32)(*((ui8*)texelC + 3)), (ui32)(*((ui8*)texelC + 4)), (ui32)(*((ui8*)texelC + 5)), (ui32)(*((ui8*)texelC + 6)), (ui32)(*((ui8*)texelC + 7)));
            __m256i sampleTexelDs_b = _mm256_set_epi32((ui32)(*((ui8*)texelD + 0)), (ui32)(*((ui8*)texelD + 1)), (ui32)(*((ui8*)texelD + 2)), (ui32)(*((ui8*)texelD + 3)), (ui32)(*((ui8*)texelD + 4)), (ui32)(*((ui8*)texelD + 5)), (ui32)(*((ui8*)texelD + 6)), (ui32)(*((ui8*)texelD + 7)));

            __m256i sampleTexelAs_g = _mm256_set_epi32((ui32)(*((ui8*)texelA + 8)), (ui32)(*((ui8*)texelA + 9)), (ui32)(*((ui8*)texelA + 10)), (ui32)(*((ui8*)texelA + 11)), (ui32)(*((ui8*)texelA + 12)), (ui32)(*((ui8*)texelA + 13)), (ui32)(*((ui8*)texelA + 14)), (ui32)(*((ui8*)texelA + 15)));
            __m256i sampleTexelBs_g = _mm256_set_epi32((ui32)(*((ui8*)texelB + 8)), (ui32)(*((ui8*)texelB + 9)), (ui32)(*((ui8*)texelB + 10)), (ui32)(*((ui8*)texelB + 11)), (ui32)(*((ui8*)texelB + 12)), (ui32)(*((ui8*)texelB + 13)), (ui32)(*((ui8*)texelB + 14)), (ui32)(*((ui8*)texelB + 15)));
            __m256i sampleTexelCs_g = _mm256_set_epi32((ui32)(*((ui8*)texelC + 8)), (ui32)(*((ui8*)texelC + 9)), (ui32)(*((ui8*)texelC + 10)), (ui32)(*((ui8*)texelC + 11)), (ui32)(*((ui8*)texelC + 12)), (ui32)(*((ui8*)texelC + 13)), (ui32)(*((ui8*)texelC + 14)), (ui32)(*((ui8*)texelC + 15)));
            __m256i sampleTexelDs_g = _mm256_set_epi32((ui32)(*((ui8*)texelD + 8)), (ui32)(*((ui8*)texelD + 9)), (ui32)(*((ui8*)texelD + 10)), (ui32)(*((ui8*)texelD + 11)), (ui32)(*((ui8*)texelD + 12)), (ui32)(*((ui8*)texelD + 13)), (ui32)(*((ui8*)texelD + 14)), (ui32)(*((ui8*)texelD + 15)));

            __m256i sampleTexelAs_r = _mm256_set_epi32((ui32)(*((ui8*)texelA + 16)), (ui32)(*((ui8*)texelA + 17)), (ui32)(*((ui8*)texelA + 18)), (ui32)(*((ui8*)texelA + 19)), (ui32)(*((ui8*)texelA + 20)), (ui32)(*((ui8*)texelA + 21)), (ui32)(*((ui8*)texelA + 22)), (ui32)(*((ui8*)texelA + 23)));
            __m256i sampleTexelBs_r = _mm256_set_epi32((ui32)(*((ui8*)texelB + 16)), (ui32)(*((ui8*)texelB + 17)), (ui32)(*((ui8*)texelB + 18)), (ui32)(*((ui8*)texelB + 19)), (ui32)(*((ui8*)texelB + 20)), (ui32)(*((ui8*)texelB + 21)), (ui32)(*((ui8*)texelB + 22)), (ui32)(*((ui8*)texelB + 23)));
            __m256i sampleTexelCs_r = _mm256_set_epi32((ui32)(*((ui8*)texelC + 16)), (ui32)(*((ui8*)texelC + 17)), (ui32)(*((ui8*)texelC + 18)), (ui32)(*((ui8*)texelC + 19)), (ui32)(*((ui8*)texelC + 20)), (ui32)(*((ui8*)texelC + 21)), (ui32)(*((ui8*)texelC + 22)), (ui32)(*((ui8*)texelC + 23)));
            __m256i sampleTexelDs_r = _mm256_set_epi32((ui32)(*((ui8*)texelD + 16)), (ui32)(*((ui8*)texelD + 17)), (ui32)(*((ui8*)texelD + 18)), (ui32)(*((ui8*)texelD + 19)), (ui32)(*((ui8*)texelD + 20)), (ui32)(*((ui8*)texelD + 21)), (ui32)(*((ui8*)texelD + 22)), (ui32)(*((ui8*)texelD + 23)));

            __m256i sampleTexelAs_a = _mm256_set_epi32((ui32)(*((ui8*)texelA + 24)), (ui32)(*((ui8*)texelA + 25)), (ui32)(*((ui8*)texelA + 26)), (ui32)(*((ui8*)texelA + 27)), (ui32)(*((ui8*)texelA + 28)), (ui32)(*((ui8*)texelA + 29)), (ui32)(*((ui8*)texelA + 30)), (ui32)(*((ui8*)texelA + 31)));
            __m256i sampleTexelBs_a = _mm256_set_epi32((ui32)(*((ui8*)texelB + 24)), (ui32)(*((ui8*)texelB + 25)), (ui32)(*((ui8*)texelB + 26)), (ui32)(*((ui8*)texelB + 27)), (ui32)(*((ui8*)texelB + 28)), (ui32)(*((ui8*)texelB + 29)), (ui32)(*((ui8*)texelB + 30)), (ui32)(*((ui8*)texelB + 31)));
            __m256i sampleTexelCs_a = _mm256_set_epi32((ui32)(*((ui8*)texelC + 24)), (ui32)(*((ui8*)texelC + 25)), (ui32)(*((ui8*)texelC + 26)), (ui32)(*((ui8*)texelC + 27)), (ui32)(*((ui8*)texelC + 28)), (ui32)(*((ui8*)texelC + 29)), (ui32)(*((ui8*)texelC + 30)), (ui32)(*((ui8*)texelC + 31)));
            __m256i sampleTexelDs_a = _mm256_set_epi32((ui32)(*((ui8*)texelD + 24)), (ui32)(*((ui8*)texelD + 25)), (ui32)(*((ui8*)texelD + 26)), (ui32)(*((ui8*)texelD + 27)), (ui32)(*((ui8*)texelD + 28)), (ui32)(*((ui8*)texelD + 29)), (ui32)(*((ui8*)texelD + 30)), (ui32)(*((ui8*)texelD + 31)));

#if __AVX2__
            __m256i maskFF = _mm256_set1_epi32(0xFF);

            //Unpack 4 sample texels to prepare for bilinear blend. NO more bit shifting/masking needed
            __m256 texelA_b = _mm256_cvtepi32_ps(sampleTexelAs_b);
            __m256 texelB_b = _mm256_cvtepi32_ps(sampleTexelBs_b);
            __m256 texelC_b = _mm256_cvtepi32_ps(sampleTexelCs_b);
            __m256 texelD_b = _mm256_cvtepi32_ps(sampleTexelDs_b);

            __m256 texelA_g = _mm256_cvtepi32_ps(sampleTexelAs_g);
            __m256 texelB_g = _mm256_cvtepi32_ps(sampleTexelBs_g);
            __m256 texelC_g = _mm256_cvtepi32_ps(sampleTexelCs_g);
            __m256 texelD_g = _mm256_cvtepi32_ps(sampleTexelDs_g);

            __m256 texelA_r = _mm256_cvtepi32_ps(sampleTexelAs_r);
            __m256 texelB_r = _mm256_cvtepi32_ps(sampleTexelBs_r);
            __m256 texelC_r = _mm256_cvtepi32_ps(sampleTexelCs_r);
            __m256 texelD_r = _mm256_cvtepi32_ps(sampleTexelDs_r);

            __m256 texelA_a = _mm256_cvtepi32_ps(sampleTexelAs_a);
            __m256 texelB_a = _mm256_cvtepi32_ps(sampleTexelBs_a);
            __m256 texelC_a = _mm256_cvtepi32_ps(sampleTexelCs_a);
            __m256 texelD_a = _mm256_cvtepi32_ps(sampleTexelDs_a);

            __m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
            __m256 backgroundColors_b = _mm256_cvtepi32_ps(_mm256_and_si256(backGroundPixels, maskFF));
            __m256 backgroundColors_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 8), maskFF));
            __m256 backgroundColors_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 16), maskFF));
            __m256 backgroundColors_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 24), maskFF));



Things are definitely faster, but of course I now have to get the colors to output correctly. The issue I'm running into has to do with the texture UVs. I think I could get things to work properly, with a speed improvement, if I assume a non-rotated image. This is because the v of each calculated UV will be the same, meaning my texel y positions will all be the same for a given set of 8 texel x positions (assuming the image pitch is a multiple of 8). However, when a texture is rotated I would have situations where the texel y positions of a row of 8 pixels are different, meaning I would have to grab texel colors from different rows of the original image. Thus I couldn't just load in all the blues, greens, reds, and alphas with no hassle (I would have to go back to bit shifting/masking again to get the colors properly prepared for bilinear filtering, which is what I was doing with the original BGRA format in the first place). I was wondering if anyone had any ideas on how to go about fixing this issue, or any suggestions on how to think about the problem.
I'm not sure how calculating the UVs is affected by the way you load and store color values in SIMD registers. Does your code work with the "old" way of storing and processing pixels? If so, it should work with the "new" changed way. There should be no differences. Just make sure the values calculated for the UVs are the same.

Btw, instead of doing so many byte loads + _mm256_set_epi32 instructions, do one load with _mm256_lddqu_si256 to load 32 bytes. Then do shuffle/unpack operations to take each 8 bytes and expand them into 4x8 32-bit values: four registers, each with eight 32-bit values.
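
(A minimal sketch of that load, assuming texelPtr points at 32 contiguous blue bytes in the reordered texture; the unpack/permute sequence shown later in the thread then expands them into floats.)

__m256i blue32 = _mm256_lddqu_si256((__m256i const*)texelPtr); //one unaligned 32-byte load instead of 32 scalar byte loads
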
So I'm trying mmozeiko's suggestion of processing 32 pixels per iteration of my pixel loop. Right now I have my image colors stored so that each texel's corresponding texels needed for bilinear blending are stored right next to it. I also shuffle the colors around to be bbbb gggg rrrr aaaa. So, in this way:

Pseudo code

ui8* image = [(px1b px2b px3b px4b), (px1g pxl2g pxl3g pxl4g), (pxl1r pxl2r pxl3r pxl4r), (pxl1a pxl2a pxl3a pxl4a), (pxl5b pxl6b pxl7b..).....]


When I get to the renderer I eventually pack everything into a SIMD register by color (each register stores 32 8-bit colors). I'm wondering if there is an efficient way, using SIMD instructions, to take a __m256i variable packed with 32 pixel colors (so one SIMD register would hold all colors of, say, blue: bbbb, bbbb, bbbb, bbbb, .....) and expand each 8-bit color value into a corresponding float value:

e.g.
Pseudo Code

256i sampleTexels_blue = [(b1 b2 b3 b4), (b5 b6 b7 b8), (b9 b10 b11 b12),  (b13 b14 b15 b16),  .......b32];


expanded to something like

256 sampleTexels_A_blue = [b1 float, b5 float, b9 float, b13 float, b17 float, b21 float, b25 float, b29 float];
256 sampleTexels_B_blue = [b2 float, b6 float, b10 float, b14 float, b18 float, b22 float, b26 float, b30 float];
256 sampleTexels_C_blue = ......;
256 sampleTexels_D_blue = .....b32 float];


I need to do this because I need to do floating-point multiplication for the bilinear blending, more specifically when multiplying the coefficients and each set of texel colors:

            __m256 newBlendedTexel_b = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(coefficient1, sampleTexelAs_blue), _mm256_mul_ps(coefficient2, sampleTexelBs_blue)),
                                                     _mm256_add_ps(_mm256_mul_ps(coefficient3, sampleTexelCs_blue), _mm256_mul_ps(coefficient4, sampleTexelDs_blue)));

            __m256 newBlendedTexel_g = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(coefficient1, sampleTexelAs_green), _mm256_mul_ps(coefficient2, sampleTexelBs_green)),
                                                     _mm256_add_ps(_mm256_mul_ps(coefficient3, sampleTexelCs_green), _mm256_mul_ps(coefficient4, sampleTexelDs_green)));

            __m256 newBlendedTexel_r = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(coefficient1, sampleTexelAs_red), _mm256_mul_ps(coefficient2, sampleTexelBs_red)),
                                                     _mm256_add_ps(_mm256_mul_ps(coefficient3, sampleTexelCs_red), _mm256_mul_ps(coefficient4, sampleTexelDs_red)));

.......................


I've tried fiddling with the shuffle_epi8 instruction, but I'm confused about how exactly it works.

You just do a bunch of unpacks/packs:
zero = _mm256_setzero_si256(); // 0

// blue = 123456789abcdefghijklmnopqrstuvw

b0 = _mm256_unpacklo_epi8(zero, blue);  // 0102030405060708090a0b0c0d0e0f0g
b1 = _mm256_unpackhi_epi8(zero, blue); // 0h0i0j0k0l0m0n0o0p0q0r0s0t0u0v0w

b00 = _mm256_unpacklo_epi16(zero, b0); // 0001 0002 0003 0004 0005 0006 0007 0008
b01 = _mm256_unpackhi_epi16(zero, b0); // 0009 000a 000b 000c 000d 000e 000f 000g
b10 = _mm256_unpacklo_epi16(zero, b1); // 000h 000i 000j 000k 000l 000m 000n 000o
b11 = _mm256_unpackhi_epi16(zero, b1); // 000p 000q 000r 000s 000t 000u 000v 000w

t0 = _mm256_unpacklo_epi32(b00, b01); // 0001 0009 0002 000a 0003 000b 0004 000c
t1 = _mm256_unpackhi_epi32(b00, b01); // 0005 000d 0006 000e 0007 000f 0008 000g
t3 = _mm256_unpacklo_epi32(b10, b11); // 000h 000p 000i 000q 000j 000r 000k 000s
t4 = _mm256_unpackhi_epi32(b10, b11); // 000l 000t 000m 000u 000n 000v 000o 000w

t5 = _mm256_unpacklo_epi32(t0, t1); // 0001 0005 0009 000d 0002 0006 000a 000e
t6 = _mm256_unpackhi_epi32(t0, t1); // 0003 0007 000b 000f 0004 0008 000c 000g
t7 = _mm256_unpacklo_epi32(t3, t4); // 000h 000l 000p 000t 000i 000m 000q 000u
t8 = _mm256_unpackhi_epi32(t3, t4); //  000j 000n 000r 000v 000k 000o 000s 000w

tA = _mm256_permute2x128_si256(t5, t7, 0 | (2 << 4)); // 0001 0005 0009 000d 000h 000l 000p 000t
tB = _mm256_permute2x128_si256(t5, t7, 1 | (3 << 4)); // 0002 0006 000a 000e 000i 000m 000q 000u
tC = _mm256_permute2x128_si256(t6, t8, 0 | (2 << 4)); // 0003 0007 000b 000f 000j 000n 000r 000v
tD = _mm256_permute2x128_si256(t6, t8, 1 | (3 << 4)); // 0004 0008 000c 000g 000k 000o 000s 000w

sampleTexels_A_blue = _mm256_cvtepi32_ps(tA);
sampleTexels_B_blue = _mm256_cvtepi32_ps(tB);
sampleTexels_C_blue = _mm256_cvtepi32_ps(tC);
sampleTexels_D_blue = _mm256_cvtepi32_ps(tD);
Thanks for the help mmozeiko. This is my first time doing any serious optimizations like this, so any help is appreciated. Though after taking a step back, I realized that I'm still not processing any more pixels per iteration this way. If I compare my old way and new way of processing pixels, the only things that have changed are the lookups for the sample texels for bilinear blending and the way I'm unpacking my colors to blend (unpacking/packing vs. bitwise shifts):

Old way:
__m256i sampleTexelAs {}, sampleTexelBs {}, sampleTexelCs {}, sampleTexelDs {};
for (i32 index {}; index < 8; ++index)
{
    BGZ_ASSERT((texelCoords_x.m256_f32[index] >= 0) && (texelCoords_x.m256_f32[index] <= (i32)image.size.width), "x coord is out of range!: ");
    BGZ_ASSERT((texelCoords_y.m256_f32[index] >= 0) && (texelCoords_y.m256_f32[index] <= (i32)image.size.height), "y coord is out of range!");

    ui8* texelPtr = ((ui8*)image.colorData) + ((ui32)texelCoords_y.m256_f32[index] * image.pitch_pxls) + ((ui32)texelCoords_x.m256_f32[index] * sizeof(ui32)); //size of pixel
    sampleTexelAs.m256i_u32[index] = *(ui32*)(texelPtr);
    sampleTexelBs.m256i_u32[index] = *(ui32*)(texelPtr + sizeof(ui32));
    sampleTexelCs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls);
    sampleTexelDs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls + sizeof(ui32));
};

 __m256i maskFF = _mm256_set1_epi32(0xFF);
 __m256 texelA_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelAs, maskFF));
 __m256 texelA_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 8), maskFF));
 __m256 texelA_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 16), maskFF));
 __m256 texelA_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 24), maskFF));
 __m256 texelB_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelBs, maskFF));
 __m256 texelB_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 8), maskFF));
 __m256 texelB_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 16), maskFF));
 __m256 texelB_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 24), maskFF));
 __m256 texelC_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelCs, maskFF));
 __m256 texelC_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 8), maskFF));
 __m256 texelC_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 16), maskFF));
 __m256 texelC_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 24), maskFF));
 __m256 texelD_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelDs, maskFF));
 __m256 texelD_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 8), maskFF));
 __m256 texelD_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 16), maskFF));
 __m256 texelD_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 24), maskFF));
 __m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
 __m256 backgroundColors_b = _mm256_cvtepi32_ps(_mm256_and_si256(backGroundPixels, maskFF));
 __m256 backgroundColors_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 8), maskFF));
 __m256 backgroundColors_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 16), maskFF));
 __m256 backgroundColors_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 24), maskFF));

........

__m256 newBlendedTexel_r = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_r), _mm256_mul_ps(coefficient2, texelB_r)),
                           _mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_r), _mm256_mul_ps(coefficient4, texelD_r)));
__m256 newBlendedTexel_g = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_g), _mm256_mul_ps(coefficient2, texelB_g)),
                        
........


New way:
__m256i sampleTexels_blue {}, sampleTexels_green {}, sampleTexels_red {}, sampleTexels_alpha {};
for (i32 index {}; index < 8; ++index)
{
    BGZ_ASSERT((texelCoords_x.m256_f32[index] >= 0) && (texelCoords_x.m256_f32[index] <= (i32)image.size.width), "x coord is out of range!: ");
    BGZ_ASSERT((texelCoords_y.m256_f32[index] >= 0) && (texelCoords_y.m256_f32[index] <= (i32)image.size.height), "y coord is out of range!");
                
    ui8* texelPtr = ((ui8*)image.colorData) + ((ui32)texelCoords_y.m256_f32[index] * image.pitch_pxls) + ((ui32)texelCoords_x.m256_f32[index] * sizeof(ui32)); //size of pixel
    sampleTexels_blue.m256i_u32[index] = *(ui32*)(texelPtr);
    sampleTexels_green.m256i_u32[index] = *(ui32*)(texelPtr + sizeof(ui32));
    sampleTexels_red.m256i_u32[index] = *(ui32*)(texelPtr + sizeof(ui32) * 2);
    sampleTexels_alpha.m256i_u32[index] = *(ui32*)(texelPtr + sizeof(ui32) * 3);
};

__m256i zeros = _mm256_setzero_si256();
__m256i b0 = _mm256_unpacklo_epi8(zeros, sampleTexels_blue);
__m256i b1 = _mm256_unpackhi_epi8(zeros, sampleTexels_blue);

__m256i b00 = _mm256_unpacklo_epi16(zeros, b0);
__m256i b01 = _mm256_unpackhi_epi16(zeros, b0);
__m256i b10 = _mm256_unpacklo_epi16(zeros, b1);
__m256i b11 = _mm256_unpackhi_epi16(zeros, b1);
            
__m256i t0 = _mm256_unpacklo_epi32(b00, b01);
__m256i t1 = _mm256_unpackhi_epi32(b00, b01);
__m256i t2 = _mm256_unpacklo_epi32(b10, b11);
__m256i t3 = _mm256_unpackhi_epi32(b10, b11);
            
__m256i t4 = _mm256_unpacklo_epi32(t0, t1);
__m256i t5 = _mm256_unpackhi_epi32(t0, t1);
__m256i t6 = _mm256_unpacklo_epi32(t2, t3);
__m256i t7 = _mm256_unpackhi_epi32(t2, t3);
            
__m256i tA = _mm256_permute2x128_si256(t4, t6, 0 | (2 << 4));
__m256i tB = _mm256_permute2x128_si256(t4, t6, 1 | (3 << 4));
__m256i tC = _mm256_permute2x128_si256(t5, t7, 0 | (2 << 4));
__m256i tD = _mm256_permute2x128_si256(t5, t7, 1 | (3 << 4));
           
__m256 sampleTexelAs_blue = _mm256_cvtepi32_ps(tA);
__m256 sampleTexelBs_blue = _mm256_cvtepi32_ps(tB);
__m256 sampleTexelCs_blue = _mm256_cvtepi32_ps(tC);
__m256 sampleTexelDs_blue = _mm256_cvtepi32_ps(tD);

.......

__m256 sampleTexelAs_green = _mm256_cvtepi32_ps(tA);
__m256 sampleTexelBs_green = _mm256_cvtepi32_ps(tB);
__m256 sampleTexelCs_green = _mm256_cvtepi32_ps(tC);
__m256 sampleTexelDs_green = _mm256_cvtepi32_ps(tD);

.......

__m256 newBlendedTexel_b = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(coefficient1, sampleTexelAs_blue), _mm256_mul_ps(coefficient2, sampleTexelBs_blue)),
                                         _mm256_add_ps(_mm256_mul_ps(coefficient3, sampleTexelCs_blue), _mm256_mul_ps(coefficient4, sampleTexelDs_blue)));

__m256 newBlendedTexel_g = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(coefficient1, sampleTexelAs_green), _mm256_mul_ps(coefficient2, sampleTexelBs_green)),
                                         _mm256_add_ps(_mm256_mul_ps(coefficient3, sampleTexelCs_green), _mm256_mul_ps(coefficient4, sampleTexelDs_green)));

.............


I think this occurs because I'm always going to have to convert to floats eventually for the bilinear blending (_mm256_mul_ps(coefficient, texelColor)). I might still get some slight gains in speed from this method, but I will have to test it. So I just wanted to double-check whether I'm thinking about this correctly or if I'm missing something.
You could try doing it in 16-bit int operations. That will require less unpacking, and all blend operations will still work in 16-bit ints.
You should also try replacing your for loop at the beginning with the _mm256_i32gather_epi32 instruction. It won't be faster, but it will generate less machine code.
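
(A minimal sketch of what the gather could look like in place of the 8-iteration loop, reusing names from the code above; it assumes image.pitch_pxls is a byte pitch, as the existing pointer math implies, and it is not code from this thread.)

//Truncate the texel coordinates and build per-lane byte offsets into the texture.
__m256i texelX = _mm256_cvttps_epi32(texelCoords_x);
__m256i texelY = _mm256_cvttps_epi32(texelCoords_y);
__m256i byteOffsets = _mm256_add_epi32(
    _mm256_mullo_epi32(texelY, _mm256_set1_epi32(image.pitch_pxls)),
    _mm256_slli_epi32(texelX, 2)); //x * sizeof(ui32)

//A scale of 1 means the offsets are interpreted as bytes (AVX2).
__m256i sampleTexelAs = _mm256_i32gather_epi32((int const*)image.colorData, byteOffsets, 1);
__m256i sampleTexelBs = _mm256_i32gather_epi32((int const*)image.colorData,
    _mm256_add_epi32(byteOffsets, _mm256_set1_epi32(4)), 1);
//...and the same pattern for the C and D samples, adding image.pitch_pxls to the offsets.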