Jason
135 posts
Further optimizing software renderer
1 month ago Edited by Jason on Dec. 14, 2019, 6:41 p.m. Reason: Initial post
So I recently went through the optimization of the software renderer, finishing the multi-threaded portion. It seems Casey moves on to other things from here. I was just wondering what kinds of things I should do moving forward to continue optimizing the software renderer. I think Casey was discussing some ways to avoid packing and unpacking pixels in the way we are currently rendering things, by supplying pixel data in a more SIMD-friendly format (RRRR GGGG BBBB instead of BGRA, etc.), though I'm not really sure how that would help (I have tried thinking of some ways to do this and haven't come up with any good solutions yet). My current software rendering routine looks like this:

#include <immintrin.h>
void DrawTexture_Optimized(ui32* colorBufferData, v2i colorBufferSize, i32 colorBufferPitch, Quadf targetRect_screenCoords, RenderEntry_Texture image, Rectf clipRect)
{
    v2f origin = targetRect_screenCoords.bottomLeft;
    v2f targetRectXAxis = targetRect_screenCoords.bottomRight - targetRect_screenCoords.bottomLeft;
    v2f targetRectYAxis = targetRect_screenCoords.topLeft - targetRect_screenCoords.bottomLeft;

    i32 widthMax = (i32)clipRect.max.x;
    i32 heightMax = (i32)clipRect.max.y;

    i32 xMin = widthMax;
    i32 xMax = (i32)clipRect.min.x;
    i32 yMin = heightMax;
    i32 yMax = (i32)clipRect.min.y;

    { //Optimization to avoid iterating over every pixel on the screen - HH ep 92
        Array<v2f, 4> vecs = { origin, origin + targetRectXAxis, origin + targetRectXAxis + targetRectYAxis, origin + targetRectYAxis };
        for (i32 vecIndex = 0; vecIndex < vecs.Size(); ++vecIndex)
        {
            v2f testVec = vecs.At(vecIndex);
            i32 flooredX = FloorF32ToI32(testVec.x);
            i32 ceiledX = CeilF32ToI32(testVec.x);
            i32 flooredY = FloorF32ToI32(testVec.y);
            i32 ceiledY = CeilF32ToI32(testVec.y);

            if (xMin > flooredX)
                xMin = flooredX;
            if (yMin > flooredY)
                yMin = flooredY;
            if (xMax < ceiledX)
                xMax = ceiledX;
            if (yMax < ceiledY)
                yMax = ceiledY;
        }

        if (xMin < (i32)clipRect.min.x)
            xMin = (i32)clipRect.min.x;
        if (yMin < clipRect.min.y)
            yMin = (i32)clipRect.min.y;
        if (xMax > widthMax)
            xMax = widthMax;
        if (yMax > heightMax)
            yMax = heightMax;
    };

    i32 simdWidth_inBytes = 8;

    //Align xMin down to an 8-pixel boundary (simdWidth_inBytes is really a lane count of 8 pixels)
    if ((xMin % simdWidth_inBytes) != 0)
        xMin = (i32)RoundDown((sizet)xMin, simdWidth_inBytes);

    //Pre-calculations for optimization
    f32 invertedXAxisSqd = 1.0f / MagnitudeSqd(targetRectXAxis);
    f32 invertedYAxisSqd = 1.0f / MagnitudeSqd(targetRectYAxis);
    i32 imageWidth = image.size.width - 3;
    i32 imageHeight = image.size.height - 3;
    v2f normalizedXAxis = invertedXAxisSqd * targetRectXAxis;
    v2f normalizedYAxis = invertedYAxisSqd * targetRectYAxis;

    i32 sizeOfPixel_inBytes = 4;
    ui8* currentRow = (ui8*)colorBufferData + (i32)xMin * sizeOfPixel_inBytes + (i32)yMin * colorBufferPitch;
    for (i32 screenY = yMin; screenY < yMax; ++screenY)
    {
        ui32* destPixel = (ui32*)currentRow;
        for (i32 screenX = xMin; screenX < xMax; screenX += simdWidth_inBytes)
        {
            //Initial setup variables for SIMD code
            __m256 one = _mm256_set1_ps(1.0f);
            __m256 zero = _mm256_set1_ps(0.0f);
            __m256 imgWidth = _mm256_set1_ps((f32)imageWidth);
            __m256 imgHeight = _mm256_set1_ps((f32)imageHeight);
            __m256 normalizedXAxis_x = _mm256_set1_ps(normalizedXAxis.x);
            __m256 normalizedXAxis_y = _mm256_set1_ps(normalizedXAxis.y);
            __m256 normalizedYAxis_x = _mm256_set1_ps(normalizedYAxis.x);
            __m256 normalizedYAxis_y = _mm256_set1_ps(normalizedYAxis.y);
            __m256 targetRectOrigin_x = _mm256_set1_ps(origin.x);
            __m256 targetRectOrigin_y = _mm256_set1_ps(origin.y);

            __m256 screenPixelCoords_x = _mm256_set_ps((f32)(screenX + 7), (f32)(screenX + 6), (f32)(screenX + 5), (f32)(screenX + 4), (f32)(screenX + 3), (f32)(screenX + 2), (f32)(screenX + 1), (f32)(screenX + 0));
            __m256 screenPixelCoords_y = _mm256_set1_ps((f32)screenY);

            __m256 uvRangeForTexture_u = _mm256_set1_ps(image.uvBounds.At(1).u - image.uvBounds.At(0).u);
            __m256 uvRangeForTexture_v = _mm256_set1_ps(image.uvBounds.At(1).v - image.uvBounds.At(0).v);

            __m256 minUVBounds_u = _mm256_set1_ps(image.uvBounds.At(0).u);
            __m256 minUVBounds_v = _mm256_set1_ps(image.uvBounds.At(0).v);

            //Gather normalized coordinates (uv's) in order to find the correct texel position below
            __m256 dXs = _mm256_sub_ps(screenPixelCoords_x, targetRectOrigin_x);
            __m256 dYs = _mm256_sub_ps(screenPixelCoords_y, targetRectOrigin_y);
            __m256 Us = _mm256_add_ps(_mm256_mul_ps(dXs, normalizedXAxis_x), _mm256_mul_ps(dYs, normalizedXAxis_y));
            __m256 Vs = _mm256_add_ps(_mm256_mul_ps(dXs, normalizedYAxis_x), _mm256_mul_ps(dYs, normalizedYAxis_y));

            /* clang-format off */
            //Using a mask to determine which colors the final 8-wide pixel destination buffer should accept
            //(background texels or image texels). This replaces the need for a conditional
            __m256i writeMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(Us, zero, _CMP_GE_OQ),
                                                                  _mm256_cmp_ps(Us, one, _CMP_LE_OQ)),
                                                                  _mm256_and_ps(_mm256_cmp_ps(Vs, zero, _CMP_GE_OQ),
                                                                  _mm256_cmp_ps(Vs, one, _CMP_LE_OQ))));
            /* clang-format on */

            __m256i clipMask = _mm256_set1_epi32(0xFFFFFFFF);

            //See how much the final 8-pixel-wide dest buffer will extend past the max boundary of the screen region (if at all)
            //and adjust it
            if (screenX > ((i32)widthMax - simdWidth_inBytes))
            {
                i32 diff = (i32)widthMax - (i32)screenX;
                i32 amountOfScreenOverflow = simdWidth_inBytes - diff;

                i32 index { 7 };
                while (amountOfScreenOverflow)
                {
                    clipMask.m256i_u32[index] = 0;
                    index -= 1;
                    --amountOfScreenOverflow;
                };
            };

            //Clamp UVs to prevent accessing memory that is invalid
            Us = _mm256_min_ps(_mm256_max_ps(Us, zero), one);
            Vs = _mm256_min_ps(_mm256_max_ps(Vs, zero), one);

            __m256 textureUs = _mm256_add_ps(minUVBounds_u, _mm256_mul_ps(uvRangeForTexture_u, Us));
            __m256 textureVs = _mm256_add_ps(minUVBounds_v, _mm256_mul_ps(uvRangeForTexture_v, Vs));

            __m256 texelCoords_x = _mm256_mul_ps(textureUs, imgWidth);
            __m256 texelCoords_y = _mm256_mul_ps(textureVs, imgHeight);

            __m256i sampleTexelAs {}, sampleTexelBs {}, sampleTexelCs {}, sampleTexelDs {};
            for (i32 index {}; index < 8; ++index)
            {
                BGZ_ASSERT((texelCoords_x.m256_f32[index] >= 0) && (texelCoords_x.m256_f32[index] <= (i32)image.size.width), "x coord is out of range!: ");
                BGZ_ASSERT((texelCoords_y.m256_f32[index] >= 0) && (texelCoords_y.m256_f32[index] <= (i32)image.size.height), "y coord is out of range!");

                //Gather 4 texels (in a square pattern) from certain texel Ptr
                ui8* texelPtr = ((ui8*)image.colorData) + ((ui32)texelCoords_y.m256_f32[index] * image.pitch_pxls) + ((ui32)texelCoords_x.m256_f32[index] * sizeof(ui32)); //size of pixel
                sampleTexelAs.m256i_u32[index] = *(ui32*)(texelPtr);
                sampleTexelBs.m256i_u32[index] = *(ui32*)(texelPtr + sizeof(ui32));
                sampleTexelCs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls);
                sampleTexelDs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls + sizeof(ui32));
            };

#if __AVX2__
            //Unpack 4 sample texels to prepare for bilinear blend
            __m256i maskFF = _mm256_set1_epi32(0xFF);
            __m256 texelA_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelAs, maskFF));
            __m256 texelA_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 8), maskFF));
            __m256 texelA_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 16), maskFF));
            __m256 texelA_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 24), maskFF));
            __m256 texelB_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelBs, maskFF));
            __m256 texelB_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 8), maskFF));
            __m256 texelB_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 16), maskFF));
            __m256 texelB_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 24), maskFF));
            __m256 texelC_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelCs, maskFF));
            __m256 texelC_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 8), maskFF));
            __m256 texelC_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 16), maskFF));
            __m256 texelC_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 24), maskFF));
            __m256 texelD_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelDs, maskFF));
            __m256 texelD_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 8), maskFF));
            __m256 texelD_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 16), maskFF));
            __m256 texelD_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 24), maskFF));
            __m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
            __m256 backgroundColors_b = _mm256_cvtepi32_ps(_mm256_and_si256(backGroundPixels, maskFF));
            __m256 backgroundColors_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 8), maskFF));
            __m256 backgroundColors_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 16), maskFF));
            __m256 backgroundColors_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 24), maskFF));

#elif __AVX__
            //Unpack 4 sample texels to prepare for bilinear blend
            __m256i* ptrToSampleTexelAs = &sampleTexelAs;
            __m256 texelA_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 28), (f32) * ((ui8*)ptrToSampleTexelAs + 24), (f32) * ((ui8*)ptrToSampleTexelAs + 20), (f32) * ((ui8*)ptrToSampleTexelAs + 16), (f32) * ((ui8*)ptrToSampleTexelAs + 12), (f32) * ((ui8*)ptrToSampleTexelAs + 8), (f32) * ((ui8*)ptrToSampleTexelAs + 4), (f32) * ((ui8*)ptrToSampleTexelAs + 0));
            __m256 texelA_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 29), (f32) * ((ui8*)ptrToSampleTexelAs + 25), (f32) * ((ui8*)ptrToSampleTexelAs + 21), (f32) * ((ui8*)ptrToSampleTexelAs + 17), (f32) * ((ui8*)ptrToSampleTexelAs + 13), (f32) * ((ui8*)ptrToSampleTexelAs + 9), (f32) * ((ui8*)ptrToSampleTexelAs + 5), (f32) * ((ui8*)ptrToSampleTexelAs + 1));
            __m256 texelA_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 30), (f32) * ((ui8*)ptrToSampleTexelAs + 26), (f32) * ((ui8*)ptrToSampleTexelAs + 22), (f32) * ((ui8*)ptrToSampleTexelAs + 18), (f32) * ((ui8*)ptrToSampleTexelAs + 14), (f32) * ((ui8*)ptrToSampleTexelAs + 10), (f32) * ((ui8*)ptrToSampleTexelAs + 6), (f32) * ((ui8*)ptrToSampleTexelAs + 2));
            __m256 texelA_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelAs + 31), (f32) * ((ui8*)ptrToSampleTexelAs + 27), (f32) * ((ui8*)ptrToSampleTexelAs + 23), (f32) * ((ui8*)ptrToSampleTexelAs + 19), (f32) * ((ui8*)ptrToSampleTexelAs + 15), (f32) * ((ui8*)ptrToSampleTexelAs + 11), (f32) * ((ui8*)ptrToSampleTexelAs + 7), (f32) * ((ui8*)ptrToSampleTexelAs + 3));

            __m256i* ptrToSampleTexelBs = &sampleTexelBs;
            __m256 texelB_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 28), (f32) * ((ui8*)ptrToSampleTexelBs + 24), (f32) * ((ui8*)ptrToSampleTexelBs + 20), (f32) * ((ui8*)ptrToSampleTexelBs + 16), (f32) * ((ui8*)ptrToSampleTexelBs + 12), (f32) * ((ui8*)ptrToSampleTexelBs + 8), (f32) * ((ui8*)ptrToSampleTexelBs + 4), (f32) * ((ui8*)ptrToSampleTexelBs + 0));
            __m256 texelB_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 29), (f32) * ((ui8*)ptrToSampleTexelBs + 25), (f32) * ((ui8*)ptrToSampleTexelBs + 21), (f32) * ((ui8*)ptrToSampleTexelBs + 17), (f32) * ((ui8*)ptrToSampleTexelBs + 13), (f32) * ((ui8*)ptrToSampleTexelBs + 9), (f32) * ((ui8*)ptrToSampleTexelBs + 5), (f32) * ((ui8*)ptrToSampleTexelBs + 1));
            __m256 texelB_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 30), (f32) * ((ui8*)ptrToSampleTexelBs + 26), (f32) * ((ui8*)ptrToSampleTexelBs + 22), (f32) * ((ui8*)ptrToSampleTexelBs + 18), (f32) * ((ui8*)ptrToSampleTexelBs + 14), (f32) * ((ui8*)ptrToSampleTexelBs + 10), (f32) * ((ui8*)ptrToSampleTexelBs + 6), (f32) * ((ui8*)ptrToSampleTexelBs + 2));
            __m256 texelB_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelBs + 31), (f32) * ((ui8*)ptrToSampleTexelBs + 27), (f32) * ((ui8*)ptrToSampleTexelBs + 23), (f32) * ((ui8*)ptrToSampleTexelBs + 19), (f32) * ((ui8*)ptrToSampleTexelBs + 15), (f32) * ((ui8*)ptrToSampleTexelBs + 11), (f32) * ((ui8*)ptrToSampleTexelBs + 7), (f32) * ((ui8*)ptrToSampleTexelBs + 3));

            __m256i* ptrToSampleTexelCs = &sampleTexelCs;
            __m256 texelC_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 28), (f32) * ((ui8*)ptrToSampleTexelCs + 24), (f32) * ((ui8*)ptrToSampleTexelCs + 20), (f32) * ((ui8*)ptrToSampleTexelCs + 16), (f32) * ((ui8*)ptrToSampleTexelCs + 12), (f32) * ((ui8*)ptrToSampleTexelCs + 8), (f32) * ((ui8*)ptrToSampleTexelCs + 4), (f32) * ((ui8*)ptrToSampleTexelCs + 0));
            __m256 texelC_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 29), (f32) * ((ui8*)ptrToSampleTexelCs + 25), (f32) * ((ui8*)ptrToSampleTexelCs + 21), (f32) * ((ui8*)ptrToSampleTexelCs + 17), (f32) * ((ui8*)ptrToSampleTexelCs + 13), (f32) * ((ui8*)ptrToSampleTexelCs + 9), (f32) * ((ui8*)ptrToSampleTexelCs + 5), (f32) * ((ui8*)ptrToSampleTexelCs + 1));
            __m256 texelC_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 30), (f32) * ((ui8*)ptrToSampleTexelCs + 26), (f32) * ((ui8*)ptrToSampleTexelCs + 22), (f32) * ((ui8*)ptrToSampleTexelCs + 18), (f32) * ((ui8*)ptrToSampleTexelCs + 14), (f32) * ((ui8*)ptrToSampleTexelCs + 10), (f32) * ((ui8*)ptrToSampleTexelCs + 6), (f32) * ((ui8*)ptrToSampleTexelCs + 2));
            __m256 texelC_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelCs + 31), (f32) * ((ui8*)ptrToSampleTexelCs + 27), (f32) * ((ui8*)ptrToSampleTexelCs + 23), (f32) * ((ui8*)ptrToSampleTexelCs + 19), (f32) * ((ui8*)ptrToSampleTexelCs + 15), (f32) * ((ui8*)ptrToSampleTexelCs + 11), (f32) * ((ui8*)ptrToSampleTexelCs + 7), (f32) * ((ui8*)ptrToSampleTexelCs + 3));

            __m256i* ptrToSampleTexelDs = &sampleTexelDs;
            __m256 texelD_b = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 28), (f32) * ((ui8*)ptrToSampleTexelDs + 24), (f32) * ((ui8*)ptrToSampleTexelDs + 20), (f32) * ((ui8*)ptrToSampleTexelDs + 16), (f32) * ((ui8*)ptrToSampleTexelDs + 12), (f32) * ((ui8*)ptrToSampleTexelDs + 8), (f32) * ((ui8*)ptrToSampleTexelDs + 4), (f32) * ((ui8*)ptrToSampleTexelDs + 0));
            __m256 texelD_g = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 29), (f32) * ((ui8*)ptrToSampleTexelDs + 25), (f32) * ((ui8*)ptrToSampleTexelDs + 21), (f32) * ((ui8*)ptrToSampleTexelDs + 17), (f32) * ((ui8*)ptrToSampleTexelDs + 13), (f32) * ((ui8*)ptrToSampleTexelDs + 9), (f32) * ((ui8*)ptrToSampleTexelDs + 5), (f32) * ((ui8*)ptrToSampleTexelDs + 1));
            __m256 texelD_r = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 30), (f32) * ((ui8*)ptrToSampleTexelDs + 26), (f32) * ((ui8*)ptrToSampleTexelDs + 22), (f32) * ((ui8*)ptrToSampleTexelDs + 18), (f32) * ((ui8*)ptrToSampleTexelDs + 14), (f32) * ((ui8*)ptrToSampleTexelDs + 10), (f32) * ((ui8*)ptrToSampleTexelDs + 6), (f32) * ((ui8*)ptrToSampleTexelDs + 2));
            __m256 texelD_a = _mm256_set_ps((f32) * ((ui8*)ptrToSampleTexelDs + 31), (f32) * ((ui8*)ptrToSampleTexelDs + 27), (f32) * ((ui8*)ptrToSampleTexelDs + 23), (f32) * ((ui8*)ptrToSampleTexelDs + 19), (f32) * ((ui8*)ptrToSampleTexelDs + 15), (f32) * ((ui8*)ptrToSampleTexelDs + 11), (f32) * ((ui8*)ptrToSampleTexelDs + 7), (f32) * ((ui8*)ptrToSampleTexelDs + 3));

            __m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
            __m256i* ptrToBackgroundPixels = &backGroundPixels;
            __m256 backgroundColors_b = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 28), (f32) * ((ui8*)ptrToBackgroundPixels + 24), (f32) * ((ui8*)ptrToBackgroundPixels + 20), (f32) * ((ui8*)ptrToBackgroundPixels + 16), (f32) * ((ui8*)ptrToBackgroundPixels + 12), (f32) * ((ui8*)ptrToBackgroundPixels + 8), (f32) * ((ui8*)ptrToBackgroundPixels + 4), (f32) * ((ui8*)ptrToBackgroundPixels + 0));
            __m256 backgroundColors_g = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 29), (f32) * ((ui8*)ptrToBackgroundPixels + 25), (f32) * ((ui8*)ptrToBackgroundPixels + 21), (f32) * ((ui8*)ptrToBackgroundPixels + 17), (f32) * ((ui8*)ptrToBackgroundPixels + 13), (f32) * ((ui8*)ptrToBackgroundPixels + 9), (f32) * ((ui8*)ptrToBackgroundPixels + 5), (f32) * ((ui8*)ptrToBackgroundPixels + 1));
            __m256 backgroundColors_r = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 30), (f32) * ((ui8*)ptrToBackgroundPixels + 26), (f32) * ((ui8*)ptrToBackgroundPixels + 22), (f32) * ((ui8*)ptrToBackgroundPixels + 18), (f32) * ((ui8*)ptrToBackgroundPixels + 14), (f32) * ((ui8*)ptrToBackgroundPixels + 10), (f32) * ((ui8*)ptrToBackgroundPixels + 6), (f32) * ((ui8*)ptrToBackgroundPixels + 2));
            __m256 backgroundColors_a = _mm256_set_ps((f32) * ((ui8*)ptrToBackgroundPixels + 31), (f32) * ((ui8*)ptrToBackgroundPixels + 27), (f32) * ((ui8*)ptrToBackgroundPixels + 23), (f32) * ((ui8*)ptrToBackgroundPixels + 19), (f32) * ((ui8*)ptrToBackgroundPixels + 15), (f32) * ((ui8*)ptrToBackgroundPixels + 11), (f32) * ((ui8*)ptrToBackgroundPixels + 7), (f32) * ((ui8*)ptrToBackgroundPixels + 3));
#endif

            //Bilinear blend
            __m256 percentToLerpInX = _mm256_sub_ps(texelCoords_x, _mm256_floor_ps(texelCoords_x));
            __m256 percentToLerpInY = _mm256_sub_ps(texelCoords_y, _mm256_floor_ps(texelCoords_y));
            __m256 oneMinusXLerp = _mm256_sub_ps(one, percentToLerpInX);
            __m256 oneMinusYLerp = _mm256_sub_ps(one, percentToLerpInY);
            __m256 coefficient1 = _mm256_mul_ps(oneMinusYLerp, oneMinusXLerp);
            __m256 coefficient2 = _mm256_mul_ps(oneMinusYLerp, percentToLerpInX);
            __m256 coefficient3 = _mm256_mul_ps(percentToLerpInY, oneMinusXLerp);
            __m256 coefficient4 = _mm256_mul_ps(percentToLerpInY, percentToLerpInX);
            __m256 newBlendedTexel_r = _mm256_add_ps(
                _mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_r), _mm256_mul_ps(coefficient2, texelB_r)),
                _mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_r), _mm256_mul_ps(coefficient4, texelD_r)));
            __m256 newBlendedTexel_g = _mm256_add_ps(
                _mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_g), _mm256_mul_ps(coefficient2, texelB_g)),
                _mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_g), _mm256_mul_ps(coefficient4, texelD_g)));
            __m256 newBlendedTexel_b = _mm256_add_ps(
                _mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_b), _mm256_mul_ps(coefficient2, texelB_b)),
                _mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_b), _mm256_mul_ps(coefficient4, texelD_b)));
            __m256 newBlendedTexel_a = _mm256_add_ps(
                _mm256_add_ps(_mm256_mul_ps(coefficient1, texelA_a), _mm256_mul_ps(coefficient2, texelB_a)),
                _mm256_add_ps(_mm256_mul_ps(coefficient3, texelC_a), _mm256_mul_ps(coefficient4, texelD_a)));

            //Linear blend (w/ pre multiplied alpha)
            __m256 maxColorValue = _mm256_set1_ps(255.0f);
            __m256 alphaBlend = _mm256_div_ps(newBlendedTexel_a, maxColorValue);
            __m256 oneMinusAlphaBlend = _mm256_sub_ps(one, alphaBlend);
            __m256 finalBlendedColor_r = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_r), newBlendedTexel_r);
            __m256 finalBlendedColor_g = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_g), newBlendedTexel_g);
            __m256 finalBlendedColor_b = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_b), newBlendedTexel_b);
            __m256 finalBlendedColor_a = _mm256_add_ps(_mm256_mul_ps(oneMinusAlphaBlend, backgroundColors_a), newBlendedTexel_a);

#if __AVX2__
            { //Convert and Pack into dest pixels to write out
                __m256i finalBlendedColori_r = _mm256_cvtps_epi32(finalBlendedColor_r);
                __m256i finalBlendedColori_g = _mm256_cvtps_epi32(finalBlendedColor_g);
                __m256i finalBlendedColori_b = _mm256_cvtps_epi32(finalBlendedColor_b);
                __m256i finalBlendedColori_a = _mm256_cvtps_epi32(finalBlendedColor_a);

                //Move pixels (through bitwise operations and shifting) from RRRR GGGG etc. format to expected BGRA format
                __m256i out = _mm256_or_si256(_mm256_or_si256(_mm256_or_si256(_mm256_slli_epi32(finalBlendedColori_r, 16), _mm256_slli_epi32(finalBlendedColori_g, 8)), finalBlendedColori_b), _mm256_slli_epi32(finalBlendedColori_a, 24));

                //Use write mask in order to correctly fill 8 wide pixel lane (properly writing either the texel color or
                //the background color)
                __m256i maskedOut = _mm256_or_si256(_mm256_and_si256(writeMask, out),
                    _mm256_andnot_si256(writeMask, backGroundPixels));

                maskedOut = _mm256_or_si256(_mm256_and_si256(clipMask, maskedOut),
                    _mm256_andnot_si256(clipMask, *(__m256i*)destPixel));

                *(__m256i*)destPixel = maskedOut;
            };

#elif __AVX__
            { //Convert and Pack into dest pixels to write out
                __m256i finalBlendedColori_r = _mm256_cvtps_epi32(finalBlendedColor_r);
                __m256i finalBlendedColori_g = _mm256_cvtps_epi32(finalBlendedColor_g);
                __m256i finalBlendedColori_b = _mm256_cvtps_epi32(finalBlendedColor_b);
                __m256i finalBlendedColori_a = _mm256_cvtps_epi32(finalBlendedColor_a);

                __m256i backgroundColorsi_r = _mm256_cvtps_epi32(backgroundColors_r);
                __m256i backgroundColorsi_g = _mm256_cvtps_epi32(backgroundColors_g);
                __m256i backgroundColorsi_b = _mm256_cvtps_epi32(backgroundColors_b);
                __m256i backgroundColorsi_a = _mm256_cvtps_epi32(backgroundColors_a);

                //Since AVX doesn't have certain bitwise operations I need to extract 128 bit values from
                //256 bit ones and then use the available bitwise operations on those
                __m128i pixelSet1_r = _mm256_extractf128_si256(finalBlendedColori_r, 0);
                __m128i pixelSet2_r = _mm256_extractf128_si256(finalBlendedColori_r, 1);
                __m128i pixelSet1_g = _mm256_extractf128_si256(finalBlendedColori_g, 0);
                __m128i pixelSet2_g = _mm256_extractf128_si256(finalBlendedColori_g, 1);
                __m128i pixelSet1_b = _mm256_extractf128_si256(finalBlendedColori_b, 0);
                __m128i pixelSet2_b = _mm256_extractf128_si256(finalBlendedColori_b, 1);
                __m128i pixelSet1_a = _mm256_extractf128_si256(finalBlendedColori_a, 0);
                __m128i pixelSet2_a = _mm256_extractf128_si256(finalBlendedColori_a, 1);
                __m128i backgroundPixelSet1_r = _mm256_extractf128_si256(backgroundColorsi_r, 0);
                __m128i backgroundPixelSet2_r = _mm256_extractf128_si256(backgroundColorsi_r, 1);
                __m128i backgroundPixelSet1_g = _mm256_extractf128_si256(backgroundColorsi_g, 0);
                __m128i backgroundPixelSet2_g = _mm256_extractf128_si256(backgroundColorsi_g, 1);
                __m128i backgroundPixelSet1_b = _mm256_extractf128_si256(backgroundColorsi_b, 0);
                __m128i backgroundPixelSet2_b = _mm256_extractf128_si256(backgroundColorsi_b, 1);
                __m128i backgroundPixelSet1_a = _mm256_extractf128_si256(backgroundColorsi_a, 0);
                __m128i backgroundPixelSet2_a = _mm256_extractf128_si256(backgroundColorsi_a, 1);
                __m128i writeMaskSet1 = _mm256_extractf128_si256(writeMask, 0);
                __m128i writeMaskSet2 = _mm256_extractf128_si256(writeMask, 1);
                __m128i clipMaskSet1 = _mm256_extractf128_si256(clipMask, 0);
                __m128i clipMaskSet2 = _mm256_extractf128_si256(clipMask, 1);

                //Move pixels (through bitwise operations and shifting) from RRRR GGGG ... format to expected BGRA format
                __m128i pixels1Through4 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(pixelSet1_r, 16), _mm_slli_epi32(pixelSet1_g, 8)), pixelSet1_b), _mm_slli_epi32(pixelSet1_a, 24));
                __m128i pixels5Through8 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(pixelSet2_r, 16), _mm_slli_epi32(pixelSet2_g, 8)), pixelSet2_b), _mm_slli_epi32(pixelSet2_a, 24));
                __m128i backgroundPixels1Through4 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(backgroundPixelSet1_r, 16), _mm_slli_epi32(backgroundPixelSet1_g, 8)), backgroundPixelSet1_b), _mm_slli_epi32(backgroundPixelSet1_a, 24));
                __m128i backgroundPixels5Through8 = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_slli_epi32(backgroundPixelSet2_r, 16), _mm_slli_epi32(backgroundPixelSet2_g, 8)), backgroundPixelSet2_b), _mm_slli_epi32(backgroundPixelSet2_a, 24));

                //Use write mask in order to correctly fill 8 wide pixel lane (properly writing either the texel color or
                //the background color)
                __m128i maskedOutSet1 = _mm_or_si128(_mm_and_si128(writeMaskSet1, pixels1Through4),
                    _mm_andnot_si128(writeMaskSet1, backgroundPixels1Through4));

                __m128i maskedOutSet2 = _mm_or_si128(_mm_and_si128(writeMaskSet2, pixels5Through8),
                    _mm_andnot_si128(writeMaskSet2, backgroundPixels5Through8));

                maskedOutSet1 = _mm_or_si128(_mm_and_si128(clipMaskSet1, maskedOutSet1),
                    _mm_andnot_si128(clipMaskSet1, *(__m128i*)destPixel));

                maskedOutSet2 = _mm_or_si128(_mm_and_si128(clipMaskSet2, maskedOutSet2),
                    _mm_andnot_si128(clipMaskSet2, _mm256_extractf128_si256(*(__m256i*)destPixel, 1)));

                //Pack 128 bit pixel values back into 256 bit values to write out
                __m256i maskedOut = _mm256_castsi128_si256(maskedOutSet1);
                maskedOut = _mm256_insertf128_si256(maskedOut, maskedOutSet2, 1);

                *(__m256i*)destPixel = maskedOut;
            };

#endif

            destPixel += 8;
        };

        currentRow += colorBufferPitch;
    };
};
Mārtiņš Možeiko
1998 posts / 1 project
Further optimizing software renderer
1 month ago
boagz57:
"though I'm not really sure how that would help"

It will help avoid doing all those shifts & ANDs to extract r/g/b values and also avoid doing shifts & ORs to pack for storing.
With one load you'll load all the reds (or greens/blues). All you need to do is expand byte to float, or put it back for writing to memory.
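
A minimal sketch of the idea, assuming AVX2 and a hypothetical channelPtr that points at 8 contiguous blue bytes (the name is a placeholder, not from the code above):

__m128i blueBytes  = _mm_loadl_epi64((__m128i const*)channelPtr); //load just the 8 blue bytes
__m256i blueInts   = _mm256_cvtepu8_epi32(blueBytes);             //widen to 8 x 32-bit ints (AVX2)
__m256  blueFloats = _mm256_cvtepi32_ps(blueInts);                //8 floats, no shifts or masks needed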

Another optimization possibility is to avoid floats. Do everything in 8-bit or 16-bit integers for the intermediate results (using extra registers). That should allow you to process 256/8 = 32 pixels in one iteration, not just 8.
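
A minimal sketch of staying in integers, assuming a register holding 32 channel bytes and a single broadcast blend weight in 0..256 (channelBytes and weight are placeholder names):

__m256i zeroes   = _mm256_setzero_si256();
__m256i weight16 = _mm256_set1_epi16(weight);                                 //blend weight, 0..256
__m256i lo16     = _mm256_unpacklo_epi8(channelBytes, zeroes);                //widen 8-bit -> 16-bit lanes
__m256i hi16     = _mm256_unpackhi_epi8(channelBytes, zeroes);
__m256i loBlend  = _mm256_srli_epi16(_mm256_mullo_epi16(lo16, weight16), 8);  //value * weight >> 8, all integer
__m256i hiBlend  = _mm256_srli_epi16(_mm256_mullo_epi16(hi16, weight16), 8);
__m256i result   = _mm256_packus_epi16(loBlend, hiBlend);                     //pack back to 32 bytes per register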

Another common approach in CPU rasterizers (and even in GPU implementations) is to store memory in tiled order. Instead of storing the image line-by-line, store it as 4x4 chunks (or 8x8, or 16x8 - you need to try and measure what's better). This way loading pixels for bilinear blending will give better cache locality.
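
A minimal sketch of the 4x4 case - a hypothetical offset helper (none of these names are from the thread), where tilesPerRow would be the image width (padded to a multiple of 4) divided by 4:

static inline ui32 TiledTexelOffset(ui32 x, ui32 y, ui32 tilesPerRow)
{
    ui32 tileX = x / 4, tileY = y / 4;            //which 4x4 tile
    ui32 inX = x % 4, inY = y % 4;                //position inside the tile
    ui32 tileIndex = tileY * tilesPerRow + tileX;
    return tileIndex * 16 + inY * 4 + inX;        //offset in texels (multiply by 4 for a byte offset)
}

With this layout the 2x2 neighborhood a bilinear sample touches usually lands inside one tile, i.e. one or two cache lines instead of two separate image rows.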

Find some articles about how Larrabee was doing rasterization; it may have some good ideas to apply to your case.
Jason
135 posts
Further optimizing software renderer
1 month ago
Awesome, thanks for those suggestions. I am going to try and dive deep into this problem and see what I can do. I will probably post back in this thread at some point if I have another question or I might just show current progress.
Jason
135 posts
Further optimizing software renderer
4 weeks, 1 day ago
So I've been working with my current software renderer and I'm trying to see if I can optimize things using the first method suggested by Martins, where I send pixel colors to the renderer in the format BBBB BBBB GGGG GGGG RRRR RRRR AAAA AAAA (assuming 8-wide SIMD instructions) instead of BGRA BGRA BGRA etc. I have done a preliminary pass and have been able to get things running again with the above format, but with incorrect colors outputting to the screen.

Original bottlenecked code:

//Clamp UVs to prevent accessing memory that is invalid
            Us = _mm256_min_ps(_mm256_max_ps(Us, zero), one);
            Vs = _mm256_min_ps(_mm256_max_ps(Vs, zero), one);

            __m256 textureUs = _mm256_add_ps(minUVBounds_u, _mm256_mul_ps(uvRangeForTexture_u, Us));
            __m256 textureVs = _mm256_add_ps(minUVBounds_v, _mm256_mul_ps(uvRangeForTexture_v, Vs));

            __m256 texelCoords_x = _mm256_mul_ps(textureUs, imgWidth);
            __m256 texelCoords_y = _mm256_mul_ps(textureVs, imgHeight);

            __m256i sampleTexelAs {}, sampleTexelBs {}, sampleTexelCs {}, sampleTexelDs {};
            for (i32 index {}; index < 8; ++index)
            {
                BGZ_ASSERT((texelCoords_x.m256_f32[index] >= 0) && (texelCoords_x.m256_f32[index] <= (i32)image.size.width), "x coord is out of range!: ");
                BGZ_ASSERT((texelCoords_y.m256_f32[index] >= 0) && (texelCoords_y.m256_f32[index] <= (i32)image.size.height), "y coord is out of range!");

                //Gather 4 texels (in a square pattern) from certain texel Ptr
                ui8* texelPtr = ((ui8*)image.colorData) + ((ui32)texelCoords_y.m256_f32[index] * image.pitch_pxls) + ((ui32)texelCoords_x.m256_f32[index] * sizeof(ui32)); //size of pixel
                sampleTexelAs.m256i_u32[index] = *(ui32*)(texelPtr);
                sampleTexelBs.m256i_u32[index] = *(ui32*)(texelPtr + sizeof(ui32));
                sampleTexelCs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls);
                sampleTexelDs.m256i_u32[index] = *(ui32*)(texelPtr + image.pitch_pxls + sizeof(ui32));
            };

#if __AVX2__
            //Unpack 4 sample texels to prepare for bilinear blend. Have to do a bunch of bit shifting/masking here
            __m256i maskFF = _mm256_set1_epi32(0xFF);
            __m256 texelA_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelAs, maskFF));
            __m256 texelA_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 8), maskFF));
            __m256 texelA_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 16), maskFF));
            __m256 texelA_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelAs, 24), maskFF));
            __m256 texelB_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelBs, maskFF));
            __m256 texelB_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 8), maskFF));
            __m256 texelB_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 16), maskFF));
            __m256 texelB_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelBs, 24), maskFF));
            __m256 texelC_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelCs, maskFF));
            __m256 texelC_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 8), maskFF));
            __m256 texelC_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 16), maskFF));
            __m256 texelC_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelCs, 24), maskFF));
            __m256 texelD_b = _mm256_cvtepi32_ps(_mm256_and_si256(sampleTexelDs, maskFF));
            __m256 texelD_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 8), maskFF));
            __m256 texelD_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 16), maskFF));
            __m256 texelD_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(sampleTexelDs, 24), maskFF));
            __m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
            __m256 backgroundColors_b = _mm256_cvtepi32_ps(_mm256_and_si256(backGroundPixels, maskFF));
            __m256 backgroundColors_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 8), maskFF));
            __m256 backgroundColors_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 16), maskFF));
            __m256 backgroundColors_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 24), maskFF));


New, optimized code:

//Clamp UVs to prevent accessing memory that is invalid
            Us = _mm256_min_ps(_mm256_max_ps(Us, zero), one);
            Vs = _mm256_min_ps(_mm256_max_ps(Vs, zero), one);

            __m256 textureUs = _mm256_add_ps(minUVBounds_u, _mm256_mul_ps(uvRangeForTexture_u, Us));
            __m256 textureVs = _mm256_add_ps(minUVBounds_v, _mm256_mul_ps(uvRangeForTexture_v, Vs));

            __m256 texelCoords_x = _mm256_mul_ps(textureUs, imgWidth);
            __m256 texelCoords_y = _mm256_mul_ps(textureVs, imgHeight);

            BGZ_ASSERT((texelCoords_x.m256_f32[0] >= 0) && (texelCoords_x.m256_f32[0] <= (i32)image.size.width), "x coord is out of range!: ");
            BGZ_ASSERT((texelCoords_y.m256_f32[0] >= 0) && (texelCoords_y.m256_f32[0] <= (i32)image.size.height), "y coord is out of range!");

            //Gather 4 texels (in a square pattern) from certain texel Ptr
            ui8* texelPtr = ((ui8*)image.colorData) + ((ui32)texelCoords_y.m256_f32[0] * image.pitch_pxls) + ((ui32)texelCoords_x.m256_f32[0] * sizeof(ui32)); //size of pixel
            __m256i* texelA = (__m256i*)(texelPtr);
            __m256i* texelB = (__m256i*)(texelPtr + sizeof(__m256i));
            __m256i* texelC = (__m256i*)(texelPtr + image.pitch_pxls);
            __m256i* texelD = (__m256i*)(texelPtr + image.pitch_pxls + sizeof(__m256i));

            __m256i sampleTexelAs_b = _mm256_set_epi32((ui32)(*((ui8*)texelA + 0)), (ui32)(*((ui8*)texelA + 1)), (ui32)(*((ui8*)texelA + 2)), (ui32)(*((ui8*)texelA + 3)), (ui32)(*((ui8*)texelA + 4)), (ui32)(*((ui8*)texelA + 5)), (ui32)(*((ui8*)texelA + 6)), (ui32)(*((ui8*)texelA + 7)));
            __m256i sampleTexelBs_b = _mm256_set_epi32((ui32)(*((ui8*)texelB + 0)), (ui32)(*((ui8*)texelB + 1)), (ui32)(*((ui8*)texelB + 2)), (ui32)(*((ui8*)texelB + 3)), (ui32)(*((ui8*)texelB + 4)), (ui32)(*((ui8*)texelB + 5)), (ui32)(*((ui8*)texelB + 6)), (ui32)(*((ui8*)texelB + 7)));
            __m256i sampleTexelCs_b = _mm256_set_epi32((ui32)(*((ui8*)texelC + 0)), (ui32)(*((ui8*)texelC + 1)), (ui32)(*((ui8*)texelC + 2)), (ui32)(*((ui8*)texelC + 3)), (ui32)(*((ui8*)texelC + 4)), (ui32)(*((ui8*)texelC + 5)), (ui32)(*((ui8*)texelC + 6)), (ui32)(*((ui8*)texelC + 7)));
            __m256i sampleTexelDs_b = _mm256_set_epi32((ui32)(*((ui8*)texelD + 0)), (ui32)(*((ui8*)texelD + 1)), (ui32)(*((ui8*)texelD + 2)), (ui32)(*((ui8*)texelD + 3)), (ui32)(*((ui8*)texelD + 4)), (ui32)(*((ui8*)texelD + 5)), (ui32)(*((ui8*)texelD + 6)), (ui32)(*((ui8*)texelD + 7)));

            __m256i sampleTexelAs_g = _mm256_set_epi32((ui32)(*((ui8*)texelA + 8)), (ui32)(*((ui8*)texelA + 9)), (ui32)(*((ui8*)texelA + 10)), (ui32)(*((ui8*)texelA + 11)), (ui32)(*((ui8*)texelA + 12)), (ui32)(*((ui8*)texelA + 13)), (ui32)(*((ui8*)texelA + 14)), (ui32)(*((ui8*)texelA + 15)));
            __m256i sampleTexelBs_g = _mm256_set_epi32((ui32)(*((ui8*)texelB + 8)), (ui32)(*((ui8*)texelB + 9)), (ui32)(*((ui8*)texelB + 10)), (ui32)(*((ui8*)texelB + 11)), (ui32)(*((ui8*)texelB + 12)), (ui32)(*((ui8*)texelB + 13)), (ui32)(*((ui8*)texelB + 14)), (ui32)(*((ui8*)texelB + 15)));
            __m256i sampleTexelCs_g = _mm256_set_epi32((ui32)(*((ui8*)texelC + 8)), (ui32)(*((ui8*)texelC + 9)), (ui32)(*((ui8*)texelC + 10)), (ui32)(*((ui8*)texelC + 11)), (ui32)(*((ui8*)texelC + 12)), (ui32)(*((ui8*)texelC + 13)), (ui32)(*((ui8*)texelC + 14)), (ui32)(*((ui8*)texelC + 15)));
            __m256i sampleTexelDs_g = _mm256_set_epi32((ui32)(*((ui8*)texelD + 8)), (ui32)(*((ui8*)texelD + 9)), (ui32)(*((ui8*)texelD + 10)), (ui32)(*((ui8*)texelD + 11)), (ui32)(*((ui8*)texelD + 12)), (ui32)(*((ui8*)texelD + 13)), (ui32)(*((ui8*)texelD + 14)), (ui32)(*((ui8*)texelD + 15)));

            __m256i sampleTexelAs_r = _mm256_set_epi32((ui32)(*((ui8*)texelA + 16)), (ui32)(*((ui8*)texelA + 17)), (ui32)(*((ui8*)texelA + 18)), (ui32)(*((ui8*)texelA + 19)), (ui32)(*((ui8*)texelA + 20)), (ui32)(*((ui8*)texelA + 21)), (ui32)(*((ui8*)texelA + 22)), (ui32)(*((ui8*)texelA + 23)));
            __m256i sampleTexelBs_r = _mm256_set_epi32((ui32)(*((ui8*)texelB + 16)), (ui32)(*((ui8*)texelB + 17)), (ui32)(*((ui8*)texelB + 18)), (ui32)(*((ui8*)texelB + 19)), (ui32)(*((ui8*)texelB + 20)), (ui32)(*((ui8*)texelB + 21)), (ui32)(*((ui8*)texelB + 22)), (ui32)(*((ui8*)texelB + 23)));
            __m256i sampleTexelCs_r = _mm256_set_epi32((ui32)(*((ui8*)texelC + 16)), (ui32)(*((ui8*)texelC + 17)), (ui32)(*((ui8*)texelC + 18)), (ui32)(*((ui8*)texelC + 19)), (ui32)(*((ui8*)texelC + 20)), (ui32)(*((ui8*)texelC + 21)), (ui32)(*((ui8*)texelC + 22)), (ui32)(*((ui8*)texelC + 23)));
            __m256i sampleTexelDs_r = _mm256_set_epi32((ui32)(*((ui8*)texelD + 16)), (ui32)(*((ui8*)texelD + 17)), (ui32)(*((ui8*)texelD + 18)), (ui32)(*((ui8*)texelD + 19)), (ui32)(*((ui8*)texelD + 20)), (ui32)(*((ui8*)texelD + 21)), (ui32)(*((ui8*)texelD + 22)), (ui32)(*((ui8*)texelD + 23)));

            __m256i sampleTexelAs_a = _mm256_set_epi32((ui32)(*((ui8*)texelA + 24)), (ui32)(*((ui8*)texelA + 25)), (ui32)(*((ui8*)texelA + 26)), (ui32)(*((ui8*)texelA + 27)), (ui32)(*((ui8*)texelA + 28)), (ui32)(*((ui8*)texelA + 29)), (ui32)(*((ui8*)texelA + 30)), (ui32)(*((ui8*)texelA + 31)));
            __m256i sampleTexelBs_a = _mm256_set_epi32((ui32)(*((ui8*)texelB + 24)), (ui32)(*((ui8*)texelB + 25)), (ui32)(*((ui8*)texelB + 26)), (ui32)(*((ui8*)texelB + 27)), (ui32)(*((ui8*)texelB + 28)), (ui32)(*((ui8*)texelB + 29)), (ui32)(*((ui8*)texelB + 30)), (ui32)(*((ui8*)texelB + 31)));
            __m256i sampleTexelCs_a = _mm256_set_epi32((ui32)(*((ui8*)texelC + 24)), (ui32)(*((ui8*)texelC + 25)), (ui32)(*((ui8*)texelC + 26)), (ui32)(*((ui8*)texelC + 27)), (ui32)(*((ui8*)texelC + 28)), (ui32)(*((ui8*)texelC + 29)), (ui32)(*((ui8*)texelC + 30)), (ui32)(*((ui8*)texelC + 31)));
            __m256i sampleTexelDs_a = _mm256_set_epi32((ui32)(*((ui8*)texelD + 24)), (ui32)(*((ui8*)texelD + 25)), (ui32)(*((ui8*)texelD + 26)), (ui32)(*((ui8*)texelD + 27)), (ui32)(*((ui8*)texelD + 28)), (ui32)(*((ui8*)texelD + 29)), (ui32)(*((ui8*)texelD + 30)), (ui32)(*((ui8*)texelD + 31)));

#if __AVX2__
            __m256i maskFF = _mm256_set1_epi32(0xFF);

            //Unpack 4 sample texels to prepare for bilinear blend. NO more bit shifting/masking needed
            __m256 texelA_b = _mm256_cvtepi32_ps(sampleTexelAs_b);
            __m256 texelB_b = _mm256_cvtepi32_ps(sampleTexelBs_b);
            __m256 texelC_b = _mm256_cvtepi32_ps(sampleTexelCs_b);
            __m256 texelD_b = _mm256_cvtepi32_ps(sampleTexelDs_b);

            __m256 texelA_g = _mm256_cvtepi32_ps(sampleTexelAs_g);
            __m256 texelB_g = _mm256_cvtepi32_ps(sampleTexelBs_g);
            __m256 texelC_g = _mm256_cvtepi32_ps(sampleTexelCs_g);
            __m256 texelD_g = _mm256_cvtepi32_ps(sampleTexelDs_g);

            __m256 texelA_r = _mm256_cvtepi32_ps(sampleTexelAs_r);
            __m256 texelB_r = _mm256_cvtepi32_ps(sampleTexelBs_r);
            __m256 texelC_r = _mm256_cvtepi32_ps(sampleTexelCs_r);
            __m256 texelD_r = _mm256_cvtepi32_ps(sampleTexelDs_r);

            __m256 texelA_a = _mm256_cvtepi32_ps(sampleTexelAs_a);
            __m256 texelB_a = _mm256_cvtepi32_ps(sampleTexelBs_a);
            __m256 texelC_a = _mm256_cvtepi32_ps(sampleTexelCs_a);
            __m256 texelD_a = _mm256_cvtepi32_ps(sampleTexelDs_a);

            __m256i backGroundPixels = _mm256_load_si256((__m256i*)destPixel);
            __m256 backgroundColors_b = _mm256_cvtepi32_ps(_mm256_and_si256(backGroundPixels, maskFF));
            __m256 backgroundColors_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 8), maskFF));
            __m256 backgroundColors_r = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 16), maskFF));
            __m256 backgroundColors_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(backGroundPixels, 24), maskFF));



Things are definitely faster, but of course I now have to get the colors to output correctly. The issue I'm running into has to do with texture UVs. I think I could get things to work properly, with a speed improvement, if I assume a non-rotated image, because the v of each UV calculated will be the same, meaning my texel y positions will all be the same for a given set of 8 texel x positions (assuming the image pitch is a multiple of 8). However, when a texture is rotated I would have situations where the texel y positions of a row of 8 pixels are different, meaning I would have to grab texel colors from different rows of the original image. So I couldn't just load in all the blues, greens, reds and alphas with no hassle (I would have to go back to bit shifting/masking to get colors properly prepared for bilinear filtering, which is what I was doing with the original BGRA format in the first place). I was wondering if anyone had any ideas on how to go about fixing this issue, or any suggestions on how to think about the problem.
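
A minimal sketch of one possible direction for the rotated case (not something from the thread), assuming AVX2 and a texture kept in plain BGRA: _mm256_i32gather_epi32 can fetch 8 texels from 8 independent byte offsets, so differing rows per lane stop being a problem, though the per-channel unpacking cost comes back:

__m256i texelXs     = _mm256_cvttps_epi32(texelCoords_x);
__m256i texelYs     = _mm256_cvttps_epi32(texelCoords_y);
__m256i byteOffsets = _mm256_add_epi32(
    _mm256_mullo_epi32(texelYs, _mm256_set1_epi32((i32)image.pitch_pxls)), //y * pitch (bytes)
    _mm256_slli_epi32(texelXs, 2));                                        //+ x * 4 bytes per pixel
__m256i sampleTexelAs = _mm256_i32gather_epi32((int const*)image.colorData, byteOffsets, 1);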
Mārtiņš Možeiko
1998 posts / 1 project
Further optimizing software renderer
4 weeks, 1 day ago
Not sure how calculating the u/v's is affected by the way you load and store color values in SIMD registers. Does your code work with the "old" way of storing and processing pixels? If so, it should work with the "new" way. There should be no differences. Just make sure the values calculated for the u/v's are the same.

Btw, instead of doing so many byte loads + _mm256_set_epi32 instructions, do one load with _mm256_lddqu_si256 to load 32 bytes. Then do shuffle/unpack operations to take each 8 bytes and expand them into 32-bit values - four registers, each with 8 32-bit values.
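
A minimal sketch of that load-then-unpack step, assuming AVX2, the BBBBBBBB GGGGGGGG RRRRRRRR AAAAAAAA layout, and texelA pointing at one 32-byte group:

__m256i group  = _mm256_lddqu_si256((__m256i const*)texelA);       //one 32-byte load: B, G, R, A planes
__m128i loHalf = _mm256_castsi256_si128(group);                    //bytes 0-15: blues then greens
__m128i hiHalf = _mm256_extractf128_si256(group, 1);               //bytes 16-31: reds then alphas
__m256i blues  = _mm256_cvtepu8_epi32(loHalf);                     //low 8 bytes -> 8 x 32-bit values
__m256i greens = _mm256_cvtepu8_epi32(_mm_srli_si128(loHalf, 8));  //shift the green bytes down first
__m256i reds   = _mm256_cvtepu8_epi32(hiHalf);
__m256i alphas = _mm256_cvtepu8_epi32(_mm_srli_si128(hiHalf, 8));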
Jason
135 posts
Further optimizing software renderer
4 weeks ago Edited by Jason on Dec. 21, 2019, 5:07 p.m.
Hmm, I might be going about things incorrectly or just misunderstanding the solution. Currently, I am loading an image into the game's memory like this (I know this has some bugs in it but it's just my current solution):

Image LoadBitmap_SoftwareRendering_BGRA(const char* fileName)
{
    Image result;

    { //Load image data using stb (w/ user defined read/seek functions and memory allocation functions)
        stbi_set_flip_vertically_on_load(true); //So first byte stbi_load() returns is bottom left instead of top-left of image (which is stb's default)

        i32 numOfLoadedChannels {};
        i32 desiredChannels { 4 }; //Since I still draw assuming 4 byte pixels I need 4 channels

        //Returns RGBA
        unsigned char* imageData = stbi_load(fileName, &result.width_pxls, &result.height_pxls, &numOfLoadedChannels, desiredChannels);
        BGZ_ASSERT(imageData, "Invalid image data!");

        i32 totalPixelCountOfImg = result.width_pxls * result.height_pxls;
        ui32* imagePixel = (ui32*)imageData;

        //Setup imageData for rendering - BBBB BBBB GGGG GGGG RRRR RRRR AAAA AAAA
        i32 diff {};
        for (int numberOfPixelsFilled = 0; numberOfPixelsFilled < totalPixelCountOfImg; numberOfPixelsFilled += 8)
        {
            Array<ui8, 8> red {}, blue {}, green {}, alpha {};
            ui32* temp { imagePixel };
            for (i32 i {}; i < 8; ++i)
            {
                auto color = UnPackPixelValues(*temp, RGBA);
                f32 alphaBlend = color.a / 255.0f;
                color.rgb *= alphaBlend;
                red.At(i) = (ui8)color.r;
                green.At(i) = (ui8)color.g;
                blue.At(i) = (ui8)color.b;
                alpha.At(i) = (ui8)color.a;

                ++temp;
            };

            //Fill 2 pixels worth of blue channels
            *imagePixel++ = ((blue[0] << 0) | (blue[1] << 8) | (blue[2] << 16) | (blue[3] << 24));
            *imagePixel++ = ((blue[4] << 0) | (blue[5] << 8) | (blue[6] << 16) | (blue[7] << 24));
            //Fill 2 pixels worth of green channels
            *imagePixel++ = ((green[0] << 0) | (green[1] << 8) | (green[2] << 16) | (green[3] << 24));
            *imagePixel++ = ((green[4] << 0) | (green[5] << 8) | (green[6] << 16) | (green[7] << 24));
            //Fill 2 pixels worth of red channels
            *imagePixel++ = ((red[0] << 0) | (red[1] << 8) | (red[2] << 16) | (red[3] << 24));
            *imagePixel++ = ((red[4] << 0) | (red[5] << 8) | (red[6] << 16) | (red[7] << 24));
            //Fill 2 pixels worth of alpha channels
            *imagePixel++ = ((alpha[0] << 0) | (alpha[1] << 8) | (alpha[2] << 16) | (alpha[3] << 24));
            *imagePixel++ = ((alpha[4] << 0) | (alpha[5] << 8) | (alpha[6] << 16) | (alpha[7] << 24));
        }

        result.data = (ui8*)imageData;
    };

    result.aspectRatio = (f32)result.width_pxls / (f32)result.height_pxls;
    result.pitch_pxls = (ui32)result.width_pxls * BYTES_PER_PIXEL;

    return result;
};


So essentially I'm walking the image pointer along and grabbing 8 pixels at a time, storing their channels in separate per-color arrays and then re-sorting the colors. This means instead of storing things in the original format:

Pxl1:BGRA, Pxl2:BGRA, Pxl3: BGRA, Pxl4: BGRA, Pxl5:BGRA, Pxl6:BGRA, Pxl7: BGRA, Pxl8: BGRA

I'm storing things like this:

Pxl1:B Pxl2:B Pxl3:B Pxl4:B Pxl5:B Pxl6:B Pxl7:B Pxl8:B Pxl1:G Pxl2:G Pxl3:G Pxl4:G Pxl5:G Pxl6:G etc.

Eventually I take this new image memory and pass it to the renderer, where I run it through my optimized DrawTexture function. Does this look like I am thinking correctly about loading image colors? If so, I will continue to explain where exactly I'm getting tripped up.
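
For reference, a minimal sketch (my naming, just derived from the loader above) of where a given pixel's channel byte ends up in this layout, assuming width_pxls is a multiple of 8:

//channel: 0 = blue, 1 = green, 2 = red, 3 = alpha
static inline ui32 ChannelByteOffset_SoA(ui32 x, ui32 y, ui32 width_pxls, ui32 channel)
{
    ui32 pixelIndex = y * width_pxls + x;    //row-major index into the flattened pixel array
    ui32 group = pixelIndex / 8;             //which 32-byte group of 8 pixels
    ui32 lane = pixelIndex % 8;              //which pixel within that group
    return group * 32 + channel * 8 + lane;  //byte offset from the start of image data
}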