I'm trying to optimize a multi-layer convolution function for some machine learning project I'm working on.
The computation is very heavy, so I'm trying to reduce the cost of the convolution using SSE intrinsics (initially I wanted to limit myself to SSE2, but I guess SSE3 is reasonable).
For efficiency, each kernel has dimensions 4×4×4. The obvious approach is to multiply and add 4 values at a time.
It's a speedup, but I wonder if there is a more efficient approach?
Here is the code :
Any ideas?
// 4*4*4*c kernel convolution of a w*h*4 input layer
// into a (w-3)*(h-3)*c output layer.
//
// out    : (w-3)*(h-3)*c output buffer, channel-interleaved per pixel
// in     : w*h*4 input layer, 4 channels interleaved per pixel
// kernel : c kernels of 4*4*4 = 64 floats each, laid out contiguously
// bias   : one bias value per output channel
//
// Bug fix vs. the original: the bias was splatted into all 4 SIMD lanes
// (_mm_set1_ps) and the horizontal reduction then summed all 4 lanes, so
// every output picked up 4*bias instead of bias. Seeding only lane 0 with
// _mm_set_ss adds the bias exactly once.
// The horizontal sum now uses SSE1/SSE2 shuffles instead of the SSE3
// _mm_hadd_ps, so the routine meets the original SSE2-only goal (a
// movehl/shuffle reduction is typically no slower than two hadds).
void convolve4x4xC(float *out, float *in, float *kernel, float *bias,
                   int w, int h, int c)
{
    int x, y, i;
    int w3 = w - 3;     // output width
    int h3 = h - 3;     // output height
    int w4 = w * 4;     // floats per input row (4 channels per pixel)

    for (y = 0; y < h3; y++) {
        float *iny = in + (y * w4);     // top-left pixel of current window
        for (x = 0; x < w3; x++) {
            for (i = 0; i < c; i++) {
                float *inp = iny;
                float *kp = kernel + (i * 64);  // 64 weights per kernel
                int u, v;

                // Seed lane 0 with the bias so the reduction below adds
                // it exactly once (lanes 1..3 start at zero).
                __m128 out4 = _mm_set_ss(bias[i]);

                // 4 rows * 16 floats = 64 multiply-accumulates, 4 lanes
                // at a time.
                for (v = 0; v < 4; v++) {
                    for (u = 0; u < 16; u += 4) {
                        __m128 p4 = _mm_loadu_ps(inp + u);
                        __m128 k4 = _mm_loadu_ps(kp);
                        out4 = _mm_add_ps(out4, _mm_mul_ps(p4, k4));
                        kp += 4;
                    }
                    inp += w4;  // next input row
                }

                // Horizontal sum of the 4 lanes (SSE1/SSE2 only):
                // swap within pairs, add, then fold the high pair in.
                __m128 shuf = _mm_shuffle_ps(out4, out4,
                                             _MM_SHUFFLE(2, 3, 0, 1));
                __m128 sums = _mm_add_ps(out4, shuf);
                shuf = _mm_movehl_ps(shuf, sums);
                sums = _mm_add_ss(sums, shuf);
                _mm_store_ss(out + i, sums);
            }
            out += c;   // advance to the next output pixel
            iny += 4;   // slide the window one input pixel to the right
        }
    }
}