Hi,
I'm trying to optimize a multi-layer convolution function for a machine learning project I'm working on.
The computation is very heavy, so I'm trying to reduce the cost of the convolution using SSE intrinsics (initially I wanted to limit myself to SSE2, but I guess requiring SSE3 is reasonable).
For efficiency, each kernel has dimensions 4*4*4, i.e. 64 multiply-adds per output value. The obvious approach is to multiply and add 4 values at a time, which turns each output into 16 four-wide multiply/add pairs.
It's a speedup over scalar code, but I wonder if there is a more efficient approach?
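For reference, this is the scalar version the SSE code below replaces (a sketch assuming the layout described above: 4 interleaved channels per input pixel, 64 consecutive weights per kernel):

// Scalar reference: one output value = bias + 64 multiply-adds.
void convolve4x4xC_ref(float *out, float *in, float *kernel, float *bias, int w, int h, int c)
{
    for (int y = 0; y < h - 3; y++)
        for (int x = 0; x < w - 3; x++)
            for (int i = 0; i < c; i++) {
                float acc = bias[i];
                const float *kp = kernel + i * 64;
                for (int v = 0; v < 4; v++)        // 4 kernel rows
                    for (int u = 0; u < 16; u++)   // 4 pixels * 4 channels per row
                        acc += in[(y + v) * w * 4 + x * 4 + u] * kp[v * 16 + u];
                *out++ = acc;
            }
}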
Here is the SSE code:
#include <pmmintrin.h>  // SSE3, for _mm_hadd_ps (pulls in SSE/SSE2 as well)

// Convolves a w*h*4 input layer with c kernels of 4*4*4 each,
// producing a (w-3)*(h-3)*c output layer.
void convolve4x4xC(float *out, float *in, float *kernel, float *bias, int w, int h, int c)
{
    int x, y, i;
    int w3 = w - 3;
    int h3 = h - 3;
    int w4 = w * 4;  // one input row in floats (4 channels per pixel)
    for (y = 0; y < h3; y++) {
        float *iny = in + (y * w4);
        for (x = 0; x < w3; x++) {
            for (i = 0; i < c; i++) {
                __m128 out4;
                float *inp, *kp;
                int u, v;
                inp = iny;
                kp = kernel + (i * 64);  // 4*4*4 = 64 weights per kernel
                // bias in lane 0 only, so the horizontal sum below adds it
                // exactly once (_mm_set1_ps would count it 4 times)
                out4 = _mm_set_ss(bias[i]);
                // convolution: 4 rows of 16 floats, 4 at a time
                for (v = 0; v < 4; v++) {
                    for (u = 0; u < 16; u += 4) {
                        __m128 p4 = _mm_loadu_ps(inp + u);
                        __m128 k4 = _mm_loadu_ps(kp);
                        p4 = _mm_mul_ps(p4, k4);
                        out4 = _mm_add_ps(out4, p4);
                        kp += 4;
                    }
                    inp += w4;
                }
                // horizontal sum of the 4 lanes (SSE3)
                out4 = _mm_hadd_ps(out4, out4);
                out4 = _mm_hadd_ps(out4, out4);
                _mm_store_ss(out + i, out4);
            }
            out += c;
            iny += 4;  // next pixel: 4 channels
        }
    }
}
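If I end up needing to stay on SSE2, I believe the two _mm_hadd_ps calls at the end could be replaced with the classic shuffle-based horizontal sum, something like:

// SSE2-only horizontal sum of out4 = (a, b, c, d), replacing both _mm_hadd_ps calls
__m128 shuf = _mm_shuffle_ps(out4, out4, _MM_SHUFFLE(2, 3, 0, 1)); // (b, a, d, c)
__m128 sums = _mm_add_ps(out4, shuf);                              // (a+b, a+b, c+d, c+d)
shuf = _mm_movehl_ps(shuf, sums);                                  // lower lanes = (c+d, c+d)
sums = _mm_add_ss(sums, shuf);                                     // lane 0 = a+b+c+d
_mm_store_ss(out + i, sums);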
Any ideas?
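In case it helps, this is roughly how I call it (a minimal sketch; the sizes and the random fill are made up for the example):

#include <stdlib.h>

int main(void)
{
    int w = 128, h = 128, c = 8;                       // example sizes, made up
    float *in     = malloc(w * h * 4 * sizeof(float)); // w*h pixels, 4 channels each
    float *out    = malloc((w - 3) * (h - 3) * c * sizeof(float));
    float *kernel = malloc(c * 64 * sizeof(float));    // c kernels of 4*4*4 weights
    float *bias   = malloc(c * sizeof(float));
    for (int i = 0; i < w * h * 4; i++) in[i] = (float)rand() / RAND_MAX;
    for (int i = 0; i < c * 64; i++) kernel[i] = (float)rand() / RAND_MAX;
    for (int i = 0; i < c; i++) bias[i] = 0.0f;

    convolve4x4xC(out, in, kernel, bias, w, h, c);

    free(in); free(out); free(kernel); free(bias);
    return 0;
}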