opencv/modules/dnn/src/layers/cpu_kernels/convolution.cpp
Zihao Mu 5229312ad2
Merge pull request #22275 from zihaomu:fp16_support_conv
DNN: FP16 support on Convolution 2D #22275 

## FP16 support on ARM platform
This PR proposes to support FP16 backend in Convolution.
For now, we only support FP16 on ARM aarch64.

In addition to adding FP16 support, this patch also adds the `seperateIm2col` optimization (identifier spelling follows the code).

## How to use FP16 to speed up convolution?
```
Net net = readNet(modelPath);
net.setPreferableTarget(DNN_TARGET_CPU_FP16);
net.setInput(blob);
Mat output = net.forward();
```

### TODO List
| Task | Status | Remarks |
|:-------:|:--------:|:------------:|
| Convolution 2D FP16 | ✔️ | Done |
| Winograd FP16 | Deferred | Because the current modification has already reached 2k lines, Winograd FP16 will be completed in the next PR. |
| Accuracy Test | ✔️ | Done |
| Performance Test | ✔️ | Done |
| Compiler bug | ✔️ | Done |

### Speed Test for FP 16.

**Test on M1 chip, 4 threads.**

| Model Name | FP32 (Conv+Wino) | Conv(FP16) + Wino(FP 32) |
|:-------:|:--------:|:------------:|
| ResNet 50 | 26.0 ms | **18.05 ms** (25% speed up)|
| MobileNet V2 | 4.17 ms | **3.09 ms (29% speed up)** |

### Speed Test for `seperateIm2col` trick on X86.
**Test on AMD 5600x, 12 threads.**
| Model Name | 4.x | Patch |
|:-------:|:--------:|:------------:|
| MobileNet V2 | 5.6 ms | **3.0 ms (46% speed up)** |

### Performance Test

#### Performance Test of X86 platform: AMD 5600X, with `-perf_threads=1`
|Name of Test|4.x|patch|patch vs 4.x (x-factor)|
|---|:-:|:-:|:-:|
|Name of Test|4.x 0|fp16pr final|fp16pr final vs 4.x 0 (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|1.00|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|1.03|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.001|0.001|0.92|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.002|0.003|0.95|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.006|0.006|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.045|0.033|1.39|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.011|0.009|1.17|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.109|0.078|1.39|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.040|0.042|0.94|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.326|0.342|0.95|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.580|0.589|0.99|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.293|1.382|0.94|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.590|3.710|0.97|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.120|1.191|0.94|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.576|2.872|0.90|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.599|4.670|0.98|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|9.230|9.582|0.96|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|65.946|69.381|0.95|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|18.915|19.289|0.98|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|1.404|1.457|0.96|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|2.060|1.501|1.37|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.409|1.464|0.96|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|1.793|1.838|0.98|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.207|1.199|1.01|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.277|1.275|1.00|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.319|2.370|0.98|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.351|1.346|1.00|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|3.520|3.612|0.97|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.876|1.880|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.981|1.995|0.99|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|2.620|2.627|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|4.202|4.123|1.02|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.429|2.445|0.99|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|2.591|2.576|1.01|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|3.005|2.998|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|3.515|3.532|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|3.115|3.134|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|3.937|3.899|1.01|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|5.533|5.471|1.01|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.472|3.464|1.00|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|4.302|4.322|1.00|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|6.100|6.035|1.01|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|6.580|6.484|1.01|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|9.741|9.634|1.01|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|10.131|10.156|1.00|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|12.391|12.350|1.00|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|91.074|87.893|1.04|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|5.903|5.903|1.00|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.890|6.794|1.01|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.160|5.131|1.01|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|4.970|5.036|0.99|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|5.045|5.015|1.01|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|11.583|11.343|1.02|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.348|5.320|1.01|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|5.357|5.396|0.99|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|6.050|6.006|1.01|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|5.952|5.953|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|8.014|8.014|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.472|12.577|0.99|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|10.803|10.655|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|18.429|13.405|1.37|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|6.659|6.647|1.00|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|14.192|13.819|1.03|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|6.045|6.068|1.00|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.742|12.828|0.99|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|8.046|7.773|1.04|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.440|17.192|1.01|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|15.418|14.972|1.03|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.430|0.430|1.00|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|6.692|6.663|1.00|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|6.350|6.347|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.267|0.265|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|7.755|7.558|1.03|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.203|0.202|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.663|10.576|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|10.827|10.614|1.02|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|7.049|6.947|1.01|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|6.900|6.901|1.00|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.165|0.165|1.00|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|17.953|17.251|1.04|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|7.430|7.320|1.01|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|22.187|21.705|1.02|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|8.349|8.126|1.03|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|8.273|8.297|1.00|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|8.169|8.094|1.01|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|13.602|13.359|1.02|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|8.633|8.584|1.01|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|29.339|28.897|1.02|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|13.000|12.920|1.01|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|14.262|13.319|1.07|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|27.453|27.253|1.01|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|32.052|27.269|1.18|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|15.363|15.208|1.01|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|18.543|18.434|1.01|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|39.114|37.954|1.03|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|36.271|36.972|0.98|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|19.262|19.427|0.99|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.298|19.349|1.00|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.261|19.847|1.02|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.867|21.525|1.02|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|51.756|49.979|1.04|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|28.133|27.060|1.04|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|25.035|24.980|1.00|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|25.858|25.821|1.00|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|27.313|27.149|1.01|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|28.219|28.111|1.00|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|46.025|46.674|0.99|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|30.220|29.446|1.03|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|49.410|48.708|1.01|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|38.203|38.001|1.01|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|39.961|39.021|1.02|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|48.685|47.075|1.03|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|75.114|72.586|1.03|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|41.222|41.144|1.00|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|46.220|46.353|1.00|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|98.201|98.771|0.99|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|100.106|96.971|1.03|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|146.977|140.445|1.05|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|198.618|194.665|1.02|


#### Performance Test of ARM platform: Apple M1, with `-perf_threads=1`

Min (ms)

|Name of Test|4.x|patch|patch vs 4.x (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|1.07|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|1.10|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.002|0.002|0.97|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.003|0.003|0.84|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.009|0.009|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.027|0.030|0.90|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.008|0.007|1.07|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.066|0.072|0.91|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.090|0.054|1.68|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.328|0.409|0.80|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.659|0.697|0.95|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.266|1.403|0.90|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.550|4.145|0.86|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.188|1.375|0.86|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.683|3.236|0.83|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.491|5.501|0.82|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|8.916|10.181|0.88|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.995|72.296|0.97|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|22.531|23.139|0.97|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|2.239|1.933|1.16|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU_FP16)|-|1.010|-|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|3.134|2.068|1.52|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU_FP16)|-|1.062|-|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.918|1.920|1.00|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU_FP16)|-|1.014|-|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.340|2.352|0.99|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.247|-|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.116|1.111|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU_FP16)|-|1.114|-|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.116|1.112|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|1.113|-|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|3.067|3.085|0.99|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.622|-|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.153|1.187|0.97|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU_FP16)|-|1.150|-|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|4.804|4.849|0.99|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU_FP16)|-|2.922|-|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.463|1.469|1.00|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.459|-|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.577|1.580|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU_FP16)|-|1.580|-|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|1.826|1.818|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU_FP16)|-|1.817|-|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|6.541|5.081|1.29|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU_FP16)|-|2.809|-|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.912|1.919|1.00|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.919|-|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|1.961|1.971|0.99|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|1.961|-|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.317|2.329|0.99|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU_FP16)|-|2.322|-|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|2.920|2.947|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU_FP16)|-|2.924|-|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|2.467|2.466|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU_FP16)|-|2.496|-|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|3.028|2.997|1.01|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|2.986|-|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|4.353|4.355|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU_FP16)|-|4.355|-|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.762|2.793|0.99|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|2.797|-|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|3.428|3.226|1.06|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU_FP16)|-|3.223|-|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|3.967|3.957|1.00|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU_FP16)|-|3.960|-|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|4.806|4.387|1.10|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU_FP16)|-|4.366|-|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|14.509|11.756|1.23|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|6.510|-|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|13.718|13.287|1.03|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU_FP16)|-|7.190|-|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|15.133|14.853|1.02|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU_FP16)|-|8.671|-|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|41.928|43.328|0.97|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU_FP16)|-|38.072|-|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.409|4.428|1.00|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|4.427|-|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.144|5.363|1.15|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU_FP16)|-|5.368|-|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|3.926|3.932|1.00|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.938|-|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.920|3.915|1.00|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.950|-|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|3.767|3.764|1.00|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|3.762|-|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|19.959|13.875|1.44|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU_FP16)|-|7.781|-|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|3.951|3.955|1.00|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.969|-|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|4.050|4.034|1.00|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.093|-|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|4.923|4.506|1.09|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.509|-|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|4.759|4.476|1.06|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.447|-|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|6.079|5.628|1.08|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU_FP16)|-|5.625|-|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.843|17.523|1.13|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|8.917|-|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|8.334|8.247|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU_FP16)|-|8.246|-|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|23.164|18.199|1.27|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|9.305|-|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|5.184|5.178|1.00|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU_FP16)|-|5.149|-|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.990|18.103|0.99|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|9.777|-|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|4.831|4.522|1.07|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|4.523|-|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.328|17.319|1.00|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|8.948|-|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|5.944|5.961|1.00|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU_FP16)|-|5.936|-|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.811|20.064|0.99|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|11.705|-|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|22.398|17.686|1.27|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU_FP16)|-|9.859|-|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.416|0.416|1.00|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.417|-|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|5.356|5.110|1.05|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU_FP16)|-|5.114|-|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|5.092|4.748|1.07|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.754|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.260|0.229|1.13|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.229|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|5.872|5.460|1.08|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU_FP16)|-|5.460|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.161|0.161|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.161|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|7.176|7.175|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|7.162|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|7.174|7.185|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU_FP16)|-|7.157|-|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|5.400|5.180|1.04|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.201|-|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|5.330|5.188|1.03|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.177|-|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.115|0.115|1.00|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.115|-|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|26.156|20.222|1.29|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU_FP16)|-|11.203|-|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|5.627|5.543|1.02|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.506|-|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|27.925|27.741|1.01|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU_FP16)|-|17.217|-|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|6.359|6.062|1.05|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.048|-|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|6.559|6.322|1.04|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU_FP16)|-|6.280|-|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|6.412|6.200|1.03|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.197|-|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|9.167|8.624|1.06|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU_FP16)|-|8.626|-|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|6.755|6.491|1.04|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.520|-|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|35.664|34.752|1.03|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|20.260|-|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|9.514|9.414|1.01|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|9.462|-|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|10.631|9.963|1.07|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|9.935|-|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|37.465|36.798|1.02|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU_FP16)|-|19.569|-|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|38.157|36.157|1.06|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU_FP16)|-|18.902|-|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.356|10.401|1.00|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|10.360|-|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|12.641|12.150|1.04|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU_FP16)|-|12.162|-|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|50.545|50.505|1.00|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|27.950|-|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|54.233|49.603|1.09|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU_FP16)|-|26.515|-|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|13.779|12.968|1.06|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|12.984|-|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|15.809|15.329|1.03|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|15.433|-|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|14.563|14.527|1.00|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|14.480|-|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|16.714|16.484|1.01|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|16.362|-|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|77.832|65.729|1.18|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|32.065|-|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|21.903|20.386|1.07|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU_FP16)|-|20.416|-|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|20.405|18.148|1.12|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU_FP16)|-|18.128|-|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.334|18.521|1.10|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|18.495|-|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.527|19.584|1.10|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|19.630|-|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|22.715|20.057|1.13|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|20.068|-|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|26.228|24.992|1.05|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|24.957|-|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|21.524|21.581|1.00|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|21.782|-|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|34.094|31.964|1.07|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|31.925|-|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|28.677|27.813|1.03|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|27.808|-|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|31.274|27.892|1.12|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|27.910|-|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|30.533|30.007|1.02|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|30.089|-|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|39.837|38.312|1.04|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|38.477|-|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|32.480|29.237|1.11|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU_FP16)|-|29.452|-|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|33.544|32.832|1.02|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|32.784|-|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|134.481|130.678|1.03|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU_FP16)|-|70.134|-|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|127.930|126.530|1.01|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU_FP16)|-|65.261|-|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|201.346|187.007|1.08|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU_FP16)|-|91.525|-|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|252.038|245.587|1.03|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU_FP16)|-|125.477|-|

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-05-17 09:38:33 +03:00

2106 lines
84 KiB
C++

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
// Here is the original license:
/*
This file is a part of ficus language project.
See ficus/LICENSE for the licensing terms
*/
#include "../../precomp.hpp"
#include "convolution.hpp"
#include "conv_block.simd.hpp"
#include "layers/cpu_kernels/conv_block.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv { namespace dnn {
// VEC_ALIGN: alignment (in elements) used when padding/aligning packed weight and
// input buffers so SIMD loads stay aligned. DFT_TYPE: default element type (FP32).
enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.

// Platform-dispatched GEMM-like micro-kernels, implemented in conv_block.simd.hpp.
// convBlock accumulates a convMR x outLen output tile from packed weights `a` and
// packed input `b` over `np` accumulation steps; `init_c` selects store vs accumulate.
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
const int convMR, const int convNR);
// Single-output-row (MR==1) variant with fused bias add and optional [minval, maxval]
// clamping activation (ifMinMaxAct).
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
/** Build a FastConv descriptor: choose the convolution strategy (depth-wise,
 *  Winograd 3x3, or generic im2col+GEMM), then pre-pack the weights into the
 *  SIMD-friendly layout that strategy expects, and store the (zero-padded) bias.
 *
 *  @param _weightsMat  weights, [K, C, Dk, Hk, Wk] for Conv3D, [K, C, Hk, Wk] for
 *                      Conv2D, [K, C, Wk] for Conv1D (C here is channels per group).
 *  @param srcBias      optional bias of length K; may be null (treated as zeros).
 *  @param ngroups      number of groups; K must be divisible by ngroups.
 *  @param conv_dim     one of CONV_1D / CONV_2D / CONV_3D.
 *  @param _useFP16     request FP16 weight storage (honored only when
 *                      CONV_ARM_FP16 is compiled in, and not for Winograd yet).
 *  @param useWinograd  allow the Winograd 3x3 fast path when applicable.
 *  @return shared descriptor holding packed weights, bias and geometry.
 */
Ptr<FastConv> initFastConv(
        InputArray _weightsMat,
        float* srcBias,
        int ngroups,
        int K, int C,
        const std::vector<size_t>& kernel_size,
        const std::vector<size_t>& strides,
        const std::vector<size_t>& dilations,
        const std::vector<size_t>& pads_begin,
        const std::vector<size_t>& pads_end,
        int conv_dim,
        const bool _useFP16,
        bool useWinograd)
{
    Ptr<FastConv> conv = makePtr<FastConv>();
    CV_Assert(ngroups > 0 && K > 0 && C > 0 && K % ngroups == 0);

    // Weight shape, [K, C, Dk, Hk, Wk] for Conv3D, [K, C, Hk, Wk] for Conv2D, [K, C, Wk] for Conv1D.
    // Missing spatial dims collapse to 1 so the same code handles 1D/2D/3D.
    int Dk = conv_dim == CONV_3D ? (int)kernel_size[0] : 1;
    int Hk = conv_dim == CONV_1D ? 1 : (int)kernel_size[kernel_size.size() - 2];
    int Wk = (int)kernel_size.back();
    int karea = Wk*Hk*Dk;

    // Padding: front/behind exist only for 3D, top/bottom only for 2D/3D.
    conv->pad_front = conv_dim == CONV_3D ? (int)pads_begin[0] : 0;
    conv->pad_top = conv_dim == CONV_1D ? 0 : (int)pads_begin[pads_begin.size() - 2];
    conv->pad_left = (int)pads_begin.back();

    conv->pad_behind = conv_dim == CONV_3D ? (int)pads_end[0] : 0;
    conv->pad_bottom = conv_dim == CONV_1D ? 0 : (int)pads_end[pads_end.size() - 2];
    conv->pad_right = (int)pads_end.back();

    int stride_d = conv_dim == CONV_3D ? (int)strides[0] : 0;
    int stride_h = conv_dim == CONV_1D ? 0 : (int)strides[strides.size() - 2];
    int stride_w = (int)strides.back();

    int dilation_d = conv_dim == CONV_3D ? (int)dilations[0] : 1;
    int dilation_h = conv_dim == CONV_1D ? 1 : (int)dilations[dilations.size() - 2];
    int dilation_w = (int)dilations.back();

    CV_Assert(Dk > 0 && Hk > 0 && Wk > 0);
    CV_Assert(stride_d >= 0 && stride_h >= 0 && stride_w > 0);
    CV_Assert(dilation_d > 0 && dilation_h > 0 && dilation_w > 0);

    // NOTE: the comma before conv->Dk is the comma operator, not a typo with
    // effect — the assignment still executes as intended.
    conv->K = K; conv->C = C; conv->Hk = Hk; conv->Wk = Wk, conv->Dk = Dk;

    conv->stride_d = stride_d;
    conv->stride_h = stride_h;
    conv->stride_w = stride_w;

    conv->dilation_d = dilation_d;
    conv->dilation_h = dilation_h;
    conv->dilation_w = dilation_w;
    conv->conv_dim = conv_dim;
    conv->ngroups = ngroups;

    // A convolution is depth-wise when every group has exactly one input and
    // one output channel (ngroups == K == C).
    bool ifRunDepthWise = ngroups > 1 && ngroups == K && ngroups == C;
    bool ifRunDepthWiseRemain = false; // It's for big padding or big kernel or Conv3D depth-wise convolution.

    if (ifRunDepthWise)
    {
        // The specialized depth-wise kernel only supports small fixed kernels,
        // stride 1 (or stride 2 with dilation 1) and padding <= 1; everything
        // else falls back to the "remain" depth-wise path below.
        if (conv_dim == CONV_1D)
        {
            ifRunDepthWise &= Hk == 1 && Wk == 3 && (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
                    && max(stride_w, dilation_w) >= conv->pad_left && conv->pad_left <= 1;
        }
        else if (conv_dim == CONV_2D)
        {
            ifRunDepthWise &= Hk == 3 && Wk == 3 && ((stride_w == 1) || (stride_w == 2 && dilation_w == 1)) &&
                    max(stride_w, dilation_w) >= conv->pad_left && max(stride_h, dilation_h) >= conv->pad_top
                    && conv->pad_left <= 1 && conv->pad_top <= 1;
        }

        if (!ifRunDepthWise || conv_dim == CONV_3D)
        {
            ifRunDepthWise = false;
            ifRunDepthWiseRemain = true;
        }
    }

    // Strategy selection: depth-wise fast path > Winograd 3x3 (2D, unit
    // stride/dilation, SIMD available) > depth-wise fallback > generic.
    conv->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? CONV_TYPE_DEPTHWISE :
            useWinograd && (conv_dim == CONV_2D && (conv->useSIMD128 || conv->useAVX || conv->useAVX2 || conv->useNEON) &&
            Hk == 3 && Wk == 3 && dilation_h == 1 && dilation_w == 1 && stride_h == 1 && stride_w == 1) ?
            CONV_TYPE_WINOGRAD3X3 :
            (ifRunDepthWiseRemain ? CONV_TYPE_DEPTHWISE_REMAIN : CONV_TYPE_GENERIC);

#if !(CV_NEON || CV_SIMD128 || CV_TRY_AVX || CV_TRY_AVX2)
    if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // Disable Winograd when CV_NEON, CV_SIMD128, CV_TRY_AVX and CV_TRY_AVX2 are not available.
        conv->conv_type = CONV_TYPE_GENERIC;
#endif

    Mat weightsMat = _weightsMat.getMat();
    auto wShape = shape(weightsMat);
    const size_t wstep = weightsMat.step1();

    conv->useFP16 = false;
#ifdef CONV_ARM_FP16
    // TODO: add FP16 support for Winograd.
    if (_useFP16 && (conv->conv_type == CONV_TYPE_GENERIC || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN))
        conv->useFP16 = true;
#endif

    float *srcWeights = (float *)weightsMat.data;

    if (conv->conv_type == CONV_TYPE_DEPTHWISE || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
    {
        // Handle the Conv1D, Conv2D and Conv3D depth-wise.
        // for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout,
        // but add some padding to make the weights array layout more SIMD-friendly
        int ksize = karea;

        // TODO: simplify the following code with std::copy.
        // this code aims to let memory fit with vector size.
        // Round the per-channel kernel size up to a multiple of VEC_ALIGN;
        // the padding bytes are zeroed by the memset below.
        int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN;
        int nweights = C * padded_ksize;

#ifdef CONV_ARM_FP16
        if (conv->useFP16)
        {
            // FP16 copy: each weight is converted element-wise from FP32.
            conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN);
            conv->weightsBufPtr_FP16 = alignPtr(conv->weightsBuf_FP16.data(), VEC_ALIGN * sizeof(float16_t ));
            memset(conv->weightsBufPtr_FP16, 0, nweights * sizeof(float16_t ));
            auto weightsBufPtr_FP16 = conv->weightsBufPtr_FP16;

            parallel_for_(Range(0, C), [&](const Range& r0){
            for(int c = r0.start; c < r0.end; c++)
            {
                for (int k = 0; k < ksize; k++)
                    weightsBufPtr_FP16[c*padded_ksize + k] = (float16_t)srcWeights[c*wstep + k];
            }});
        }
        else
#endif
        {
            conv->weightsBuf.resize(nweights + VEC_ALIGN);
            conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN * sizeof(float ));
            memset(conv->weightsBufPtr, 0, nweights*sizeof(float ));
            auto weightsBufPtr = conv->weightsBufPtr;

            parallel_for_(Range(0, C), [&](const Range& r0){
            for(int c = r0.start; c < r0.end; c++)
            {
                for (int k = 0; k < ksize; k++)
                    weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k];
            }});
        }
    }
    else if(conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
    {
        // G matrix for the F(6x6, 3x3) Winograd transform: each 3-tap row/column
        // of the kernel is projected to 8 transform coefficients (kernelTm = G*g*G^T).
        static const float ktm[8][3] = {
                {1.0f, 0.0f, 0.0f},
                {-2.0f / 9, -2.0f / 9, -2.0f / 9},
                {-2.0f / 9, 2.0f / 9, -2.0f / 9},
                {1.0f / 90, 1.0f / 45, 2.0f / 45},
                {1.0f / 90, -1.0f / 45, 2.0f / 45},
                {32.f/45, 16.f/45, 8.f/45},
                {32.f/45, -16.f/45, 8.f/45},
                {0.0f, 0.0f, 1.0f}
        };

        // the weights are packed as 6-dim tensor:
        // ngroups * ceil((K/ngroups)/KBLOCK) * (W*W/ATOM_SIZE) * (C/ngroups) * KBLOCK * ATOM_SIZE,
        // where W is the size of Winograd-transformed kernel (8x8),
        // ATOM_SIZE is number of lanes in SIMD register (4 for NEON and FP32),
        // KBLOCK is some platform-dependent constant dependent on the number of SIMD registers.
        int ksize = CONV_WINO_KSIZE * CONV_WINO_KSIZE;
        int Cg = C/ngroups;
        int Kg = K/ngroups;
        int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
        size_t nweights = ngroups*Kg_nblocks*Cg*CONV_WINO_KBLOCK*CONV_WINO_AREA;

        float* wptrWino = nullptr;
#ifdef CONV_ARM_FP16
        float16_t* wptrWino_FP16 = nullptr;
        if (conv->useFP16)
        {
            conv->weightsWinoBuf_FP16.resize(nweights + VEC_ALIGN);
            conv->weightsWinoBufPtr_FP16 = alignPtr(conv->weightsWinoBuf_FP16.data(), VEC_ALIGN);
            wptrWino_FP16 = conv->weightsWinoBufPtr_FP16;
            memset(wptrWino_FP16, 0, nweights * sizeof(wptrWino_FP16[0]));
        }
        else
#endif
        {
            conv->weightsWinoBuf.resize(nweights + VEC_ALIGN);
            conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN);
            wptrWino = conv->weightsWinoBufPtr;
            memset(wptrWino, 0, nweights * sizeof(wptrWino[0]));
        }

        // Transform each 3x3 kernel to the 8x8 Winograd domain, then scatter it
        // into the blocked layout described above. Parallelized over output channels.
        parallel_for_(Range(0, K), [&](const Range& r0){
        float kernelTm[CONV_WINO_AREA];
        for (int k = r0.start; k < r0.end; k++)
        {
            int g = k / Kg;                    // group index
            int k_ = k - g*Kg;                 // output channel within the group
            int ki = k_ / CONV_WINO_KBLOCK;    // K-block index
            int dk = k_ - ki*CONV_WINO_KBLOCK; // position inside the K-block

            for (int c = 0; c < Cg; c++)
            {
                // wstep = Hk*Wk*Cg
                const float *kernel0 = srcWeights + k * wstep + c * ksize;

                // transform kernel, transposed
                const float *k0 = kernel0;
                const float *k1 = kernel0 + 3;
                const float *k2 = kernel0 + 6;

                // h : apply G to the kernel rows -> 8x3 intermediate
                float tmp[8][3];
                for (int i = 0; i < 8; i++)
                {
                    tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                    tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                    tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
                }

                // v : apply G again on the other axis -> full 8x8 transformed kernel
                for (int j = 0; j < 8; j++)
                {
                    float *tmpp = &tmp[j][0];
                    for (int i = 0; i < 8; i++)
                        kernelTm[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }

                // repack the data.
#ifdef CONV_ARM_FP16
                if (conv->useFP16)
                {
                    // FP16 atoms: narrower element => ATOM_F16/NATOMS_F16 layout.
                    float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
                                  (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F16;
                    for (int i = 0; i < CONV_WINO_NATOMS_F16; i++,
                            wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F16)
                    {
                        CV_Assert(conv->weightsWinoBufPtr_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= conv->weightsWinoBufPtr_FP16 + nweights);
                        for (int j = 0; j < CONV_WINO_ATOM_F16; j++)
                        {
                            wptr[j] = (float16_t)kernelTm[i * CONV_WINO_ATOM_F16 + j];
                        }
                    }
                }
                else
#endif
                {
                    float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
                                  (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F32;
                    for (int i = 0; i < CONV_WINO_NATOMS_F32; i++,
                            wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32)
                    {
                        CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + CONV_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights);
                        memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0]));
                    }
                }
            }
        }
        });
    }
    else if (conv->conv_type == CONV_TYPE_GENERIC)
    {
        // The weights are packed as
        // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk*Dk) x CONV_MR tensor
        int Kg = K/ngroups, Cg = max(C/ngroups, 1);
        int DkHkWkCg = Dk*Hk*Wk*Cg;

        int numStripsMR = (Kg + CONV_MR_FP32 - 1) / CONV_MR_FP32;
        int Kg_aligned = numStripsMR * CONV_MR_FP32;
        size_t nweights = ngroups*Kg_aligned*DkHkWkCg;
        float* weightsBufPtr = nullptr;

#ifdef CONV_ARM_FP16
        // FP16 uses its own MR (micro-kernel row count), so its strip count,
        // aligned K and total size are computed separately.
        int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - 1) / CONV_MR_FP16;
        int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16;
        size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg;
        float16_t* weightsBufPtr_FP16 = nullptr;

        if (conv->useFP16)
        {
            conv->weightsBuf_FP16.resize(nweights_FP16 + VEC_ALIGN);
            conv->weightsBufPtr_FP16 = alignPtr(conv->weightsBuf_FP16.data(), VEC_ALIGN);
            weightsBufPtr_FP16 = conv->weightsBufPtr_FP16;
            memset(weightsBufPtr_FP16, 0, nweights_FP16*sizeof(weightsBufPtr_FP16[0]));
        }
        else
#endif
        {
            conv->weightsBuf.resize(nweights + VEC_ALIGN);
            conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
            weightsBufPtr = conv->weightsBufPtr;
            memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0]));
        }

        // Pack the weight. One parallel task per (group, K-strip); within a strip the
        // innermost CONV_MR weights are interleaved so the micro-kernel can load them
        // contiguously; strips shorter than CONV_MR are zero-padded.
#ifdef CONV_ARM_FP16
        if (conv->useFP16)
        {
            parallel_for_(Range(0, ngroups * numStripsMR_FP16), [&](const Range& r0){
            for (int gsi = r0.start; gsi < r0.end; gsi++)
            {
                int g = gsi / numStripsMR_FP16;
                int si = gsi - g * numStripsMR_FP16;

                int startK = si * CONV_MR_FP16;
                CV_Assert(startK < Kg_aligned_FP16);

                float16_t* packed_wptr = weightsBufPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
                int dk = Kg - startK < CONV_MR_FP16 ? Kg - startK : CONV_MR_FP16; // check if we need zero padding.

                int k_idx = g*Kg + startK;
                for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++)
                {
                    for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR_FP16)
                    {
                        const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd;
                        int k = 0;
                        for(; k < dk; k++, wptr += wstep)
                            packed_wptr[k] = (float16_t)(*wptr);
                        for(; k < CONV_MR_FP16; k++)
                            packed_wptr[k] = (float16_t)0.f;
                    }
                }
            }});
        }
        else
#endif
        {
            parallel_for_(Range(0, ngroups * numStripsMR), [&](const Range& r0){
            for (int gsi = r0.start; gsi < r0.end; gsi++)
            {
                int g = gsi / numStripsMR;
                int si = gsi - g * numStripsMR;

                int startK = si * CONV_MR_FP32;
                CV_Assert(startK < Kg_aligned);

                float* packed_wptr = weightsBufPtr + DkHkWkCg * (startK + g * Kg_aligned);
                int dk = Kg - startK < CONV_MR_FP32 ? Kg - startK : CONV_MR_FP32; // check if we need zero padding.

                int k_idx = g*Kg + startK;
                for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++)
                {
                    for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR_FP32)
                    {
                        const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd;
                        int k = 0;
                        for(; k < dk; k++, wptr += wstep)
                            packed_wptr[k] = *wptr;
                        for(; k < CONV_MR_FP32; k++)
                            packed_wptr[k] = 0.f;
                    }
                }
            }});
        }
    }
    else
        CV_Error(CV_StsUnsupportedFormat, "Unknown convolution type.");

    // store bias; append some zero's to make sure that
    // we can always read MR elements starting from any valid index
    {
        int k = 0, nbias = K + VEC_ALIGN;
        conv->biasBuf.resize(nbias);
        float* biasBufPtr = conv->biasBuf.data();
        for(; k < K; k++)
            biasBufPtr[k] = srcBias ? srcBias[k] : 0.f;
        for(; k < nbias; k++)
            biasBufPtr[k] = 0.f;
    }
    return conv;
}
/* Pack 8 horizontally consecutive output positions (for every kernel tap in
 * ofstab[0..ksize-1]) into the im2col buffer, starting at logical column s0.
 * esz selects the destination element size: sizeof(float16_t) => FP16 path
 * (ARM only), sizeof(float) => FP32 path. On return the caller's cursors
 * (x0, s0, inptrIn, in_w) are advanced by 7 positions — the caller's own loop
 * increment accounts for the 8th. All in/out parameters are references for
 * exactly that reason.
 */
static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab,
                             const int stride_w, const int ksize, const int esz)
{
    char * inpbufC = inpbuf + s0 * esz;
    float* inptrInC = (float* )inptrIn;
#ifdef CONV_ARM_FP16
    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
    if (esz == sizeof(float16_t))
    {
        // FP16 destination: load 8 floats per tap, narrow to f16, store as one 128-bit vector.
        if (stride_w == 1)
        {
            for (int k = 0; k < ksize; k++)
            {
                int k1 = ofstab[k];
                float32x4_t v0 = vld1q_f32(inptrInC + k1);
                float32x4_t v1 = vld1q_f32(inptrInC + k1 + 4);
                vst1q_f16((__fp16*)inpbufC_FP16 + k * CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
            }
        }
        else
        {
            // Strided gather: lanes are filled one by one.
            // NOTE(review): direct lane assignment (v0[0] = ...) on float32x4_t is a
            // GCC/Clang vector extension, not portable NEON intrinsics — confirm all
            // supported ARM toolchains accept it.
            for (int k = 0; k < ksize; k++)
            {
                int k1 = ofstab[k];
                float32x4_t v0, v1;
                v0[0] = inptrInC[k1];
                v0[1] = inptrInC[k1 + stride_w];
                v0[2] = inptrInC[k1 + 2*stride_w];
                v0[3] = inptrInC[k1 + 3*stride_w];

                v1[0] = inptrInC[k1 + 4*stride_w];
                v1[1] = inptrInC[k1 + 5*stride_w];
                v1[2] = inptrInC[k1 + 6*stride_w];
                v1[3] = inptrInC[k1 + 7*stride_w];
                vst1q_f16((__fp16*)inpbufC_FP16 + k * CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
            }
        }
    }
    else // float 32
#endif
    {
        CV_Assert(esz == sizeof(float ));
        float* inpbufC_FP32 = (float* )inpbufC;
        if (stride_w == 1)
            for (int k = 0; k < ksize; k++)
            {
                int k1 = ofstab[k];
#if CV_SIMD256
                // NOTE(review): this branch indexes with CONV_NR while the other FP32
                // branches use CONV_NR_FP32 — confirm the two are equal on CV_SIMD256
                // builds, otherwise the row stride would be inconsistent.
                vx_store(inpbufC_FP32 + k*CONV_NR, vx_load(inptrInC + k1));
#elif CV_SIMD128
                v_float32x4 vv0 = v_load(inptrInC + k1);
                v_float32x4 vv1 = v_load(inptrInC + k1 + 4);
                v_store(inpbufC_FP32 + k*CONV_NR_FP32, vv0);
                v_store(inpbufC_FP32 + k*CONV_NR_FP32 + 4, vv1);
#else
                // Scalar fallback: copy the 8 contiguous values by hand.
                float v0 = inptrInC[k1];
                float v1 = inptrInC[k1 + 1];
                float v2 = inptrInC[k1 + 2];
                float v3 = inptrInC[k1 + 3];

                float v4 = inptrInC[k1 + 4];
                float v5 = inptrInC[k1 + 5];
                float v6 = inptrInC[k1 + 6];
                float v7 = inptrInC[k1 + 7];

                inpbufC_FP32[k*CONV_NR_FP32] = v0;
                inpbufC_FP32[k*CONV_NR_FP32+1] = v1;
                inpbufC_FP32[k*CONV_NR_FP32+2] = v2;
                inpbufC_FP32[k*CONV_NR_FP32+3] = v3;

                inpbufC_FP32[k*CONV_NR_FP32+4] = v4;
                inpbufC_FP32[k*CONV_NR_FP32+5] = v5;
                inpbufC_FP32[k*CONV_NR_FP32+6] = v6;
                inpbufC_FP32[k*CONV_NR_FP32+7] = v7;
#endif
            }
        else
            // Strided FP32 gather (scalar on all platforms).
            for (int k = 0; k < ksize; k++)
            {
                int k1 = ofstab[k];
                float v0 = inptrInC[k1];
                float v1 = inptrInC[k1 + stride_w];
                float v2 = inptrInC[k1 + 2*stride_w];
                float v3 = inptrInC[k1 + 3*stride_w];

                float v4 = inptrInC[k1 + 4*stride_w];
                float v5 = inptrInC[k1 + 5*stride_w];
                float v6 = inptrInC[k1 + 6*stride_w];
                float v7 = inptrInC[k1 + 7*stride_w];

                inpbufC_FP32[k*CONV_NR_FP32] = v0;
                inpbufC_FP32[k*CONV_NR_FP32+1] = v1;
                inpbufC_FP32[k*CONV_NR_FP32+2] = v2;
                inpbufC_FP32[k*CONV_NR_FP32+3] = v3;

                inpbufC_FP32[k*CONV_NR_FP32+4] = v4;
                inpbufC_FP32[k*CONV_NR_FP32+5] = v5;
                inpbufC_FP32[k*CONV_NR_FP32+6] = v6;
                inpbufC_FP32[k*CONV_NR_FP32+7] = v7;
            }
    }
    // Advance 7 of the 8 packed positions; the caller's loop increment covers the last.
    x0+=7;
    s0+=7;
    inptrIn += 7*stride_w;
    in_w += 7*stride_w;
}
/* Pack 2 horizontally consecutive output positions (for every kernel tap in
 * ofstab[0..ksize-1]) into the im2col buffer at logical column s0.
 * esz selects the destination element size: sizeof(float16_t) => FP16 path
 * (ARM only), otherwise FP32. On return the caller's cursors (x0, s0,
 * inptrIn, in_w) are advanced by one position; the caller's own loop
 * increment accounts for the second packed position.
 */
static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab,
                             const int stride_w, const int ksize, const int esz)
{
    char* dstBase = inpbuf + s0 * esz;
    const float* src = inptrIn;
#ifdef CONV_ARM_FP16
    if (esz == sizeof(float16_t))
    {
        // FP16 destination: convert each pair on the fly, one row (tap) at a time.
        float16_t* dst16 = (float16_t*)dstBase;
        for (int k = 0; k < ksize; k++, dst16 += CONV_NR_FP16)
        {
            const int ofs = ofstab[k];
            dst16[0] = (float16_t)src[ofs];
            dst16[1] = (float16_t)src[ofs + stride_w];
        }
    }
    else
#endif
    {
        // FP32 destination: straight copy of each pair.
        float* dst32 = (float*)dstBase;
        for (int k = 0; k < ksize; k++, dst32 += CONV_NR_FP32)
        {
            const int ofs = ofstab[k];
            dst32[0] = src[ofs];
            dst32[1] = src[ofs + stride_w];
        }
    }
    // Advance one of the two packed positions; the caller's loop covers the other.
    x0++;
    s0++;
    inptrIn += stride_w;
    in_w += stride_w;
}
#ifdef CONV_ARM_FP16
// Fast convert float 32 to float16
static inline void _cvt32f16f( const float* src, float16_t* dst, int len)
{
int j = 0;
const int VECSZ = 4;
__fp16* dst_FP16 = (__fp16 *)dst;
if (len > VECSZ * 4)
{
const int VECSZ4 = 4 * VECSZ;
for( ; j + VECSZ4 < len; j += VECSZ4)
{
float32x4_t v0 = vld1q_f32(src + j);
float32x4_t v1 = vld1q_f32(src + j + 4);
float32x4_t v2 = vld1q_f32(src + j + 8);
float32x4_t v3 = vld1q_f32(src + j + 12);
vst1q_f16(dst_FP16 + j, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
vst1q_f16(dst_FP16 + j + 8, vcombine_f16(vcvt_f16_f32(v2), vcvt_f16_f32(v3)));
}
}
for( ; j < len; j += VECSZ )
{
if( j > len - VECSZ )
{
if( j == 0 )
break;
j = len - VECSZ;
}
float16x4_t hv = vcvt_f16_f32(vld1q_f32(src + j));
vst1_f16(dst_FP16 + j, hv);
}
for( ; j < len; j++ )
dst[j] = float16_t(src[j]);
}
#endif
static inline void packInputData(char* inpbuf_task, float* inp, const int* ofstab, const int* dhwTab, int zyx0, int zyx_limit,
int ksize, int stride_d, int stride_h, int stride_w, int pad_front, int pad_top, int pad_left,
int Dk, int Hk, int Wk, int dilation_d, int dilation_h, int dilation_w, int Di, int Hi, int Wi,
int H0, int W0, int Cg, int stripesize, int inp_plane_ofs, int inp_planesize, int conv_dim, int conv_type,
const int CONV_NR, const int esz, bool fast_1x1, bool useFP16)
{
for (int stripe = 0; zyx0 < zyx_limit; stripe++, zyx0 += CONV_NR)
{
char *inpbuf = inpbuf_task + stripe * stripesize * esz;
float *inptr = inp + inp_plane_ofs;
/*
1. pack the data. Copy the HkxWk CONV_NR-wide slices from
each feature plane of the input tensor to the input buffer.
*/
if (fast_1x1)
{
int slice_len = zyx_limit - zyx0;
bool partial = slice_len < CONV_NR;
const int CONV_NR_esz = CONV_NR * esz;
// Superfast branch for 1x1 convolutions with sy=sx=1.
// in this case each feature plane can be safely treated
// as 1D array, and we just extract next portion
// of CONV_NR elements from each feature plane and
// put it together.
inptr += zyx0;
if (!partial)
{
// Make special branch where memcpy() is called with a constant buffer size.
// Compilers will likely unroll this loop properly.
#ifdef CONV_ARM_FP16
if (useFP16)
{
for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
_cvt32f16f(inptr, (float16_t *)inpbuf, CONV_NR);
}
else
#endif
for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
memcpy(inpbuf, inptr, CONV_NR_esz);
}
else
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
{
_cvt32f16f(inptr, (float16_t *)inpbuf, slice_len);
memset(inpbuf + slice_len * esz, 0, (CONV_NR - slice_len) * esz);
}
}
else
#endif
for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
{
memcpy(inpbuf, inptr, slice_len * esz);
memset(inpbuf + slice_len * esz, 0, (CONV_NR - slice_len) * esz);
}
}
}
else if (conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
{
CV_Assert(Cg == 1);
const int HW0 = H0 * W0;
const int HWi = Hi * Wi;
int slice_len = std::min(zyx_limit - zyx0, CONV_NR);
// here some non-continuous sub-row of the row will not be
// filled from the tensor; we need to make sure that the uncovered
// elements are explicitly set to 0's. the easiest way is to
// set all the elements to 0's before the loop.
memset(inpbuf, 0, stripesize * esz);
int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0;
int y0 = yx0 / W0, x0 = yx0 - y0 * W0;
if (conv_dim == CONV_1D)
{
for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
{
int delta = std::min(slice_len - slice_i, W0 - x0);
int x1 = x0 + delta;
int in_w = x0 * stride_w - pad_left;
float* inptrIn = inptr + in_w;
int s0 = slice_i;
for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
{
// Pack 8
if (x0 + 8 <= x1 && 0 <= in_w &&
in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w)
{
packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
}
else if (x0 + 2 <= x1 && 0 <= in_w &&
in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
{
packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
}
else
{
int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
const float* inptrInC = inptrIn;
#ifdef CONV_ARM_FP16
if (useFP16)
{
float16_t* inpbufC = (float16_t *)inpbuf + s0;
for (int w = w0; w < w1; w++)
{
int imgofs = w*dilation_w;
inpbufC[w*CONV_NR] = (float16_t)inptrInC[imgofs];
}
}
else
#endif
{
float* inpbufC = (float *)inpbuf + s0;
for (int w = w0; w < w1; w++)
{
int imgofs = w*dilation_w;
inpbufC[w*CONV_NR] = inptrInC[imgofs];
}
}
}
}
slice_i += delta;
}
}
else if (conv_dim == CONV_2D)
{
for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
{
int delta = std::min(slice_len - slice_i, W0 - x0);
int x1 = x0 + delta;
int in_h = y0 * stride_h - pad_top;
int in_w = x0 * stride_w - pad_left;
float* inptrIn = inptr + in_h*Wi + in_w;
bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h;
int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h);
int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h);
int s0 = slice_i;
for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
{
// Pack 8
if (ok_i && x0 + 8 <= x1 && 0 <= in_w &&
in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w)
{
packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
}
else if (ok_i && x0 + 2 <= x1 && 0 <= in_w &&
in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
{
packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
}
else
{
int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
const float* inptrInC = inptrIn;
#ifdef CONV_ARM_FP16
if (useFP16)
{
float16_t* inpbufC = (float16_t *)inpbuf + s0;
for (int h = h0; h < h1; h++)
{
for (int w = w0; w < w1; w++)
{
int imgofs = h*(dilation_h*Wi) + w*dilation_w;
inpbufC[(h*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
}
}
}
else
#endif
{
float* inpbufC = (float *)inpbuf + s0;
for (int h = h0; h < h1; h++)
{
for (int w = w0; w < w1; w++)
{
int imgofs = h*(dilation_h*Wi) + w*dilation_w;
inpbufC[(h*Wk + w)*CONV_NR] = inptrInC[imgofs];
}
}
}
}
}
slice_i += delta;
}
}
else if (conv_dim == CONV_3D)
{
for (int slice_i = 0; slice_i < slice_len; z0 += (y0+1)/H0, y0 = (y0+1)%H0, x0=0)
{
int delta = std::min(slice_len - slice_i, W0 - x0);
int x1 = x0 + delta;
int in_d = z0 * stride_d - pad_front;
int in_h = y0 * stride_h - pad_top;
int in_w = x0 * stride_w - pad_left;
float* inptrIn = inptr + in_d*HWi + in_h*Wi + in_w;
int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d);
int d1 = std::min(Dk, (Di - in_d + dilation_d - 1) / dilation_d);
bool ok_i = 0 <= in_d && in_d < Di - (Dk-1)*dilation_d &&
0 <= in_h && in_h < Hi - (Hk-1)*dilation_h;
int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h);
int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h);
int s0 = slice_i;
for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
{
// Pack 8
if (ok_i && x0 + 8 <= x1 && 0 <= in_w &&
in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w)
{
packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
}
else if (ok_i && x0 + 2 <= x1 && 0 <= in_w &&
in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
{
packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
}
else
{
int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
const float* inptrInC = inptrIn;
#ifdef CONV_ARM_FP16
if (useFP16)
{
float16_t* inpbufC = (float16_t* )inpbuf + s0;
for ( int d = d0; d < d1; d++)
{
for (int h = h0; h < h1; h++)
{
for (int w = w0; w < w1; w++)
{
int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
}
}
}
}
else
#endif
{
float* inpbufC = (float* )inpbuf + s0;
for ( int d = d0; d < d1; d++)
{
for (int h = h0; h < h1; h++)
{
for (int w = w0; w < w1; w++)
{
int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = inptrInC[imgofs];
}
}
}
}
}
}
slice_i += delta;
}
}
}
else
{
const int HW0 = H0 * W0;
const int HWi = Hi * Wi;
int z0_ = zyx0 / HW0, yx0 = zyx0 - z0_ * HW0;
int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0;
for (int k = 0; k < ksize; k++)
{
int dz = dhwTab[k * 3], dy = dhwTab[k * 3 + 1], dx = dhwTab[k * 3 + 2];
int i = 0, z0 = z0_, y0 = y0_, x0 = x0_;
for (; i < CONV_NR;)
{
float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i;
#ifdef CONV_ARM_FP16
float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i;
#endif
int zi = z0 * stride_d + dz - pad_front;
int yi = y0 * stride_h + dy - pad_top;
int xi = x0 * stride_w + dx - pad_left;
if ((unsigned) zi < (unsigned) Di && (unsigned) yi < (unsigned) Hi &&
(unsigned) xi < (unsigned) Wi)
{
const float *inptr_ki = inptr + zi * HWi + yi * Wi + xi;
if (i + 8 <= CONV_NR && x0 + 8 <= W0 && xi + stride_w * 8 <= Wi)
{
if (stride_w == 1)
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
{
float32x4_t v0 = vld1q_f32(inptr_ki);
float32x4_t v1 = vld1q_f32(inptr_ki + 4);
vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
}
}
else
#endif
for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
{
float t0 = inptr_ki[0], t1 = inptr_ki[1];
float t2 = inptr_ki[2], t3 = inptr_ki[3];
float t4 = inptr_ki[4], t5 = inptr_ki[5];
float t6 = inptr_ki[6], t7 = inptr_ki[7];
inpbuf_ki[0] = t0;
inpbuf_ki[1] = t1;
inpbuf_ki[2] = t2;
inpbuf_ki[3] = t3;
inpbuf_ki[4] = t4;
inpbuf_ki[5] = t5;
inpbuf_ki[6] = t6;
inpbuf_ki[7] = t7;
}
}
else if (stride_w == 2)
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
{
float32x4_t v0, v1;
v0[0] = inptr_ki[0], v0[1] = inptr_ki[2];
v0[2] = inptr_ki[4], v0[3] = inptr_ki[6];
v1[0] = inptr_ki[8], v1[1] = inptr_ki[10];
v1[2] = inptr_ki[12], v1[3] = inptr_ki[14];
vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
}
}
else
#endif
for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
{
float t0 = inptr_ki[0], t1 = inptr_ki[2];
float t2 = inptr_ki[4], t3 = inptr_ki[6];
float t4 = inptr_ki[8], t5 = inptr_ki[10];
float t6 = inptr_ki[12], t7 = inptr_ki[14];
inpbuf_ki[0] = t0;
inpbuf_ki[1] = t1;
inpbuf_ki[2] = t2;
inpbuf_ki[3] = t3;
inpbuf_ki[4] = t4;
inpbuf_ki[5] = t5;
inpbuf_ki[6] = t6;
inpbuf_ki[7] = t7;
}
}
else
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
{
float32x4_t v0, v1;
v0[0] = inptr_ki[0], v0[1] = inptr_ki[stride_w];
v0[2] = inptr_ki[stride_w * 2], v0[3] = inptr_ki[stride_w * 3];
v1[0] = inptr_ki[stride_w * 4], v1[1] = inptr_ki[stride_w * 5];
v1[2] = inptr_ki[stride_w * 6], v1[3] = inptr_ki[stride_w * 7];
vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
}
}
else
#endif
for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
{
float t0 = inptr_ki[0], t1 = inptr_ki[stride_w];
float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3];
float t4 = inptr_ki[stride_w * 4], t5 = inptr_ki[stride_w * 5];
float t6 = inptr_ki[stride_w * 6], t7 = inptr_ki[stride_w * 7];
inpbuf_ki[0] = t0;
inpbuf_ki[1] = t1;
inpbuf_ki[2] = t2;
inpbuf_ki[3] = t3;
inpbuf_ki[4] = t4;
inpbuf_ki[5] = t5;
inpbuf_ki[6] = t6;
inpbuf_ki[7] = t7;
}
}
i += 8;
x0 += 8;
}
else if (i + 4 <= CONV_NR && x0 + 4 <= W0 && xi + stride_w * 4 <= Wi)
{
if (stride_w == 1)
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
{
float32x4_t v0 = vld1q_f32(inptr_ki);
vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0));
}
}
else
#endif
for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
{
float t0 = inptr_ki[0], t1 = inptr_ki[1];
float t2 = inptr_ki[2], t3 = inptr_ki[3];
inpbuf_ki[0] = t0;
inpbuf_ki[1] = t1;
inpbuf_ki[2] = t2;
inpbuf_ki[3] = t3;
}
}
else
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
{
float32x4_t v0;
v0[0] = inptr_ki[0], v0[1] = inptr_ki[stride_w];
v0[2] = inptr_ki[stride_w * 2], v0[3] = inptr_ki[stride_w * 3];
vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0));
}
}
else
#endif
for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
{
float t0 = inptr_ki[0], t1 = inptr_ki[stride_w];
float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3];
inpbuf_ki[0] = t0;
inpbuf_ki[1] = t1;
inpbuf_ki[2] = t2;
inpbuf_ki[3] = t3;
}
}
i += 4;
x0 += 4;
}
else
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
inpbuf_ki_FP16[0] = (float16_t)(*inptr_ki);
}
else
#endif
for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
*inpbuf_ki = *inptr_ki;
i++;
x0++;
}
}
else
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR)
inpbuf_ki_FP16[0] = (float16_t)0.f;
}
else
#endif
for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR)
inpbuf_ki[0] = 0.f;
i++;
x0++;
}
int mask = x0 >= W0;
y0 += mask;
x0 &= mask - 1;
mask = y0 >= H0; // Only Conv 3D need jump at z0 dimension
if (mask && conv_dim != CONV_3D)
break;
z0 += mask;
y0 &= mask - 1;
}
}
}
}
}
// Main entry for the generic (im2row + GEMM-like) convolution: packs the input
// with im2row and multiplies it against the prepacked weights in cache-blocked
// micro-kernel tiles, with optional FP16 arithmetic (ARM), fused bias,
// fused element-wise Add and fused ReLU/ReLU6-style min/max clamping.
//
// _input/_output : continuous blobs: [N,C,W] (Conv1D), [N,C,H,W] (Conv2D) or
//                  [N,C,D,H,W] (Conv3D), matching conv->conv_dim.
// conv           : prepacked weights/bias plus all geometry (strides, pads,
//                  dilations, groups) computed at init time.
// ntasks         : number of parallel workers for parallel_for_.
// actLayer       : optional activation; ReLU (slope==0) and ReLU6 are folded
//                  into a [minval, maxval] clamp applied by the micro-kernels,
//                  anything else runs through activ->forwardSlice afterwards.
// reluslope      : only forwarded to the depthwise fast path (runDepthwise).
// fusedAdd       : if true, the current contents of _output are added to the
//                  convolution result (Conv+Add fusion; not for Conv3D).
void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, int ntasks,
                 const Ptr<ActivationLayer>& actLayer, const std::vector<float>& reluslope, bool fusedAdd)
{
    Mat input = _input.getMat();
    Mat output = _output.getMat();
    int conv_dim = conv->conv_dim;

    // Shapes must agree with the prepacked descriptor, and both blobs must be
    // continuous because everything below indexes through raw pointers.
    CV_Assert_N(input.dims == output.dims,
                input.size[0] == output.size[0],
                conv->C == input.size[1],
                conv->K == output.size[1],
                input.type() == output.type(),
                input.isContinuous(),
                output.isContinuous());

    const bool useFP16 = conv->useFP16;

    Mat fusedAddMat;
    if (fusedAdd)
    {
        CV_Assert(conv->conv_dim != CONV_3D && "Conv3D does not support Conv+Add fusion optimization!");
        // The values currently stored in the output blob are the second
        // operand of the fused element-wise Add.
        fusedAddMat = _output.getMat();
    }

    if (conv->conv_type == CONV_TYPE_DEPTHWISE)
    {
        // Depthwise-Convolution layer should not be followed by Add layer.
        CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D) && !useFP16);
        return runDepthwise(input, output, conv, actLayer.get(), reluslope, fusedAdd);
    }

    MatShape inputShape = shape(input);
    MatShape outputShape = shape(output);
    CV_Assert(inputShape.size() == outputShape.size());

    // Fold ReLU (negativeSlope == 0) and ReLU6 into a simple min/max clamp so
    // the micro-kernels apply them for free; any other activation (including
    // leaky ReLU) is kept as `activ` and run via forwardSlice after the GEMM.
    ActivationLayer* activ = nullptr;
    float minval = -FLT_MAX, maxval = FLT_MAX;
    bool ifMinMaxAct = false;

    if (actLayer)
    {
        Ptr<ReLULayer> activ_relu = actLayer.dynamicCast<ReLULayer>();
        Ptr<ReLU6Layer> activ_relu6 = actLayer.dynamicCast<ReLU6Layer>();

        if (!activ_relu.empty())
        {
            if (activ_relu->negativeSlope == 0.0f)
            {
                minval = 0.0f;
                ifMinMaxAct = true;
                activ = nullptr;
            }
            else // Leaky ReLU
            {
                activ = actLayer.get();
            }
        }
        else if (!activ_relu6.empty())
        {
            minval = activ_relu6->minValue;
            maxval = activ_relu6->maxValue;
            ifMinMaxAct = true;
            activ = nullptr;
        }
        else
            activ = actLayer.get();
    }
    else
        activ = nullptr;

    // TODO: support FP16 for winograd.
    if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
    {
        CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D && !useFP16);
        // runWinograd63 may decline (return false); then we fall through to
        // the generic path below.
        if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
            return;
    }

    int N = inputShape[0], C = inputShape[1];

    // input shape: [N, C, D, H, W] for Conv3D, [N, C, H, W] for Conv2D, [N, C, W] for Conv1D.
    int Di = conv_dim == CONV_3D ? inputShape[2] : 1;
    int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
    int Wi = inputShape[inputShape.size() - 1];

    int ngroups = conv->ngroups;
    int K = conv->K, Dk = conv->Dk, Hk = conv->Hk, Wk = conv->Wk;

    int D0 = conv_dim == CONV_3D ? outputShape[2] : 1;
    int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
    int W0 = outputShape[outputShape.size() - 1];

    int Cg = C/ngroups, Kg = K/ngroups;

    const size_t inp_planesize = (size_t)Di*Hi*Wi;
    const size_t out_planesize = (size_t)D0*H0*W0;

    int pad_front = conv->pad_front;
    int pad_top = conv->pad_top;
    int pad_left = conv->pad_left;

    int stride_d = conv->stride_d, stride_h = conv->stride_h, stride_w = conv->stride_w;
    int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;

    int ksize = Dk*Hk*Wk;
    // 1x1 kernel with unit strides and no padding: im2row degenerates into a
    // plain repack of the input, which gets its own fast path in phase 1.
    bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1
            && pad_front == 0 && pad_left == 0 && pad_top == 0;
    int DkHkWkCg = Dk*Hk*Wk*Cg;

    // ofstab[k] is the flat input offset of kernel tap k; dhwTab[k*3 .. k*3+2]
    // holds its dilated (d, h, w) displacements. Built per conv_dim below.
    std::vector<int> ofstab_(Hk*Wk*Dk*4, 0);
    int* ofstab = ofstab_.data();
    int* dhwTab = ofstab + Hk*Wk*Dk;
    int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN;

    if (conv_dim == CONV_1D)
    {
        for( int w = 0; w < Wk; w++)
        {
            int dw = w*dilation_w;
            dhwTab[w*3+2] = dw;
            ofstab[w] = dw;
        }
    }
    else if (conv_dim == CONV_2D)
    {
        for (int h = 0; h < Hk; h++)
            for( int w = 0; w < Wk; w++)
            {
                int k = h*Wk + w;
                int dh = h*dilation_h, dw = w*dilation_w;
                dhwTab[k*3+1] = dh;
                dhwTab[k*3+2] = dw;
                ofstab[k] = dh*Wi + dw;
            }
    }
    else
    {
        for (int d = 0; d < Dk; d++)
            for (int h = 0; h < Hk; h++)
            {
                for (int w = 0; w < Wk; w++)
                {
                    int k = d*Hk*Wk + h*Wk + w;
                    int dd = d*dilation_d, dh = h*dilation_h, dw = w*dilation_w;
                    dhwTab[k*3] = dd;
                    dhwTab[k*3+1] = dh;
                    dhwTab[k*3+2] = dw;
                    ofstab[k] = dd*Hi*Wi + dh*Wi + dw;
                }
            }
    }

    // Micro-kernel tile sizes and packed element size; the FP16 path uses its
    // own tiling constants and 2-byte elements in the packed buffers.
    int CONV_NR = CONV_NR_FP32;
    int CONV_MR = CONV_MR_FP32;
    int esz = sizeof(float );
#ifdef CONV_ARM_FP16
    if (useFP16)
    {
        // works at FP 16.
        CONV_NR = CONV_NR_FP16;
        CONV_MR = CONV_MR_FP16;
        esz = sizeof(float16_t);
    }
#endif

    int MAX_STRIPES = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : (56 + CONV_NR - 1)/CONV_NR;

    // Friendly to L1 cache
    const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
    const int C_BLOCK_SIZE = 256;

    int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR;
    int Kg_aligned = Kg_nblocks * CONV_MR;

    int stripes_per_plane0 = ((int)out_planesize + CONV_NR - 1) / CONV_NR;
    int stripes_per_plane = stripes_per_plane0;

    // Choose the parallelization axis: with few output stripes (or the
    // depthwise-remain type) parallelize over output channels, otherwise over
    // output-position stripes.
    if (stripes_per_plane < ntasks * 4 || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
    {
        MAX_STRIPES = 1;
        stripes_per_plane = 1;
    }
    else
        Kg_nblocks = 1;

    // "seperateIm2col": run im2row once for the whole tensor as a separate
    // phase (phase 1 below) instead of packing on the fly in the compute loop.
    bool seperateIm2col = fast_1x1 || stripes_per_plane == 1;

    int Kstripes = Kg_nblocks * stripes_per_plane;
    int nsubtasks = N * ngroups * Kstripes;

    // Buffer layout: per-task [FP32 accumulator | optional on-the-fly im2row
    // area], then (when seperateIm2col) one shared packed-input region.
    size_t stripesize = alignSize(CONV_NR * ksize * Cg, VEC_ALIGN);
    size_t cbufsize = alignSize(CONV_NR * K_BLOCK_SIZE * MAX_STRIPES, VEC_ALIGN);

    size_t taskbufsize = cbufsize * sizeof(float );

    if (!seperateIm2col)
        taskbufsize += MAX_STRIPES * stripesize * esz;

    size_t totalbufsize_base = taskbufsize * ntasks;
    size_t totalbufsize = totalbufsize_base;
    if (seperateIm2col)
        totalbufsize += N * ngroups * stripes_per_plane0 * stripesize * esz;

    AutoBuffer<char> inpbuf_all_;
    char* inpbuf_all = nullptr;

    inpbuf_all_.allocate(totalbufsize + VEC_ALIGN * sizeof(float ));
    inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN * sizeof(float )));
    // Start of the shared packed-input region used by the separate-im2col phase.
    char* inpbuf_all_0 = inpbuf_all + totalbufsize_base;

    float* inp = input.ptr<float>();
    float* out = output.ptr<float>();
    float* fusedAddPtr0 = fusedAddMat.empty() ? 0 : fusedAddMat.ptr<float>();

    // In the case of 1x1 convolution we first reorder the whole input tensor.
    // In general, im2row results in Hk*Wk-x unrolling factor
    // (e.g. 3*3=9x unrolling for 3x3 convolution), thus for 1x1 convolution
    // the reordered tensor will take as much space as the original tensor.
    if (seperateIm2col)
    {
        // the optional phase 1. im2row
        parallel_for_(Range(0, ntasks), [&](const Range& r0) {
        for (int task_id = r0.start; task_id < r0.end; task_id++)
        {
            if (fast_1x1)
            {
                // Split the flat (image, channel) range evenly across tasks;
                // each step packs up to one group's worth of channels.
                int nc0 = task_id*N*C/ntasks, nc1 = (task_id+1)*N*C/ntasks, dc = 0;
                for (; nc0 < nc1; nc0 += dc)
                {
                    int n = nc0/C, c0 = nc0 - n*C;
                    int g = c0 / Cg;
                    c0 -= g*Cg;
                    dc = Cg - c0 <= nc1 - nc0 ? Cg - c0 : nc1 - nc0;

                    float * inptr_ = inp + (size_t)nc0*inp_planesize;
                    char* inpbuf_ = inpbuf_all_0 + ((n*ngroups + g)*stripes_per_plane0*stripesize + c0*CONV_NR)*esz;

                    packInputData(inpbuf_, inptr_, ofstab, dhwTab, 0, out_planesize, ksize, stride_d, stride_h,
                                  stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w,
                                  Di, Hi, Wi, H0, W0, dc, stripesize, 0, inp_planesize, conv->conv_dim,
                                  conv->conv_type, CONV_NR, esz, fast_1x1, useFP16);
                }
            }
            else
            {
                // Split the (image, group, stripe) index space across tasks.
                const int allTasks = N * ngroups * stripes_per_plane0;
                int ngs0 = task_id*allTasks/ntasks, ngs1 = (task_id+1)*allTasks/ntasks, ds = 0;

                for (; ngs0 < ngs1; ngs0 += ds)
                {
                    int n = ngs0 / (ngroups * stripes_per_plane0), gs0 = ngs0 - n*ngroups*stripes_per_plane0;
                    int g = gs0 / stripes_per_plane0, s0 = gs0 - g*stripes_per_plane0;

                    ds = stripes_per_plane0 - s0 <= ngs1 - ngs0 ? stripes_per_plane0 - s0 : ngs1 - ngs0;

                    int zyx = s0 * CONV_NR;
                    int zyx_limit = (s0 + ds) * CONV_NR < out_planesize ? (s0 + ds) * CONV_NR : out_planesize;

                    float * inptr_ = inp + (size_t)(n * ngroups + g) * Cg * inp_planesize;
                    char* inpbuf_ = inpbuf_all_0 + ((n * ngroups + g) * stripes_per_plane0 * stripesize + s0 * stripesize) * esz;

                    packInputData(inpbuf_, inptr_, ofstab, dhwTab, zyx, zyx_limit, ksize, stride_d, stride_h,
                                  stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w,
                                  Di, Hi, Wi, H0, W0, Cg, stripesize, 0, inp_planesize, conv->conv_dim,
                                  conv->conv_type, CONV_NR, esz, fast_1x1, useFP16);
                }
            }
        }
        });
    }

    // Compute
    parallel_for_(Range(0, ntasks), [&](const Range& r0) {
    for (int task_id = r0.start; task_id < r0.end; task_id++)
    {
        // Per-task scratch: FP32 accumulator tile (cbuf_task) followed by the
        // on-the-fly im2row buffer (used only when !seperateIm2col).
        float * cbuf_task = (float *)(inpbuf_all + taskbufsize * task_id);
        char * inpbuf_task = (char*)(cbuf_task + cbufsize);

        int ngs0 = (int)((size_t)nsubtasks * task_id / ntasks);
        int ngs1 = (int)((size_t)nsubtasks * (task_id+1) / ntasks);
        for (int subtask = ngs0; subtask < ngs1; )
        {
            // Decode the subtask range into (image n, group g) plus either a
            // K-channel slice or an output-position slice, depending on the
            // parallelization axis chosen above.
            int ng = subtask / Kstripes;
            int kzyx0 = subtask - ng * Kstripes;
            int kzyx1 = kzyx0 + (ngs1 - subtask);
            int n = ng / ngroups, g = ng % ngroups; // ng - n * ngroups;
            size_t inp_plane_ofs = (size_t)(n * ngroups + g) * Cg * inp_planesize;
            kzyx1 = kzyx1 <= Kstripes ? kzyx1 : Kstripes;
            subtask += kzyx1 - kzyx0;
            int k0, k1;
            int zyx0, zyx_limit, zyx_block_limit = 0;
            if (stripes_per_plane == 1 || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
            {
                // Parallel over output channels: [k0, k1) of Kg, whole plane.
                k0 = kzyx0 * CONV_MR;
                k1 = kzyx1 * CONV_MR;
                k1 = k1 <= Kg ? k1 : Kg;
                zyx0 = 0;
                zyx_limit = (int)out_planesize;
            }
            else
            {
                // Parallel over output positions: all Kg channels, a plane slice.
                k0 = 0;
                k1 = Kg;
                zyx0 = kzyx0 * CONV_NR;
                zyx_limit = kzyx1 * CONV_NR;
                zyx_limit = zyx_limit < out_planesize ? zyx_limit : (int)out_planesize;
            }

            for (; zyx0 < zyx_limit; zyx0 = zyx_block_limit)
            {
                // step 1. extract part of input tensor and represent it in zigzag form
                zyx_block_limit = zyx0 + CONV_NR * MAX_STRIPES;
                zyx_block_limit = zyx_block_limit < zyx_limit ? zyx_block_limit : zyx_limit;

                int nstripes = (zyx_block_limit - zyx0 + CONV_NR - 1) / CONV_NR;
                CV_Assert(nstripes <= MAX_STRIPES);

                if (!seperateIm2col)
                {
                    packInputData(inpbuf_task, inp, ofstab, dhwTab, zyx0, zyx_block_limit, ksize, stride_d, stride_h,
                                  stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w,
                                  Di, Hi, Wi, H0, W0, Cg, stripesize, inp_plane_ofs, inp_planesize, conv->conv_dim,
                                  conv->conv_type, CONV_NR, esz, fast_1x1, useFP16);
                }

                char *weights = nullptr;
#ifdef CONV_ARM_FP16
                if (useFP16)
                {
                    CV_Assert(!conv->weightsBuf_FP16.empty());
                    weights = (char *)conv->weightsBufPtr_FP16;
                }
                else
#endif
                {
                    CV_Assert(!conv->weightsBuf.empty());
                    weights = (char *)conv->weightsBufPtr;
                }
                // optional branch, only for depth-wise convolution which was implemented by generic convolution.
                // In this case, CONV_MR is 1, and CONV_NR remains the same.
                if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
                {
                    CV_Assert(weights);
                    size_t outofs = (n * ngroups + g) * out_planesize + zyx0;
                    float *cptr0 = cbuf_task;
                    weights += g * padded_ksize * esz;

                    int out_width = zyx_block_limit - zyx0;
                    float *outptr = out + outofs;
                    const float biasVal = *(conv->biasBuf.data() + g);
                    const char *inptr_ = seperateIm2col ? inpbuf_all_0 + (ng*stripes_per_plane0 + zyx0/CONV_NR) * stripesize * esz:
                                         inpbuf_task;
                    for (int stripe = 0; stripe < nstripes; stripe++)
                    {
                        const char *inptr = inptr_ + stripe * stripesize * esz;
                        const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
                        bool ifBuffer = outLen < CONV_NR;
                        float *cptr = outptr + stripe * CONV_NR;
                        if (ifBuffer)
                        {
                            // Partial tail stripe: bounce through the scratch
                            // buffer so the kernel can write a full tile.
                            memcpy(cptr0, cptr, outLen * sizeof(float ));
                            cptr = cptr0;
                        }
#if CV_NEON && CV_NEON_AARCH64
                        if (conv->useNEON)
                        {
#ifdef CONV_ARM_FP16
                            if (useFP16)
                            {
                                opt_NEON::convBlockMR1_FP16(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
                            }
                            else
#endif
                            opt_NEON::convBlockMR1_F32(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
                        }
                        else
#endif
                        convBlockMR1(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);

                        if (ifBuffer)
                        {
                            memcpy(outptr + stripe * CONV_NR, cptr, outLen * sizeof(float ));
                        }
                    }
                    if (activ)
                        activ->forwardSlice(outptr, outptr, out_width, out_planesize, g, g + 1);
                    continue;
                }
                CV_Assert(weights);
                weights += g * Kg_aligned * DkHkWkCg * esz;

                const float *biasptr = conv->biasBuf.data() + Kg * g;
                int ldc = nstripes * CONV_NR;

                // 2. do convolution, compute Kg x (zyx_block_limit - zyx0) part of the output tensor
                int out_width = zyx_block_limit - zyx0;
                for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE)
                {
                    int k1_block = k0_block + K_BLOCK_SIZE < k1 ? k0_block + K_BLOCK_SIZE : k1;
                    // Reduce over the packed DkHkWkCg dimension in C_BLOCK_SIZE
                    // chunks; c0 == 0 tells the micro-kernel to initialize the tile.
                    for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE)
                    {
                        int c1 = c0 + C_BLOCK_SIZE < DkHkWkCg ? c0 + C_BLOCK_SIZE : DkHkWkCg;
                        const char *inptr = seperateIm2col ? inpbuf_all_0 + (ng*stripes_per_plane0 + zyx0/CONV_NR)*stripesize*esz:
                                            inpbuf_task;
                        inptr += (c0 * CONV_NR) * esz;
                        for (int stripe = 0; stripe < nstripes; stripe++, inptr += stripesize * esz)
                        {
                            const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);

                            char *wptr = weights + (k0_block * DkHkWkCg + c0 * CONV_MR) * esz;
                            float *cptr = cbuf_task + stripe * CONV_NR;
                            float16_t* cptr_f16 = (float16_t*)cbuf_task + stripe*CONV_NR;
                            for (int k = k0_block; k < k1_block; k += CONV_MR,
                                    wptr += DkHkWkCg * CONV_MR * esz, cptr += CONV_MR * ldc, cptr_f16 += CONV_MR * ldc)
                            {
#if CV_TRY_AVX2
                                if (conv->useAVX2)
                                    opt_AVX2::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                                else
#endif
#if CV_TRY_AVX
                                if (conv->useAVX)
                                    opt_AVX::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                                else
#endif
#if CV_NEON
                                if (conv->useNEON)
                                {
#ifdef CONV_ARM_FP16
                                    if (useFP16)
                                    {
                                        opt_NEON::convBlock_FP16(c1 - c0, wptr, inptr, (char *)cptr_f16, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                                    }
                                    else
#endif
                                    opt_NEON::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                                }
                                else
#endif
                                // The possible outLen range is 24 or 8~1.
                                convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                            }
                        }
                    }

                    // Store this K-block to the output: add bias, the optional
                    // fused-Add operand (pbptr), optional min/max clamp; the
                    // FP16 path widens the accumulator tile to FP32 on the way out.
                    size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
                    const float *cptr = cbuf_task;
                    const float16_t *cptr_fp16 = (const float16_t *)cbuf_task;

                    float *outptr = out + outofs;
                    const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : 0;

                    for (int k = k0_block; k < k1_block; k++,
                            cptr += ldc, cptr_fp16 += ldc, outptr += out_planesize,
                            pbptr += (pbptr ? out_planesize : 0))
                    {
                        float biasval = biasptr[k];
                        int j = 0;
#ifdef CONV_ARM_FP16
                        if (useFP16)
                        {
                            float32x4_t vbias = vdupq_n_f32(biasval);
                            float32x4_t vmax = vdupq_n_f32(maxval);
                            float32x4_t vmin = vdupq_n_f32(minval);

                            if (pbptr)
                            {
                                for (; j + 7 < out_width; j += 8)
                                {
                                    float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias;
                                    float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + + j + 4)) + vbias;

                                    v0 += vld1q_f32(pbptr + j);
                                    v1 += vld1q_f32(pbptr + j + 4);

                                    if (ifMinMaxAct)
                                    {
                                        v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax);
                                        v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax);
                                    }

                                    vst1q_f32(outptr + j, v0);
                                    vst1q_f32(outptr + j + 4, v1);
                                }
                            }
                            else
                            {
                                for (; j + 7 < out_width; j += 8)
                                {
                                    float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias;
                                    float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j + 4)) + vbias;

                                    if (ifMinMaxAct)
                                    {
                                        v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax);
                                        v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax);
                                    }

                                    vst1q_f32(outptr + j, v0);
                                    vst1q_f32(outptr + j + 4, v1);
                                }
                            }

                            if (pbptr)
                            {
                                // Scalar tail of the fused-Add FP16 path.
                                for (; j < out_width; j++)
                                {
                                    float v = (float )cptr_fp16[j] + biasval;
                                    v += pbptr[j];
                                    if (ifMinMaxAct)
                                        v = std::min(std::max(v, minval), maxval);
                                    outptr[j] = v;
                                }
                            }
                            else
                            {
                                for (; j < out_width; j++)
                                {
                                    float v = (float )cptr_fp16[j] + biasval;
                                    if (ifMinMaxAct)
                                        v = std::min(std::max(v, minval), maxval);
                                    outptr[j] = v;
                                }
                            }
                        }
                        else
#endif
                        {
#if CV_SIMD128
                            v_float32x4 vbias = v_setall_f32(biasval);
                            v_float32x4 vmax = v_setall_f32(maxval);
                            v_float32x4 vmin = v_setall_f32(minval);

                            if (pbptr)
                            {
                                for (; j + 7 < out_width; j += 8)
                                {
                                    v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
                                    v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);

                                    v0 = v_add(v0, v_load(pbptr + j));
                                    v1 = v_add(v1, v_load(pbptr + j + 4));

                                    if (ifMinMaxAct)
                                    {
                                        v0 = v_min(v_max(v0, vmin), vmax);
                                        v1 = v_min(v_max(v1, vmin), vmax);
                                    }

                                    v_store(outptr + j, v0);
                                    v_store(outptr + j + 4, v1);
                                }
                            }
                            else
                            {
                                for (; j + 7 < out_width; j += 8)
                                {
                                    v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
                                    v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);

                                    if (ifMinMaxAct)
                                    {
                                        v0 = v_min(v_max(v0, vmin), vmax);
                                        v1 = v_min(v_max(v1, vmin), vmax);
                                    }

                                    v_store(outptr + j, v0);
                                    v_store(outptr + j + 4, v1);
                                }
                            }
#endif
                            if (pbptr)
                            {
                                // Scalar tail of the fused-Add FP32 path.
                                for (; j < out_width; j++)
                                {
                                    float v = cptr[j] + biasval;
                                    v += pbptr[j];
                                    if (ifMinMaxAct)
                                        v = std::min(std::max(v, minval), maxval);
                                    outptr[j] = v;
                                }
                            }
                            else
                            {
                                for (; j < out_width; j++)
                                {
                                    float v = cptr[j] + biasval;
                                    if (ifMinMaxAct)
                                        v = std::min(std::max(v, minval), maxval);
                                    outptr[j] = v;
                                }
                            }
                        }

                        // Non-foldable activations run per output channel here.
                        if (activ)
                            activ->forwardSlice(outptr, outptr, out_width, out_planesize, Kg * g + k, Kg * g + k + 1);
                    }
                }
            }
        }
    }
    });
}
/****************************************************************************************\
SIMD and no-SIMD code for convBlock
\****************************************************************************************/
// Scalar reference kernel for the single-row (MR == 1) convolution block:
// computes outLen dot products c[j] = bias + sum_{p < np} a[p] * b[convNR*p + j].
// When init_c is true, the previous contents of c are added on top (the caller
// uses this for the fused element-wise Add); otherwise c is overwritten.
// If ifMinMaxAct is set, each result is clamped to [minval, maxval].
static inline void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                                      const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
{
    // Accumulate the dot products into a local scratch row first.
    std::vector<float> acc(outLen, 0.f);
    for (int p = 0; p < np; p++)
    {
        const float w = a[p];
        const float* brow = b + convNR * p;
        for (int j = 0; j < outLen; j++)
            acc[j] += brow[j] * w;
    }
    // Write back: add bias, optionally fold in the previous c, then clamp.
    for (int j = 0; j < outLen; j++)
    {
        float v = acc[j] + bias;
        if (init_c)
            v += c[j];
        if (ifMinMaxAct)
            v = std::min(std::max(v, minval), maxval);
        c[j] = v;
    }
}
#if CV_SIMD128
// MR == 1 micro-kernel specialized for convNR == 24 (one output channel,
// 24 output positions) using six 4-lane universal-intrinsic accumulators:
// c[j] = bias + sum_{p < np} a[p] * b[p*24 + j].
// When init_c is true the previous contents of c are ADDED on top (used by
// the caller for the fused element-wise Add); if ifMinMaxAct is set, the
// result is clamped to [minval, maxval] before storing.
static inline void convBlockMR1x24(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                                   const float minval, const float maxval, bool ifMinMaxAct, const int convNR)
{
    CV_Assert(convNR == 24);
    // The bias is folded into the accumulators' initial value.
    v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0;
    v_float32x4 c3 = c0, c4 = c0, c5 = c0;

    for (int p = 0; p < np; p++, a++, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
        v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);
        c2 = v_fma(b2, a0, c2);
        c3 = v_fma(b3, a0, c3);
        c4 = v_fma(b4, a0, c4);
        c5 = v_fma(b5, a0, c5);
    }

    if (init_c)
    {
        // Fused Add: accumulate the previous output values.
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + 4));
        c2 = v_add(c2, v_load(c + 8));
        c3 = v_add(c3, v_load(c + 12));
        c4 = v_add(c4, v_load(c + 16));
        c5 = v_add(c5, v_load(c + 20));
    }

    if (ifMinMaxAct)
    {
        // Folded ReLU/ReLU6-style clamp.
        v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
        c0 = v_min(v_max(c0, vmin), vmax);
        c1 = v_min(v_max(c1, vmin), vmax);
        c2 = v_min(v_max(c2, vmin), vmax);
        c3 = v_min(v_max(c3, vmin), vmax);
        c4 = v_min(v_max(c4, vmin), vmax);
        c5 = v_min(v_max(c5, vmin), vmax);
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + 8, c2);
    v_store(c + 12, c3);
    v_store(c + 16, c4);
    v_store(c + 20, c5);
}
// MR == 1 micro-kernel specialized for convNR == 12, the narrower sibling of
// convBlockMR1x24: c[j] = bias + sum_{p < np} a[p] * b[p*12 + j] in three
// 4-lane accumulators. init_c == true adds the previous contents of c on top
// (fused element-wise Add); ifMinMaxAct clamps the result to [minval, maxval].
static inline void convBlockMR1x12(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                                   const float minval, const float maxval, bool ifMinMaxAct, const int convNR)
{
    CV_Assert(convNR == 12);
    // The bias is folded into the accumulators' initial value.
    v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0;
    for (int p = 0; p < np; p++, a++, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);
        c2 = v_fma(b2, a0, c2);
    }

    if (init_c)
    {
        // Fused Add: accumulate the previous output values.
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + 4));
        c2 = v_add(c2, v_load(c + 8));
    }

    if (ifMinMaxAct)
    {
        // Folded ReLU/ReLU6-style clamp.
        v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
        c0 = v_min(v_max(c0, vmin), vmax);
        c1 = v_min(v_max(c1, vmin), vmax);
        c2 = v_min(v_max(c2, vmin), vmax);
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + 8, c2);
}
#endif
// Single-output-channel (MR == 1) block dispatcher: selects the SIMD kernel
// specialized for the packed width convNR, or the scalar reference fallback.
// The outLen represents the valid output count within a CONV_NR-wide stripe;
// when it is small (<= convNR/3) the SIMD kernels would waste most lanes, so
// the scalar path is used instead.
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
{
#if CV_SIMD128
    if (outLen > convNR/3)
    {
        if (convNR == 24)
        {
            convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR);
            return;
        }
        if (convNR == 12)
        {
            convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR);
            return;
        }
    }
#endif
    convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
}
#if CV_SIMD128
// 4x24 GEMM-like micro-kernel (convMR == 4, convNR == 24): accumulates a
// 4-rows x 24-cols tile C (+)= A*B over np reduction steps, where A is packed
// as np x 4 and B as np x 24. Note the init_c polarity is opposite to the MR1
// kernels: init_c == true means this is the FIRST reduction chunk, so the tile
// overwrites c; init_c == false adds the existing c values before storing.
// Rows of c are ldc floats apart.
static inline void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
    // One row of 6 accumulators per output channel (4 rows x 24 cols).
    v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0;
    v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6;
    v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12;
    v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18;

    for (int p = 0; p < np; p++, a += convMR, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
        v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);
        c2 = v_fma(b2, a0, c2);
        c3 = v_fma(b3, a0, c3);
        c4 = v_fma(b4, a0, c4);
        c5 = v_fma(b5, a0, c5);

        a0  = v_setall_f32(a[1]);
        c6  = v_fma(b0, a0, c6);
        c7  = v_fma(b1, a0, c7);
        c8  = v_fma(b2, a0, c8);
        c9  = v_fma(b3, a0, c9);
        c10 = v_fma(b4, a0, c10);
        c11 = v_fma(b5, a0, c11);

        a0 = v_setall_f32(a[2]);
        c12 = v_fma(b0, a0, c12);
        c13 = v_fma(b1, a0, c13);
        c14 = v_fma(b2, a0, c14);
        c15 = v_fma(b3, a0, c15);
        c16 = v_fma(b4, a0, c16);
        c17 = v_fma(b5, a0, c17);

        a0 = v_setall_f32(a[3]);
        c18 = v_fma(b0, a0, c18);
        c19 = v_fma(b1, a0, c19);
        c20 = v_fma(b2, a0, c20);
        c21 = v_fma(b3, a0, c21);
        c22 = v_fma(b4, a0, c22);
        c23 = v_fma(b5, a0, c23);
    }

    if (!init_c)
    {
        // Not the first reduction chunk: accumulate onto the existing tile.
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + 4));
        c2 = v_add(c2, v_load(c + 8));
        c3 = v_add(c3, v_load(c + 12));
        c4 = v_add(c4, v_load(c + 16));
        c5 = v_add(c5, v_load(c + 20));

        c6  = v_add(c6 , v_load(c + ldc));
        c7  = v_add(c7 , v_load(c + ldc + 4));
        c8  = v_add(c8 , v_load(c + ldc + 8));
        c9  = v_add(c9 , v_load(c + ldc + 12));
        c10 = v_add(c10, v_load(c + ldc + 16));
        c11 = v_add(c11, v_load(c + ldc + 20));

        c12 = v_add(c12, v_load(c + ldc*2));
        c13 = v_add(c13, v_load(c + ldc*2 + 4));
        c14 = v_add(c14, v_load(c + ldc*2 + 8));
        c15 = v_add(c15, v_load(c + ldc*2 + 12));
        c16 = v_add(c16, v_load(c + ldc*2 + 16));
        c17 = v_add(c17, v_load(c + ldc*2 + 20));

        c18 = v_add(c18, v_load(c + ldc*3));
        c19 = v_add(c19, v_load(c + ldc*3 + 4));
        c20 = v_add(c20, v_load(c + ldc*3 + 8));
        c21 = v_add(c21, v_load(c + ldc*3 + 12));
        c22 = v_add(c22, v_load(c + ldc*3 + 16));
        c23 = v_add(c23, v_load(c + ldc*3 + 20));
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + 8, c2);
    v_store(c + 12, c3);
    v_store(c + 16, c4);
    v_store(c + 20, c5);

    v_store(c + ldc, c6);
    v_store(c + ldc + 4, c7);
    v_store(c + ldc + 8, c8);
    v_store(c + ldc + 12, c9);
    v_store(c + ldc + 16, c10);
    v_store(c + ldc + 20, c11);

    v_store(c + ldc * 2, c12);
    v_store(c + ldc * 2 + 4, c13);
    v_store(c + ldc * 2 + 8, c14);
    v_store(c + ldc * 2 + 12, c15);
    v_store(c + ldc * 2 + 16, c16);
    v_store(c + ldc * 2 + 20, c17);

    v_store(c + ldc * 3, c18);
    v_store(c + ldc * 3 + 4, c19);
    v_store(c + ldc * 3 + 8, c20);
    v_store(c + ldc * 3 + 12, c21);
    v_store(c + ldc * 3 + 16, c22);
    v_store(c + ldc * 3 + 20, c23);
}
// 4x8 micro-kernel (convMR == 4, 8 output columns): same contract as
// convBlock4x24 but for an 8-wide tile — init_c == true overwrites c,
// init_c == false accumulates the existing tile. Rows are ldc floats apart.
static inline void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
    CV_Assert(convNR >= 4);
    v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
    v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0;

    for (int p = 0; p < np; p++, a += convMR, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 a1 = v_setall_f32(a[1]);
        v_float32x4 a2 = v_setall_f32(a[2]);
        v_float32x4 a3 = v_setall_f32(a[3]);

        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);

        c2 = v_fma(b0, a1, c2);
        c3 = v_fma(b1, a1, c3);

        c4 = v_fma(b0, a2, c4);
        c5 = v_fma(b1, a2, c5);

        c6 = v_fma(b0, a3, c6);
        c7 = v_fma(b1, a3, c7);
    }

    if (!init_c)
    {
        // Not the first reduction chunk: accumulate onto the existing tile.
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + 4));

        c2 = v_add(c2, v_load(c + ldc));
        c3 = v_add(c3, v_load(c + ldc + 4));

        c4 = v_add(c4, v_load(c + ldc*2));
        c5 = v_add(c5, v_load(c + ldc*2 + 4));

        c6 = v_add(c6, v_load(c + ldc*3));
        c7 = v_add(c7, v_load(c + ldc*3 + 4));
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + ldc, c2);
    v_store(c + ldc + 4, c3);
    v_store(c + ldc * 2, c4);
    v_store(c + ldc * 2 + 4, c5);
    v_store(c + ldc * 3, c6);
    v_store(c + ldc * 3 + 4, c7);
}
// 4x4 micro-kernel (convMR == 4, 4 output columns): narrowest SIMD tile.
// Same contract as convBlock4x24/4x8: init_c == true overwrites c,
// init_c == false accumulates the existing tile. Rows are ldc floats apart.
static inline void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
    CV_Assert(convNR >= 4);
    v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;

    for (int p = 0; p < np; p++, a += convMR, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 a1 = v_setall_f32(a[1]);
        v_float32x4 a2 = v_setall_f32(a[2]);
        v_float32x4 a3 = v_setall_f32(a[3]);

        v_float32x4 b0 = v_load(b);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b0, a1, c1);
        c2 = v_fma(b0, a2, c2);
        c3 = v_fma(b0, a3, c3);
    }

    if (!init_c)
    {
        // Not the first reduction chunk: accumulate onto the existing tile.
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + ldc));
        c2 = v_add(c2, v_load(c + ldc*2));
        c3 = v_add(c3, v_load(c + ldc*3));
    }

    v_store(c, c0);
    v_store(c + ldc, c1);
    v_store(c + ldc * 2, c2);
    v_store(c + ldc * 3, c3);
}
#endif
// Scalar reference kernel for the convMR x outLen output tile:
// C[i][j] (+)= sum_{p < np} A[p][i] * B[p][j], with A packed as np x convMR
// and B as np x convNR. init_c == true makes the tile OVERWRITE c; otherwise
// the tile is added to the existing c values. Rows of c are ldc floats apart.
static inline void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
                                   const int convMR, const int convNR)
{
    // Accumulate into a dense local tile so the inner loop stays simple.
    std::vector<float> acc(convMR * outLen, 0.f);
    for (int p = 0; p < np; p++)
    {
        const float* arow = a + convMR * p;
        const float* brow = b + convNR * p;
        for (int i = 0; i < convMR; i++)
        {
            const float w = arow[i];
            float* row = acc.data() + i * outLen;
            for (int j = 0; j < outLen; j++)
                row[j] += brow[j] * w;
        }
    }
    // Write the tile back, either overwriting or accumulating.
    for (int i = 0; i < convMR; i++)
    {
        float* crow = c + i * ldc;
        const float* row = acc.data() + i * outLen;
        if (init_c)
        {
            for (int j = 0; j < outLen; j++)
                crow[j] = row[j];
        }
        else
        {
            for (int j = 0; j < outLen; j++)
                crow[j] += row[j];
        }
    }
}
// Dispatch to the widest SIMD micro-kernel that fits the valid output width.
// The possible outLen range is [24, 8~1]; outLen == 1 (or no SIMD support)
// falls back to the scalar reference implementation.
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
               const int convMR, const int convNR)
{
#if CV_SIMD128
    CV_Assert(convMR == 4);
    if (convNR == 24 && outLen > 8)
        convBlock4x24(np, a, b, c, ldc, init_c, convMR, convNR);
    else if (outLen > 4 && outLen <= 8)
        convBlock4x8(np, a, b, c, ldc, init_c, convMR, convNR);
    else if (outLen > 1 && outLen <= 4)
        convBlock4x4(np, a, b, c, ldc, init_c, convMR, convNR);
    else
        convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
#else
    convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
#endif
}
}} // namespace cv::dnn