mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 09:25:45 +08:00
Merge pull request #22275 from zihaomu:fp16_support_conv
DNN: FP16 support on Convolution 2D #22275 ## FP16 support on ARM platform This PR adds an FP16 backend for Convolution. For now, FP16 is only supported on ARM aarch64. In addition to adding FP16, I also added the `seperateIm2col` optimization in this patch. ## How to use FP16 to speed up convolution? ``` Net net = readNet(modelPath); net.setPreferableTarget(DNN_TARGET_CPU_FP16); net.setInput(blob); Mat output = net.forward(); ``` ### TODO List | Task | Status | Remarks | |:-------:|:--------:|:------------:| | Convolution 2D FP16 | ✔️ | Done | | Winograd FP16 | | Because the current modification has reached 2k lines, Winograd FP16 will be completed in the next PR. | | Accuracy Test | ✔️ | Done | | Performance Test | ✔️ | Done | | Compiler bug | ✔️ | Done | ### Speed Test for FP16. **Test on M1 chip, 4 threads.** | Model Name | FP32 (Conv+Wino) | Conv(FP16) + Wino(FP32) | |:-------:|:--------:|:------------:| | ResNet 50 | 26.0 ms | **18.05 ms (25% speed up)** | | MobileNet V2 | 4.17 ms | **3.09 ms (29% speed up)** | ### Speed Test for `seperateIm2col` trick on X86. 
**Test on AMD 5600X, 12 threads.** | Model Name | 4.x | Patch | |:-------:|:--------:|:------------:| | MobileNet V2 | 5.6 ms | **3.0 ms (46% speed up)** | ### Performance Test #### Performance Test of X86 platform: AMD 5600X, with `--perf_threads=1` |Name of Test|4.x|patch|patch vs 4.x (x-factor)| |---|:-:|:-:|:-:| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|1.00| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|1.03| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.001|0.001|0.92| |conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.002|0.003|0.95| |conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.006|0.006|1.00| |conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.045|0.033|1.39| |conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.011|0.009|1.17| |conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.109|0.078|1.39| |conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.040|0.042|0.94| |conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.326|0.342|0.95| |conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.580|0.589|0.99| |conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.293|1.382|0.94| 
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.590|3.710|0.97| |conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.120|1.191|0.94| |conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.576|2.872|0.90| |conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.599|4.670|0.98| |conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|9.230|9.582|0.96| |conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|65.946|69.381|0.95| |conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|18.915|19.289|0.98| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|1.404|1.457|0.96| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|2.060|1.501|1.37| |conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.409|1.464|0.96| |conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|1.793|1.838|0.98| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.207|1.199|1.01| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.277|1.275|1.00| |conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.319|2.370|0.98| |conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.351|1.346|1.00| |conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|3.520|3.612|0.97| |conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.876|1.880|1.00| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, 
OCV/CPU)|1.981|1.995|0.99| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|2.620|2.627|1.00| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|4.202|4.123|1.02| |conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.429|2.445|0.99| |conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|2.591|2.576|1.01| |conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|3.005|2.998|1.00| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|3.515|3.532|1.00| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|3.115|3.134|0.99| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|3.937|3.899|1.01| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|5.533|5.471|1.01| |conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.472|3.464|1.00| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|4.302|4.322|1.00| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|6.100|6.035|1.01| |conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|6.580|6.484|1.01| |conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|9.741|9.634|1.01| |conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|10.131|10.156|1.00| |conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|12.391|12.350|1.00| |conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|91.074|87.893|1.04| |conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], 
OCV/CPU)|5.903|5.903|1.00| |conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.890|6.794|1.01| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.160|5.131|1.01| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|4.970|5.036|0.99| |conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|5.045|5.015|1.01| |conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|11.583|11.343|1.02| |conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.348|5.320|1.01| |conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|5.357|5.396|0.99| |conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|6.050|6.006|1.01| |conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|5.952|5.953|1.00| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|8.014|8.014|1.00| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.472|12.577|0.99| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|10.803|10.655|1.01| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|18.429|13.405|1.37| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|6.659|6.647|1.00| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|14.192|13.819|1.03| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|6.045|6.068|1.00| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.742|12.828|0.99| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], 
OCV/CPU)|8.046|7.773|1.04| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.440|17.192|1.01| |conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|15.418|14.972|1.03| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.430|0.430|1.00| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|6.692|6.663|1.00| |conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|6.350|6.347|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.267|0.265|1.01| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|7.755|7.558|1.03| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.203|0.202|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.663|10.576|1.01| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|10.827|10.614|1.02| |conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|7.049|6.947|1.01| |conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|6.900|6.901|1.00| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.165|0.165|1.00| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|17.953|17.251|1.04| |conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|7.430|7.320|1.01| |conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|22.187|21.705|1.02| |conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|8.349|8.126|1.03| |conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, 
OCV/CPU)|8.273|8.297|1.00| |conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|8.169|8.094|1.01| |conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|13.602|13.359|1.02| |conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|8.633|8.584|1.01| |conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|29.339|28.897|1.02| |conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|13.000|12.920|1.01| |conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|14.262|13.319|1.07| |conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|27.453|27.253|1.01| |conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|32.052|27.269|1.18| |conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|15.363|15.208|1.01| |conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|18.543|18.434|1.01| |conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|39.114|37.954|1.03| |conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|36.271|36.972|0.98| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|19.262|19.427|0.99| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.298|19.349|1.00| |conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.261|19.847|1.02| |conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.867|21.525|1.02| |conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|51.756|49.979|1.04| |conv::Conv::(GFLOPS=6.116, K=[3 
x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|28.133|27.060|1.04| |conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|25.035|24.980|1.00| |conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|25.858|25.821|1.00| |conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|27.313|27.149|1.01| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|28.219|28.111|1.00| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|46.025|46.674|0.99| |conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|30.220|29.446|1.03| |conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|49.410|48.708|1.01| |conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|38.203|38.001|1.01| |conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|39.961|39.021|1.02| |conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|48.685|47.075|1.03| |conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|75.114|72.586|1.03| |conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|41.222|41.144|1.00| |conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|46.220|46.353|1.00| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|98.201|98.771|0.99| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|100.106|96.971|1.03| |conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|146.977|140.445|1.05| |conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, 
OCV/CPU)|198.618|194.665|1.02| #### Performance Test of ARM platform: Apple M1, with `--perf_threads=1` Min (ms) |Name of Test|4.x|patch|4.x vs patch (x-factor)| |---|:-:|:-:|:-:| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|1.07| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|1.10| |conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.002|0.002|0.97| |conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.003|0.003|0.84| |conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.009|0.009|1.00| |conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.027|0.030|0.90| |conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.008|0.007|1.07| |conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.066|0.072|0.91| |conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.090|0.054|1.68| |conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.328|0.409|0.80| |conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.659|0.697|0.95| |conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.266|1.403|0.90| |conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.550|4.145|0.86| |conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.188|1.375|0.86| 
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.683|3.236|0.83| |conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.491|5.501|0.82| |conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|8.916|10.181|0.88| |conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.995|72.296|0.97| |conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|22.531|23.139|0.97| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|2.239|1.933|1.16| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU_FP16)|-|1.010|-| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|3.134|2.068|1.52| |conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU_FP16)|-|1.062|-| |conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.918|1.920|1.00| |conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU_FP16)|-|1.014|-| |conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.340|2.352|0.99| |conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.247|-| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.116|1.111|1.00| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU_FP16)|-|1.114|-| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.116|1.112|1.00| |conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|1.113|-| |conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|3.067|3.085|0.99| |conv::Conv::(GFLOPS=0.280, K=[1 x 
1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.622|-| |conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.153|1.187|0.97| |conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU_FP16)|-|1.150|-| |conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|4.804|4.849|0.99| |conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU_FP16)|-|2.922|-| |conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.463|1.469|1.00| |conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.459|-| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.577|1.580|1.00| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU_FP16)|-|1.580|-| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|1.826|1.818|1.00| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU_FP16)|-|1.817|-| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|6.541|5.081|1.29| |conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU_FP16)|-|2.809|-| |conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.912|1.919|1.00| |conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.919|-| |conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|1.961|1.971|0.99| |conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|1.961|-| |conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.317|2.329|0.99| |conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU_FP16)|-|2.322|-| 
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|2.920|2.947|0.99| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU_FP16)|-|2.924|-| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|2.467|2.466|1.00| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU_FP16)|-|2.496|-| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|3.028|2.997|1.01| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|2.986|-| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|4.353|4.355|1.00| |conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU_FP16)|-|4.355|-| |conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.762|2.793|0.99| |conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|2.797|-| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|3.428|3.226|1.06| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU_FP16)|-|3.223|-| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|3.967|3.957|1.00| |conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU_FP16)|-|3.960|-| |conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|4.806|4.387|1.10| |conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU_FP16)|-|4.366|-| |conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|14.509|11.756|1.23| |conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|6.510|-| |conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, 
S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|13.718|13.287|1.03| |conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU_FP16)|-|7.190|-| |conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|15.133|14.853|1.02| |conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU_FP16)|-|8.671|-| |conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|41.928|43.328|0.97| |conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU_FP16)|-|38.072|-| |conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.409|4.428|1.00| |conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|4.427|-| |conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.144|5.363|1.15| |conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU_FP16)|-|5.368|-| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|3.926|3.932|1.00| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.938|-| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.920|3.915|1.00| |conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.950|-| |conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|3.767|3.764|1.00| |conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|3.762|-| |conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|19.959|13.875|1.44| |conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU_FP16)|-|7.781|-| |conv::Conv::(GFLOPS=1.261, K=[3 x 3], 
IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|3.951|3.955|1.00| |conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.969|-| |conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|4.050|4.034|1.00| |conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.093|-| |conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|4.923|4.506|1.09| |conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.509|-| |conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|4.759|4.476|1.06| |conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.447|-| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|6.079|5.628|1.08| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU_FP16)|-|5.625|-| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.843|17.523|1.13| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|8.917|-| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|8.334|8.247|1.01| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU_FP16)|-|8.246|-| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|23.164|18.199|1.27| |conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|9.305|-| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|5.184|5.178|1.00| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU_FP16)|-|5.149|-| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], 
OCV/CPU)|17.990|18.103|0.99| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|9.777|-| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|4.831|4.522|1.07| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|4.523|-| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.328|17.319|1.00| |conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|8.948|-| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|5.944|5.961|1.00| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU_FP16)|-|5.936|-| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.811|20.064|0.99| |conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|11.705|-| |conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|22.398|17.686|1.27| |conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU_FP16)|-|9.859|-| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.416|0.416|1.00| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.417|-| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|5.356|5.110|1.05| |conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU_FP16)|-|5.114|-| |conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|5.092|4.748|1.07| |conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.754|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, 
OCV/CPU)|0.260|0.229|1.13| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.229|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|5.872|5.460|1.08| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU_FP16)|-|5.460|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.161|0.161|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.161|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|7.176|7.175|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|7.162|-| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|7.174|7.185|1.00| |conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU_FP16)|-|7.157|-| |conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|5.400|5.180|1.04| |conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.201|-| |conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|5.330|5.188|1.03| |conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.177|-| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.115|0.115|1.00| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.115|-| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|26.156|20.222|1.29| |conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU_FP16)|-|11.203|-| |conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|5.627|5.543|1.02| 
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.506|-| |conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|27.925|27.741|1.01| |conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU_FP16)|-|17.217|-| |conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|6.359|6.062|1.05| |conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.048|-| |conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|6.559|6.322|1.04| |conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU_FP16)|-|6.280|-| |conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|6.412|6.200|1.03| |conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.197|-| |conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|9.167|8.624|1.06| |conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU_FP16)|-|8.626|-| |conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|6.755|6.491|1.04| |conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.520|-| |conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|35.664|34.752|1.03| |conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|20.260|-| |conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|9.514|9.414|1.01| |conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|9.462|-| |conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|10.631|9.963|1.07| |conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 
1], BIAS, OCV/CPU_FP16)|-|9.935|-| |conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|37.465|36.798|1.02| |conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU_FP16)|-|19.569|-| |conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|38.157|36.157|1.06| |conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU_FP16)|-|18.902|-| |conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.356|10.401|1.00| |conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|10.360|-| |conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|12.641|12.150|1.04| |conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU_FP16)|-|12.162|-| |conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|50.545|50.505|1.00| |conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|27.950|-| |conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|54.233|49.603|1.09| |conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU_FP16)|-|26.515|-| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|13.779|12.968|1.06| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|12.984|-| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|15.809|15.329|1.03| |conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|15.433|-| |conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|14.563|14.527|1.00| 
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|14.480|-| |conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|16.714|16.484|1.01| |conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|16.362|-| |conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|77.832|65.729|1.18| |conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|32.065|-| |conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|21.903|20.386|1.07| |conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU_FP16)|-|20.416|-| |conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|20.405|18.148|1.12| |conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU_FP16)|-|18.128|-| |conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.334|18.521|1.10| |conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|18.495|-| |conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.527|19.584|1.10| |conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|19.630|-| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|22.715|20.057|1.13| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|20.068|-| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|26.228|24.992|1.05| |conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|24.957|-| |conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, 
OCV/CPU)|21.524|21.581|1.00| |conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|21.782|-| |conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|34.094|31.964|1.07| |conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|31.925|-| |conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|28.677|27.813|1.03| |conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|27.808|-| |conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|31.274|27.892|1.12| |conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|27.910|-| |conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|30.533|30.007|1.02| |conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|30.089|-| |conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|39.837|38.312|1.04| |conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|38.477|-| |conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|32.480|29.237|1.11| |conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU_FP16)|-|29.452|-| |conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|33.544|32.832|1.02| |conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|32.784|-| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|134.481|130.678|1.03| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU_FP16)|-|70.134|-| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 
480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|127.930|126.530|1.01| |conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU_FP16)|-|65.261|-| |conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|201.346|187.007|1.08| |conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU_FP16)|-|91.525|-| |conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|252.038|245.587|1.03| |conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU_FP16)|-|125.477|-| ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
001a2c5195
commit
5229312ad2
@ -106,6 +106,7 @@ CV__DNN_INLINE_NS_BEGIN
|
||||
DNN_TARGET_CUDA_FP16,
|
||||
DNN_TARGET_HDDL,
|
||||
DNN_TARGET_NPU,
|
||||
DNN_TARGET_CPU_FP16, // Only the ARM platform is supported. Low precision computing, accelerate model inference.
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -13,7 +13,7 @@
|
||||
namespace cv { namespace dnn {
|
||||
CV__DNN_INLINE_NS_BEGIN
|
||||
#define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16)
|
||||
#define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU) // TODO: add DNN_TARGET_CPU_FP16
|
||||
#define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU || id == DNN_TARGET_CPU_FP16)
|
||||
Mutex& getInitializationMutex();
|
||||
void initializeLayerFactory();
|
||||
|
||||
|
@ -428,7 +428,6 @@ public:
|
||||
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
|
||||
{
|
||||
BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr);
|
||||
|
||||
std::vector<Mat> inputs;
|
||||
inputs_arr.getMatVector(inputs);
|
||||
// prepare weightsMat where each row is aligned and has enough zero padding on the right to
|
||||
@ -1405,7 +1404,8 @@ public:
|
||||
|
||||
CV_Assert(outputs[0].size[1] % ngroups == 0);
|
||||
fastConvImpl = initFastConv(weightsMat, &biasvec[0], ngroups, K, C, kernel_size, strides,
|
||||
dilations, pads_begin, pads_end, conv_dim, canUseWinograd);
|
||||
dilations, pads_begin, pads_end, conv_dim,
|
||||
preferableTarget == DNN_TARGET_CPU_FP16, canUseWinograd);
|
||||
}
|
||||
|
||||
runFastConv(inputs[0], outputs[0], fastConvImpl, nstripes, activ, reluslope, fusedAdd);
|
||||
|
@ -8,7 +8,7 @@ namespace cv {
|
||||
namespace dnn {
|
||||
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
|
||||
|
||||
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR);
|
||||
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR);
|
||||
|
||||
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
|
||||
|
||||
@ -17,7 +17,7 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
|
||||
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
|
||||
#endif
|
||||
|
||||
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
|
||||
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR)
|
||||
{
|
||||
CV_Assert(convMR == 4 && convNR == 24);
|
||||
__m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
|
||||
@ -28,29 +28,72 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
|
||||
__m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
|
||||
__m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();
|
||||
|
||||
for (int p = 0; p < np; p++, a += convMR, b += convNR)
|
||||
if (width > 16)
|
||||
{
|
||||
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
|
||||
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);
|
||||
for (int p = 0; p < np; p++, a += convMR, b += convNR)
|
||||
{
|
||||
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
|
||||
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);
|
||||
|
||||
c00 = _mm256_fmadd_ps(b0, a0, c00);
|
||||
c01 = _mm256_fmadd_ps(b1, a0, c01);
|
||||
c02 = _mm256_fmadd_ps(b2, a0, c02);
|
||||
c00 = _mm256_fmadd_ps(b0, a0, c00);
|
||||
c01 = _mm256_fmadd_ps(b1, a0, c01);
|
||||
c02 = _mm256_fmadd_ps(b2, a0, c02);
|
||||
|
||||
c10 = _mm256_fmadd_ps(b0, a1, c10);
|
||||
c11 = _mm256_fmadd_ps(b1, a1, c11);
|
||||
c12 = _mm256_fmadd_ps(b2, a1, c12);
|
||||
c10 = _mm256_fmadd_ps(b0, a1, c10);
|
||||
c11 = _mm256_fmadd_ps(b1, a1, c11);
|
||||
c12 = _mm256_fmadd_ps(b2, a1, c12);
|
||||
|
||||
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
|
||||
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
|
||||
|
||||
c20 = _mm256_fmadd_ps(b0, a0, c20);
|
||||
c21 = _mm256_fmadd_ps(b1, a0, c21);
|
||||
c22 = _mm256_fmadd_ps(b2, a0, c22);
|
||||
c20 = _mm256_fmadd_ps(b0, a0, c20);
|
||||
c21 = _mm256_fmadd_ps(b1, a0, c21);
|
||||
c22 = _mm256_fmadd_ps(b2, a0, c22);
|
||||
|
||||
c30 = _mm256_fmadd_ps(b0, a1, c30);
|
||||
c31 = _mm256_fmadd_ps(b1, a1, c31);
|
||||
c32 = _mm256_fmadd_ps(b2, a1, c32);
|
||||
c30 = _mm256_fmadd_ps(b0, a1, c30);
|
||||
c31 = _mm256_fmadd_ps(b1, a1, c31);
|
||||
c32 = _mm256_fmadd_ps(b2, a1, c32);
|
||||
}
|
||||
}
|
||||
else if (width > 8)
|
||||
{
|
||||
for (int p = 0; p < np; p++, a += convMR, b += convNR)
|
||||
{
|
||||
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
|
||||
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8);
|
||||
|
||||
c00 = _mm256_fmadd_ps(b0, a0, c00);
|
||||
c01 = _mm256_fmadd_ps(b1, a0, c01);
|
||||
|
||||
c10 = _mm256_fmadd_ps(b0, a1, c10);
|
||||
c11 = _mm256_fmadd_ps(b1, a1, c11);
|
||||
|
||||
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
|
||||
|
||||
c20 = _mm256_fmadd_ps(b0, a0, c20);
|
||||
c21 = _mm256_fmadd_ps(b1, a0, c21);
|
||||
|
||||
c30 = _mm256_fmadd_ps(b0, a1, c30);
|
||||
c31 = _mm256_fmadd_ps(b1, a1, c31);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int p = 0; p < np; p++, a += convMR, b += convNR)
|
||||
{
|
||||
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
|
||||
b0 = _mm256_load_ps(b);
|
||||
|
||||
c00 = _mm256_fmadd_ps(b0, a0, c00);
|
||||
c10 = _mm256_fmadd_ps(b0, a1, c10);
|
||||
|
||||
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
|
||||
|
||||
c20 = _mm256_fmadd_ps(b0, a0, c20);
|
||||
c30 = _mm256_fmadd_ps(b0, a1, c30);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (!init_c)
|
||||
{
|
||||
@ -87,7 +130,7 @@ namespace opt_NEON
|
||||
{
|
||||
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
|
||||
|
||||
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
|
||||
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR)
|
||||
{
|
||||
#if CV_NEON_AARCH64
|
||||
if (convMR == 4 && convNR == 28) // AARCH64
|
||||
@ -97,44 +140,105 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
|
||||
float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20;
|
||||
float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30;
|
||||
|
||||
for( int p = 0; p < np; p++, a += convMR, b += convNR )
|
||||
if (width > 16)
|
||||
{
|
||||
float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
|
||||
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);
|
||||
for( int p = 0; p < np; p++, a += convMR, b += convNR )
|
||||
{
|
||||
float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
|
||||
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);
|
||||
|
||||
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
|
||||
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
|
||||
c02 = vfmaq_laneq_f32(c02, b2, a0, 0);
|
||||
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
|
||||
c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
|
||||
c12 = vfmaq_laneq_f32(c12, b2, a0, 1);
|
||||
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
|
||||
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
|
||||
c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
|
||||
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
|
||||
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
|
||||
c32 = vfmaq_laneq_f32(c32, b2, a0, 3);
|
||||
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
|
||||
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
|
||||
c02 = vfmaq_laneq_f32(c02, b2, a0, 0);
|
||||
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
|
||||
c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
|
||||
c12 = vfmaq_laneq_f32(c12, b2, a0, 1);
|
||||
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
|
||||
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
|
||||
c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
|
||||
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
|
||||
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
|
||||
c32 = vfmaq_laneq_f32(c32, b2, a0, 3);
|
||||
|
||||
b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20);
|
||||
b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20);
|
||||
|
||||
c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
|
||||
c04 = vfmaq_laneq_f32(c04, b1, a0, 0);
|
||||
c05 = vfmaq_laneq_f32(c05, b2, a0, 0);
|
||||
c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
|
||||
c14 = vfmaq_laneq_f32(c14, b1, a0, 1);
|
||||
c15 = vfmaq_laneq_f32(c15, b2, a0, 1);
|
||||
c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
|
||||
c24 = vfmaq_laneq_f32(c24, b1, a0, 2);
|
||||
c25 = vfmaq_laneq_f32(c25, b2, a0, 2);
|
||||
c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
|
||||
c34 = vfmaq_laneq_f32(c34, b1, a0, 3);
|
||||
c35 = vfmaq_laneq_f32(c35, b2, a0, 3);
|
||||
c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
|
||||
c04 = vfmaq_laneq_f32(c04, b1, a0, 0);
|
||||
c05 = vfmaq_laneq_f32(c05, b2, a0, 0);
|
||||
c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
|
||||
c14 = vfmaq_laneq_f32(c14, b1, a0, 1);
|
||||
c15 = vfmaq_laneq_f32(c15, b2, a0, 1);
|
||||
c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
|
||||
c24 = vfmaq_laneq_f32(c24, b1, a0, 2);
|
||||
c25 = vfmaq_laneq_f32(c25, b2, a0, 2);
|
||||
c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
|
||||
c34 = vfmaq_laneq_f32(c34, b1, a0, 3);
|
||||
c35 = vfmaq_laneq_f32(c35, b2, a0, 3);
|
||||
|
||||
b0 = vld1q_f32(b + 24);
|
||||
c06 = vfmaq_laneq_f32(c06, b0, a0, 0);
|
||||
c16 = vfmaq_laneq_f32(c16, b0, a0, 1);
|
||||
c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
|
||||
c36 = vfmaq_laneq_f32(c36, b0, a0, 3);
|
||||
b0 = vld1q_f32(b + 24);
|
||||
c06 = vfmaq_laneq_f32(c06, b0, a0, 0);
|
||||
c16 = vfmaq_laneq_f32(c16, b0, a0, 1);
|
||||
c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
|
||||
c36 = vfmaq_laneq_f32(c36, b0, a0, 3);
|
||||
}
|
||||
}
|
||||
else if (width > 8)
|
||||
{
|
||||
for( int p = 0; p < np; p++, a += convMR, b += convNR )
|
||||
{
|
||||
float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
|
||||
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);
|
||||
|
||||
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
|
||||
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
|
||||
c02 = vfmaq_laneq_f32(c02, b2, a0, 0);
|
||||
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
|
||||
c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
|
||||
c12 = vfmaq_laneq_f32(c12, b2, a0, 1);
|
||||
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
|
||||
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
|
||||
c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
|
||||
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
|
||||
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
|
||||
c32 = vfmaq_laneq_f32(c32, b2, a0, 3);
|
||||
|
||||
b0 = vld1q_f32(b + 12);
|
||||
|
||||
c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
|
||||
c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
|
||||
c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
|
||||
c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
|
||||
}
|
||||
}
|
||||
else if (width > 4)
|
||||
{
|
||||
for( int p = 0; p < np; p++, a += convMR, b += convNR )
|
||||
{
|
||||
float32x4_t a0 = vld1q_f32(a), b0, b1;
|
||||
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4);
|
||||
|
||||
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
|
||||
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
|
||||
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
|
||||
c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
|
||||
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
|
||||
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
|
||||
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
|
||||
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( int p = 0; p < np; p++, a += convMR, b += convNR )
|
||||
{
|
||||
float32x4_t a0 = vld1q_f32(a), b0;
|
||||
b0 = vld1q_f32(b);
|
||||
|
||||
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
|
||||
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
|
||||
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
|
||||
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
|
||||
}
|
||||
}
|
||||
|
||||
if (!init_c)
|
||||
@ -204,26 +308,62 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
|
||||
float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
|
||||
float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int p = 0; p < np; p++, a += convMR, b += convNR)
|
||||
if (width > 8)
|
||||
{
|
||||
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
|
||||
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
|
||||
for (int p = 0; p < np; p++, a += convMR, b += convNR)
|
||||
{
|
||||
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
|
||||
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
|
||||
|
||||
c0 = vmlaq_lane_f32(c0, b0, a0, 0);
|
||||
c1 = vmlaq_lane_f32(c1, b1, a0, 0);
|
||||
c2 = vmlaq_lane_f32(c2, b2, a0, 0);
|
||||
c0 = vmlaq_lane_f32(c0, b0, a0, 0);
|
||||
c1 = vmlaq_lane_f32(c1, b1, a0, 0);
|
||||
c2 = vmlaq_lane_f32(c2, b2, a0, 0);
|
||||
|
||||
c3 = vmlaq_lane_f32(c3, b0, a0, 1);
|
||||
c4 = vmlaq_lane_f32(c4, b1, a0, 1);
|
||||
c5 = vmlaq_lane_f32(c5, b2, a0, 1);
|
||||
c3 = vmlaq_lane_f32(c3, b0, a0, 1);
|
||||
c4 = vmlaq_lane_f32(c4, b1, a0, 1);
|
||||
c5 = vmlaq_lane_f32(c5, b2, a0, 1);
|
||||
|
||||
c6 = vmlaq_lane_f32(c6, b0, a1, 0);
|
||||
c7 = vmlaq_lane_f32(c7, b1, a1, 0);
|
||||
c8 = vmlaq_lane_f32(c8, b2, a1, 0);
|
||||
c6 = vmlaq_lane_f32(c6, b0, a1, 0);
|
||||
c7 = vmlaq_lane_f32(c7, b1, a1, 0);
|
||||
c8 = vmlaq_lane_f32(c8, b2, a1, 0);
|
||||
|
||||
c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
|
||||
c10 = vmlaq_lane_f32(c10, b1, a1, 1);
|
||||
c11 = vmlaq_lane_f32(c11, b2, a1, 1);
|
||||
c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
|
||||
c10 = vmlaq_lane_f32(c10, b1, a1, 1);
|
||||
c11 = vmlaq_lane_f32(c11, b2, a1, 1);
|
||||
}
|
||||
}
|
||||
else if (width > 4)
|
||||
{
|
||||
for (int p = 0; p < np; p++, a += convMR, b += convNR)
|
||||
{
|
||||
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
|
||||
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4);
|
||||
|
||||
c0 = vmlaq_lane_f32(c0, b0, a0, 0);
|
||||
c1 = vmlaq_lane_f32(c1, b1, a0, 0);
|
||||
|
||||
c3 = vmlaq_lane_f32(c3, b0, a0, 1);
|
||||
c4 = vmlaq_lane_f32(c4, b1, a0, 1);
|
||||
|
||||
c6 = vmlaq_lane_f32(c6, b0, a1, 0);
|
||||
c7 = vmlaq_lane_f32(c7, b1, a1, 0);
|
||||
|
||||
c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
|
||||
c10 = vmlaq_lane_f32(c10, b1, a1, 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int p = 0; p < np; p++, a += convMR, b += convNR)
|
||||
{
|
||||
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
|
||||
b0 = vld1q_f32(b);
|
||||
|
||||
c0 = vmlaq_lane_f32(c0, b0, a0, 0);
|
||||
c3 = vmlaq_lane_f32(c3, b0, a0, 1);
|
||||
c6 = vmlaq_lane_f32(c6, b0, a1, 0);
|
||||
c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!init_c)
|
||||
@ -254,6 +394,366 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
|
||||
CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock");
|
||||
}
|
||||
|
||||
/**
 * NEON FP32 micro-kernel producing a single output row (MR == 1) of up to 28 columns.
 *
 * For every accumulated column j the result is
 *     bias + sum over p in [0, np) of a[p] * b[p * convNR + j];
 * columns that are not accumulated (see `width`) hold just `bias`.
 * The existing contents of `c` are then optionally added, the result is optionally
 * clamped, and all 28 values are written back to `c`.
 *
 * @param np           number of accumulation steps; each step consumes one scalar of `a`
 *                     and one row of `b`.
 * @param a            np scalars (the pointer advances by 1 per step).
 * @param b            np rows of convNR floats (the pointer advances by convNR per step).
 * @param c            in/out buffer; 28 floats are always stored, regardless of `width`,
 *                     and 28 floats are read when `init_c` is true.
 * @param bias         initial value of every accumulator.
 * @param init_c       when true, the 28 values already stored in `c` are added to the result.
 * @param minval       lower clamp bound, used only when `ifMinMaxAct` is true.
 * @param maxval       upper clamp bound, used only when `ifMinMaxAct` is true.
 * @param ifMinMaxAct  when true, every value is clamped as min(max(x, minval), maxval).
 * @param width        number of meaningful columns; selects how many 4-lane groups are
 *                     accumulated: > 16 -> all 7, > 8 -> first 4, > 4 -> first 2, else first 1.
 * @param convNR       row stride of `b` in floats; must be 28.
 *
 * NOTE(review): here `init_c == true` means "add what is already in c", whereas the
 * convBlock kernels in this file add the old contents when `init_c` is false.
 * Confirm against the callers before relying on the parameter name.
 */
void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const float bias, bool init_c,
                      const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR)
{
    CV_Assert(convNR == 28);

    // Seven accumulators x 4 lanes = 28 columns, all starting from the bias.
    float32x4_t c0 = vdupq_n_f32(bias), c1 = c0, c2 = c0;
    float32x4_t c3 = c0, c4 = c0, c5 = c0, c6 = c0;

    if (width > 16)
    {
        // Full tile: accumulate all 28 columns.
        for (int p = 0; p < np; p++, a++, b += convNR)
        {
            float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
            float32x4_t b3 = vld1q_f32(b + 12), b4 = vld1q_f32(b + 16), b5 = vld1q_f32(b + 20);
            float32x4_t b6 = vld1q_f32(b + 24);

            c0 = vmlaq_n_f32(c0, b0, a[0]);
            c1 = vmlaq_n_f32(c1, b1, a[0]);
            c2 = vmlaq_n_f32(c2, b2, a[0]);
            c3 = vmlaq_n_f32(c3, b3, a[0]);
            c4 = vmlaq_n_f32(c4, b4, a[0]);
            c5 = vmlaq_n_f32(c5, b5, a[0]);
            c6 = vmlaq_n_f32(c6, b6, a[0]);
        }
    }
    else if (width > 8)
    {
        // Only the first 16 columns are accumulated.
        for (int p = 0; p < np; p++, a++, b += convNR)
        {
            float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
            float32x4_t b3 = vld1q_f32(b + 12);

            c0 = vmlaq_n_f32(c0, b0, a[0]);
            c1 = vmlaq_n_f32(c1, b1, a[0]);
            c2 = vmlaq_n_f32(c2, b2, a[0]);
            c3 = vmlaq_n_f32(c3, b3, a[0]);
        }
    }
    else if (width > 4)
    {
        // Only the first 8 columns are accumulated.
        for (int p = 0; p < np; p++, a++, b += convNR)
        {
            float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4);

            c0 = vmlaq_n_f32(c0, b0, a[0]);
            c1 = vmlaq_n_f32(c1, b1, a[0]);
        }
    }
    else
    {
        // Only the first 4 columns are accumulated.
        for (int p = 0; p < np; p++, a++, b += convNR)
        {
            float32x4_t b0 = vld1q_f32(b);
            c0 = vmlaq_n_f32(c0, b0, a[0]);
        }
    }

    if (init_c)
    {
        // Use the explicit intrinsic rather than operator+= on NEON vector types:
        // the operator form is a GCC/Clang extension and is not accepted by every compiler.
        c0 = vaddq_f32(c0, vld1q_f32(c));
        c1 = vaddq_f32(c1, vld1q_f32(c + 4));
        c2 = vaddq_f32(c2, vld1q_f32(c + 8));
        c3 = vaddq_f32(c3, vld1q_f32(c + 12));
        c4 = vaddq_f32(c4, vld1q_f32(c + 16));
        c5 = vaddq_f32(c5, vld1q_f32(c + 20));
        c6 = vaddq_f32(c6, vld1q_f32(c + 24));
    }

    if (ifMinMaxAct)
    {
        float32x4_t v_minval = vdupq_n_f32(minval), v_maxval = vdupq_n_f32(maxval);

        c0 = vminq_f32(vmaxq_f32(c0, v_minval), v_maxval);
        c1 = vminq_f32(vmaxq_f32(c1, v_minval), v_maxval);
        c2 = vminq_f32(vmaxq_f32(c2, v_minval), v_maxval);
        c3 = vminq_f32(vmaxq_f32(c3, v_minval), v_maxval);
        c4 = vminq_f32(vmaxq_f32(c4, v_minval), v_maxval);
        c5 = vminq_f32(vmaxq_f32(c5, v_minval), v_maxval);
        c6 = vminq_f32(vmaxq_f32(c6, v_minval), v_maxval);
    }

    // The whole 28-float row is stored, including columns beyond `width`.
    vst1q_f32(c, c0);
    vst1q_f32(c + 4, c1);
    vst1q_f32(c + 8, c2);
    vst1q_f32(c + 12, c3);
    vst1q_f32(c + 16, c4);
    vst1q_f32(c + 20, c5);
    vst1q_f32(c + 24, c6);
}
|
||||
|
||||
#if CV_NEON_AARCH64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
// Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h.
|
||||
typedef __fp16 float16_t;
|
||||
|
||||
#ifndef __ARM_FEATURE_FMA // Work around without FMA support.
// Fallback for targets without fused multiply-add: evaluates a + b * c as a separate
// multiply and add. Every argument is parenthesized so that compound expressions passed
// as arguments keep their intended precedence after expansion.
#define vfmaq_f16(a, b, c) ((a) + (b) * (c))
#endif
|
||||
/**
 * NEON FP16 micro-kernel computing an 8 x 24 output tile of a convolution block.
 *
 * The byte pointers are reinterpreted as float16_t arrays. For every accumulated
 * column j and row i the kernel computes, in FP16 arithmetic,
 *     sum over p in [0, np) of a[p * convMR_fp16 + i] * b[p * convNR_fp16 + j].
 * When `init_c` is false the values already stored in the tile are added; the full
 * 8 x 24 tile is then written to `_c`.
 *
 * @param np           number of accumulation steps.
 * @param _a           np groups of convMR_fp16 (8) float16 values.
 * @param _b           np rows of convNR_fp16 (24) float16 values.
 * @param _c           in/out float16 tile; row i starts at element i * ldc. All 8 x 24
 *                     elements are always stored, and are also read when `init_c` is false.
 * @param ldc          row stride of the tile, in float16 elements.
 * @param init_c       when false, the previous contents of the tile are added to the result;
 *                     when true, the tile is overwritten.
 * @param width        number of meaningful columns; selects how many 8-lane groups are
 *                     accumulated: > 16 -> all 3, > 8 -> first 2, otherwise first 1.
 * @param convMR_fp16  must be 8.
 * @param convNR_fp16  must be 24.
 */
void convBlock_FP16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c, int width,
                    const int convMR_fp16, const int convNR_fp16)
{
#if 1
    const float16_t* a = (const float16_t*)_a;
    const float16_t* b = (const float16_t*)_b;
    float16_t* c = (float16_t*)_c;

    CV_Assert(convMR_fp16 == 8 && convNR_fp16 == 24);

    // cRC: accumulator for row R (0..7), column group C (0..2, 8 lanes each).
    float16x8_t c00 = vdupq_n_f16(0), c01 = c00, c02 = c00;
    float16x8_t c10 = c00, c11 = c00, c12 = c00;
    float16x8_t c20 = c00, c21 = c00, c22 = c00;
    float16x8_t c30 = c00, c31 = c00, c32 = c00;
    float16x8_t c40 = c00, c41 = c00, c42 = c00;
    float16x8_t c50 = c00, c51 = c00, c52 = c00;
    float16x8_t c60 = c00, c61 = c00, c62 = c00;
    float16x8_t c70 = c00, c71 = c00, c72 = c00;

    if (width > 16)
    {
        // Full tile: all 24 columns.
        for (int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16)
        {
            float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4);
            float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8), b2 = vld1q_f16(b + 16);

            c00 = vfmaq_lane_f16(c00, b0, a0, 0);
            c01 = vfmaq_lane_f16(c01, b1, a0, 0);
            c02 = vfmaq_lane_f16(c02, b2, a0, 0);

            c10 = vfmaq_lane_f16(c10, b0, a0, 1);
            c11 = vfmaq_lane_f16(c11, b1, a0, 1);
            c12 = vfmaq_lane_f16(c12, b2, a0, 1);

            c20 = vfmaq_lane_f16(c20, b0, a0, 2);
            c21 = vfmaq_lane_f16(c21, b1, a0, 2);
            c22 = vfmaq_lane_f16(c22, b2, a0, 2);

            c30 = vfmaq_lane_f16(c30, b0, a0, 3);
            c31 = vfmaq_lane_f16(c31, b1, a0, 3);
            c32 = vfmaq_lane_f16(c32, b2, a0, 3);

            c40 = vfmaq_lane_f16(c40, b0, a1, 0);
            c41 = vfmaq_lane_f16(c41, b1, a1, 0);
            c42 = vfmaq_lane_f16(c42, b2, a1, 0);

            c50 = vfmaq_lane_f16(c50, b0, a1, 1);
            c51 = vfmaq_lane_f16(c51, b1, a1, 1);
            c52 = vfmaq_lane_f16(c52, b2, a1, 1);

            c60 = vfmaq_lane_f16(c60, b0, a1, 2);
            c61 = vfmaq_lane_f16(c61, b1, a1, 2);
            c62 = vfmaq_lane_f16(c62, b2, a1, 2);

            c70 = vfmaq_lane_f16(c70, b0, a1, 3);
            c71 = vfmaq_lane_f16(c71, b1, a1, 3);
            c72 = vfmaq_lane_f16(c72, b2, a1, 3);
        }
    }
    else if (width > 8)
    {
        // Only the first 16 columns are accumulated.
        for (int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16)
        {
            float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4);
            float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8);

            c00 = vfmaq_lane_f16(c00, b0, a0, 0);
            c01 = vfmaq_lane_f16(c01, b1, a0, 0);

            c10 = vfmaq_lane_f16(c10, b0, a0, 1);
            c11 = vfmaq_lane_f16(c11, b1, a0, 1);

            c20 = vfmaq_lane_f16(c20, b0, a0, 2);
            c21 = vfmaq_lane_f16(c21, b1, a0, 2);

            c30 = vfmaq_lane_f16(c30, b0, a0, 3);
            c31 = vfmaq_lane_f16(c31, b1, a0, 3);

            c40 = vfmaq_lane_f16(c40, b0, a1, 0);
            c41 = vfmaq_lane_f16(c41, b1, a1, 0);

            c50 = vfmaq_lane_f16(c50, b0, a1, 1);
            c51 = vfmaq_lane_f16(c51, b1, a1, 1);

            c60 = vfmaq_lane_f16(c60, b0, a1, 2);
            c61 = vfmaq_lane_f16(c61, b1, a1, 2);

            c70 = vfmaq_lane_f16(c70, b0, a1, 3);
            c71 = vfmaq_lane_f16(c71, b1, a1, 3);
        }
    }
    else
    {
        // Only the first 8 columns are accumulated.
        for (int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16)
        {
            float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4);
            float16x8_t b0 = vld1q_f16(b);

            c00 = vfmaq_lane_f16(c00, b0, a0, 0);
            c10 = vfmaq_lane_f16(c10, b0, a0, 1);
            c20 = vfmaq_lane_f16(c20, b0, a0, 2);
            c30 = vfmaq_lane_f16(c30, b0, a0, 3);
            c40 = vfmaq_lane_f16(c40, b0, a1, 0);
            c50 = vfmaq_lane_f16(c50, b0, a1, 1);
            c60 = vfmaq_lane_f16(c60, b0, a1, 2);
            c70 = vfmaq_lane_f16(c70, b0, a1, 3);
        }
    }

    if (!init_c)
    {
        // Add the previous tile contents. The explicit intrinsic is used instead of
        // operator+ on vector types, matching the intrinsic style of the rest of the kernel.
#undef _FX_UPDATE_CBUF_ROW
#define _FX_UPDATE_CBUF_ROW(row) \
        c##row##0 = vaddq_f16(c##row##0, vld1q_f16(c + row*ldc)); \
        c##row##1 = vaddq_f16(c##row##1, vld1q_f16(c + row*ldc + 8)); \
        c##row##2 = vaddq_f16(c##row##2, vld1q_f16(c + row*ldc + 16))

        _FX_UPDATE_CBUF_ROW(0);
        _FX_UPDATE_CBUF_ROW(1);
        _FX_UPDATE_CBUF_ROW(2);
        _FX_UPDATE_CBUF_ROW(3);
        _FX_UPDATE_CBUF_ROW(4);
        _FX_UPDATE_CBUF_ROW(5);
        _FX_UPDATE_CBUF_ROW(6);
        _FX_UPDATE_CBUF_ROW(7);
    }

    // The whole 8 x 24 tile is stored, including columns beyond `width`.
#undef _FX_STORE_CBUF_ROW
#define _FX_STORE_CBUF_ROW(row) \
    vst1q_f16(c + row*ldc, c##row##0); \
    vst1q_f16(c + row*ldc + 8, c##row##1); \
    vst1q_f16(c + row*ldc + 16, c##row##2)

    _FX_STORE_CBUF_ROW(0);
    _FX_STORE_CBUF_ROW(1);
    _FX_STORE_CBUF_ROW(2);
    _FX_STORE_CBUF_ROW(3);
    _FX_STORE_CBUF_ROW(4);
    _FX_STORE_CBUF_ROW(5);
    _FX_STORE_CBUF_ROW(6);
    _FX_STORE_CBUF_ROW(7);
#else
    // reference only.
    // Scalar version kept for readability; it is not compiled (see the `#if 1` above)
    // and, unlike the vector path, accumulates in float before converting to float16.
    const float16_t* a = (const float16_t*)_a;
    const float16_t* b = (const float16_t*)_b;
    float16_t* c = (float16_t*)_c;
    float cbuf[convMR_fp16*convNR_fp16];
    memset(cbuf, 0, sizeof(cbuf));

    for( int p = 0; p < np; p++ )
    {
        for( int i = 0; i < convMR_fp16; i++ )
        {
            float ai = float(a[convMR_fp16*p + i]);
            for( int j = 0; j < convNR_fp16; j++ )
                cbuf[i*convNR_fp16+j] += float(b[convNR_fp16*p + j]) * ai;
        }
    }

    if (!init_c)
    {
        for(int i = 0; i < convMR_fp16; i++)
        {
            for(int j = 0; j < convNR_fp16; j++)
                c[i*ldc + j] = float16_t(float(c[i*ldc + j]) + cbuf[i*convNR_fp16 + j]);
        }
    }
    else
    {
        for(int i = 0; i < convMR_fp16; i++)
        {
            for(int j = 0; j < convNR_fp16; j++)
                c[i*ldc + j] = (float16_t)(cbuf[i*convNR_fp16 + j]);
        }
    }
#endif
}
|
||||
|
||||
void convBlockMR1_FP16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c,
|
||||
const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16)
|
||||
{
|
||||
CV_Assert(convNR_FP16 == 24); // CONV_NR_FP16 = 24
|
||||
const float16_t* a = (const float16_t*)_a;
|
||||
const float16_t* b = (const float16_t*)_b;
|
||||
|
||||
const float16_t bias = (float16_t)_bias;
|
||||
|
||||
float16x8_t c0 = vdupq_n_f16(bias), c1 = c0, c2 = c0;
|
||||
|
||||
if (width > 16)
|
||||
{
|
||||
for (int p = 0; p < np; p++, a++, b += convNR_FP16)
|
||||
{
|
||||
float16x8_t a0= vdupq_n_f16(a[0]);
|
||||
float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8), b2 = vld1q_f16(b + 16);
|
||||
|
||||
c0 = vfmaq_f16(c0, a0, b0);
|
||||
c1 = vfmaq_f16(c1, a0, b1);
|
||||
c2 = vfmaq_f16(c2, a0, b2);
|
||||
}
|
||||
}
|
||||
else if (width > 8)
|
||||
{
|
||||
for (int p = 0; p < np; p++, a++, b += convNR_FP16)
|
||||
{
|
||||
float16x8_t a0= vdupq_n_f16(a[0]);
|
||||
float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8);
|
||||
|
||||
c0 = vfmaq_f16(c0, a0, b0);
|
||||
c1 = vfmaq_f16(c1, a0, b1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int p = 0; p < np; p++, a++, b += convNR_FP16)
|
||||
{
|
||||
float16x8_t a0= vdupq_n_f16(a[0]);
|
||||
float16x8_t b0 = vld1q_f16(b);
|
||||
|
||||
c0 = vfmaq_f16(c0, a0, b0);
|
||||
}
|
||||
}
|
||||
|
||||
// convert FP 16 to FP 32.
|
||||
float32x4_t c00 = vcvt_f32_f16(vget_low_f16(c0));
|
||||
float32x4_t c01 = vcvt_f32_f16(vget_high_f16(c0));
|
||||
float32x4_t c10 = vcvt_f32_f16(vget_low_f16(c1));
|
||||
float32x4_t c11 = vcvt_f32_f16(vget_high_f16(c1));
|
||||
float32x4_t c20 = vcvt_f32_f16(vget_low_f16(c2));
|
||||
float32x4_t c21 = vcvt_f32_f16(vget_high_f16(c2));
|
||||
|
||||
if (init_c)
|
||||
{
|
||||
c00 += vld1q_f32(c);
|
||||
c01 += vld1q_f32(c + 4);
|
||||
c10 += vld1q_f32(c + 8);
|
||||
c11 += vld1q_f32(c + 12);
|
||||
c20 += vld1q_f32(c + 16);
|
||||
c21 += vld1q_f32(c + 20);
|
||||
}
|
||||
|
||||
if (ifMinMaxAct)
|
||||
{
|
||||
float32x4_t v_minval = vdupq_n_f32(minval), v_maxval = vdupq_n_f32(maxval);
|
||||
|
||||
c00 = vminq_f32(vmaxq_f32(c00, v_minval), v_maxval);
|
||||
c01 = vminq_f32(vmaxq_f32(c01, v_minval), v_maxval);
|
||||
c10 = vminq_f32(vmaxq_f32(c10, v_minval), v_maxval);
|
||||
c11 = vminq_f32(vmaxq_f32(c11, v_minval), v_maxval);
|
||||
c20 = vminq_f32(vmaxq_f32(c20, v_minval), v_maxval);
|
||||
c21 = vminq_f32(vmaxq_f32(c21, v_minval), v_maxval);
|
||||
}
|
||||
|
||||
vst1q_f32(c, c00);
|
||||
vst1q_f32(c + 4, c01);
|
||||
vst1q_f32(c + 8, c10);
|
||||
vst1q_f32(c + 12, c11);
|
||||
vst1q_f32(c + 16, c20);
|
||||
vst1q_f32(c + 20, c21);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
}} // namespace cv::dnn
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -10,14 +10,27 @@
|
||||
#ifndef CONV_PRAM
|
||||
#define CONV_PRAM
|
||||
#if CV_NEON && CV_NEON_AARCH64 // 32 registers.
|
||||
#define CONV_MR 4
|
||||
#define CONV_NR 28
|
||||
#define CONV_MR_FP32 4
|
||||
#define CONV_NR_FP32 28
|
||||
|
||||
// FP16 is only supported on ARM64 targets that also provide FP16 FMA.
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // check FP16 FMA.
|
||||
#define CONV_ARM_FP16 1
|
||||
#endif
|
||||
|
||||
#ifdef CONV_ARM_FP16
|
||||
// Currently, only ARM64 supports FP16.
|
||||
#define CONV_MR_FP16 8
|
||||
#define CONV_NR_FP16 24
|
||||
typedef __fp16 float16_t; // Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h.
|
||||
#endif
|
||||
|
||||
#elif CV_NEON // 16 registers.
|
||||
#define CONV_MR 4
|
||||
#define CONV_NR 12
|
||||
#define CONV_MR_FP32 4
|
||||
#define CONV_NR_FP32 12
|
||||
#else // SIMD 128, AVX or AVX2
|
||||
#define CONV_MR 4
|
||||
#define CONV_NR 24
|
||||
#define CONV_MR_FP32 4
|
||||
#define CONV_NR_FP32 24
|
||||
#endif
|
||||
|
||||
// Winograd Params
|
||||
@ -41,6 +54,10 @@ enum {
|
||||
#endif
|
||||
|
||||
CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
|
||||
|
||||
// FP 16
|
||||
CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2,
|
||||
CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16,
|
||||
};
|
||||
|
||||
// NOTE: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv; all other depthwise convs are set to CONV_TYPE_DEPTHWISE_REMAIN.
|
||||
@ -64,8 +81,17 @@ struct FastConv
|
||||
std::vector<float> weightsWinoBuf; // For Winograd F(6x6, 3x3).
|
||||
float* weightsWinoBufPtr;
|
||||
std::vector<float> biasBuf;
|
||||
|
||||
#if CV_NEON && CV_NEON_AARCH64 && CV_FP16
|
||||
std::vector<float16_t> weightsBuf_FP16;
|
||||
float16_t* weightsBufPtr_FP16;
|
||||
std::vector<float16_t> weightsWinoBuf_FP16;
|
||||
float16_t* weightsWinoBufPtr_FP16;
|
||||
#endif
|
||||
|
||||
int conv_type;
|
||||
int conv_dim; // Flag for conv1d, conv2d, or conv3d.
|
||||
bool useFP16 = false; // Only ARMv8 is supported.
|
||||
#if CV_SIMD128
|
||||
bool useSIMD128 = true;
|
||||
#else
|
||||
@ -95,6 +121,7 @@ Ptr<FastConv> initFastConv(
|
||||
const std::vector<size_t>& pads_begin,
|
||||
const std::vector<size_t>& pads_end,
|
||||
int conv_dim,
|
||||
const bool useFP16,
|
||||
bool useWinograd);
|
||||
|
||||
// It contains different computing branches, like winograd, 1x1 conv.
|
||||
|
@ -215,7 +215,7 @@ public:
|
||||
if (backendId == DNN_BACKEND_OPENCV)
|
||||
{
|
||||
if (kernel_size.size() == 3)
|
||||
return preferableTarget == DNN_TARGET_CPU;
|
||||
return IS_DNN_CPU_TARGET(preferableTarget);
|
||||
if (kernel_size.size() <= 2)
|
||||
return true;
|
||||
else
|
||||
|
@ -98,6 +98,7 @@ void Net::Impl::validateBackendAndTarget()
|
||||
|
||||
CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
|
||||
preferableTarget == DNN_TARGET_CPU ||
|
||||
preferableTarget == DNN_TARGET_CPU_FP16 ||
|
||||
preferableTarget == DNN_TARGET_OPENCL ||
|
||||
preferableTarget == DNN_TARGET_OPENCL_FP16);
|
||||
CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
|
||||
@ -972,7 +973,8 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam
|
||||
}
|
||||
else if (outputBlobs.isMatVector())
|
||||
{
|
||||
if (preferableTarget != DNN_TARGET_CPU)
|
||||
// DNN_TARGET_CPU and DNN_TARGET_CPU_FP16 both use CPU memory, so they do not need copyToHost.
|
||||
if (preferableTarget != DNN_TARGET_CPU && preferableTarget != DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
|
||||
{
|
||||
@ -1336,7 +1338,7 @@ Mat Net::Impl::getBlob(const LayerPin& pin) const
|
||||
"the #%d was requested",
|
||||
ld.name.c_str(), ld.outputBlobs.size(), pin.oid));
|
||||
}
|
||||
if (preferableTarget != DNN_TARGET_CPU)
|
||||
if (preferableTarget != DNN_TARGET_CPU && preferableTarget != DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
|
||||
// Transfer data to CPU if it's required.
|
||||
@ -1552,7 +1554,7 @@ string Net::Impl::dump(bool forceAllocation) const
|
||||
prevNode = itBackend->second;
|
||||
}
|
||||
}
|
||||
std::vector<string> colors = { "#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151", "#b266ff", "#b266ff", "#3cb371"};
|
||||
std::vector<string> colors = { "#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151", "#b266ff", "#b266ff", "#3cb371", "#ffcab3"};
|
||||
string backend;
|
||||
switch (prefBackend)
|
||||
{
|
||||
@ -1755,6 +1757,10 @@ string Net::Impl::dump(bool forceAllocation) const
|
||||
out << "NPU";
|
||||
colorId = 9;
|
||||
break;
|
||||
case DNN_TARGET_CPU_FP16:
|
||||
out << "CPU_FP16";
|
||||
colorId = 10;
|
||||
break;
|
||||
// don't use default:
|
||||
}
|
||||
CV_Assert(colorId < colors.size());
|
||||
|
@ -17,7 +17,8 @@ CV__DNN_INLINE_NS_BEGIN
|
||||
|
||||
Ptr<BackendWrapper> Net::Impl::wrap(Mat& host)
|
||||
{
|
||||
if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
|
||||
if (preferableBackend == DNN_BACKEND_OPENCV &&
|
||||
(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_CPU_FP16))
|
||||
return Ptr<BackendWrapper>();
|
||||
|
||||
MatShape shape(host.dims);
|
||||
@ -104,7 +105,7 @@ void Net::Impl::initBackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
CV_TRACE_FUNCTION();
|
||||
if (preferableBackend == DNN_BACKEND_OPENCV)
|
||||
{
|
||||
CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
|
||||
CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_CPU_FP16 || IS_DNN_OPENCL_TARGET(preferableTarget));
|
||||
}
|
||||
else if (preferableBackend == DNN_BACKEND_HALIDE)
|
||||
{
|
||||
@ -232,6 +233,15 @@ void Net::Impl::setPreferableTarget(int targetId)
|
||||
preferableTarget = DNN_TARGET_OPENCL;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !defined(__arm64__) || !__arm64__
|
||||
if (targetId == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
CV_LOG_WARNING(NULL, "DNN: fall back to DNN_TARGET_CPU. Only ARM v8 CPU is supported by DNN_TARGET_CPU_FP16.");
|
||||
targetId = DNN_TARGET_CPU;
|
||||
}
|
||||
#endif
|
||||
|
||||
clear();
|
||||
}
|
||||
}
|
||||
|
@ -61,6 +61,11 @@ private:
|
||||
}
|
||||
#endif
|
||||
|
||||
bool haveBackendCPU_FP16 = false;
|
||||
#if defined(__arm64__) && __arm64__
|
||||
haveBackendCPU_FP16 = true;
|
||||
#endif
|
||||
|
||||
if (haveBackendOpenVINO && openvino::checkTarget(DNN_TARGET_CPU))
|
||||
{
|
||||
backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU));
|
||||
@ -104,6 +109,9 @@ private:
|
||||
|
||||
backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
|
||||
|
||||
if (haveBackendCPU_FP16)
|
||||
backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU_FP16));
|
||||
|
||||
#ifdef HAVE_VULKAN
|
||||
if (haveVulkan())
|
||||
backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN));
|
||||
|
@ -175,6 +175,8 @@ TEST_P(DNNTestNetwork, ENet)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution",
|
||||
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" :
|
||||
"dnn/halide_scheduler_enet.yml",
|
||||
@ -189,7 +191,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE);
|
||||
Mat sample = imread(findDataFile("dnn/street.png"));
|
||||
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
|
||||
float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 1.5e-2 : 0.0;
|
||||
float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 1.5e-2 : 0.0;
|
||||
float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063 : 0.0;
|
||||
float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262 : FLT_MIN;
|
||||
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
|
||||
@ -225,7 +227,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
|
||||
Mat sample = imread(findDataFile("dnn/street.png"));
|
||||
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 560), Scalar(127.5, 127.5, 127.5), false);
|
||||
float scoreDiff = 0.0, iouDiff = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.029;
|
||||
iouDiff = 0.09;
|
||||
@ -242,7 +244,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
|
||||
|
||||
TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
|
||||
{
|
||||
applyTestTag(target == DNN_TARGET_CPU ? "" : CV_TEST_TAG_MEMORY_512MB);
|
||||
applyTestTag((target == DNN_TARGET_CPU || target == DNN_TARGET_CPU_FP16) ? "" : CV_TEST_TAG_MEMORY_512MB);
|
||||
if (backend == DNN_BACKEND_HALIDE)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE);
|
||||
|
||||
@ -250,7 +252,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
|
||||
Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
|
||||
float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.216 : 0.2;
|
||||
float scoreDiff = 0.0, iouDiff = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.095;
|
||||
iouDiff = 0.09;
|
||||
@ -282,7 +284,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow_Different_Width_Height)
|
||||
Mat sample = imread(findDataFile("dnn/street.png"));
|
||||
Mat inp = blobFromImage(sample, 1.0f, Size(300, 560), Scalar(), false);
|
||||
float scoreDiff = 0.0, iouDiff = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.013;
|
||||
iouDiff = 0.06;
|
||||
@ -306,7 +308,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
|
||||
Mat sample = imread(findDataFile("dnn/street.png"));
|
||||
Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
|
||||
float scoreDiff = 2e-5, iouDiff = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.013;
|
||||
iouDiff = 0.062;
|
||||
@ -332,7 +334,7 @@ TEST_P(DNNTestNetwork, SSD_VGG16)
|
||||
Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
|
||||
|
||||
float scoreDiff = 0.0, iouDiff = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.04;
|
||||
}
|
||||
@ -387,7 +389,7 @@ TEST_P(DNNTestNetwork, OpenPose_pose_mpi)
|
||||
|
||||
// output range: [-0.001, 0.97]
|
||||
const float l1 = (target == DNN_TARGET_MYRIAD) ? 0.02 : 0.0;
|
||||
const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.2 : 0.0;
|
||||
const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) ? 0.2 : 0.0;
|
||||
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt",
|
||||
Size(46, 46), "", "", l1, lInf);
|
||||
expectNoFallbacksFromIE(net);
|
||||
@ -461,7 +463,7 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
|
||||
Mat sample = imread(findDataFile("dnn/street.png"));
|
||||
Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
|
||||
float scoreDiff = 0.0, iouDiff = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.02;
|
||||
iouDiff = 0.1;
|
||||
@ -483,7 +485,7 @@ TEST_P(DNNTestNetwork, DenseNet_121)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE);
|
||||
// Reference output values are in range [-3.807, 4.605]
|
||||
float l1 = 0.0, lInf = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 2e-2;
|
||||
lInf = 9e-2;
|
||||
@ -538,6 +540,11 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
|
||||
l1 = 0.3;
|
||||
lInf = 7.6;
|
||||
}
|
||||
else if (target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 0.4;
|
||||
lInf = 19.;
|
||||
}
|
||||
|
||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
|
||||
|
@ -153,7 +153,7 @@ TEST_P(Test_Caffe_nets, Axpy)
|
||||
}
|
||||
}
|
||||
float l1 = 1e-5, lInf = 1e-4;
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 2e-4;
|
||||
lInf = 1e-3;
|
||||
@ -180,7 +180,7 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
|
||||
#else
|
||||
applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
|
||||
#endif
|
||||
ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU);
|
||||
ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16);
|
||||
|
||||
bool readFromMemory = get<0>(GetParam());
|
||||
Net net;
|
||||
@ -214,7 +214,7 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
|
||||
ASSERT_EQ(inLayerShapes[0][3], 227);
|
||||
|
||||
const float l1 = 1e-5;
|
||||
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-3 : 1e-4;
|
||||
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 4e-3 : 1e-4;
|
||||
|
||||
net.setPreferableBackend(DNN_BACKEND_OPENCV);
|
||||
net.setPreferableTarget(targetId);
|
||||
@ -308,7 +308,7 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
|
||||
ASSERT_EQ(out.size[2], 100);
|
||||
|
||||
float scores_diff = 1e-5, boxes_iou_diff = 1e-4;
|
||||
if (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD)
|
||||
if (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD || targetId == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scores_diff = 1.5e-2;
|
||||
boxes_iou_diff = 6.3e-2;
|
||||
@ -375,7 +375,7 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
|
||||
{
|
||||
Target targetId = GetParam();
|
||||
applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
|
||||
ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU);
|
||||
ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16);
|
||||
|
||||
Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt"),
|
||||
findDataFile("dnn/ResNet-50-model.caffemodel", false));
|
||||
@ -383,8 +383,8 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
|
||||
net.setPreferableBackend(DNN_BACKEND_OPENCV);
|
||||
net.setPreferableTarget(targetId);
|
||||
|
||||
float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-5 : 1e-5;
|
||||
float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 6e-3 : 1e-4;
|
||||
float l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 3e-5 : 1e-5;
|
||||
float lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 6e-3 : 1e-4;
|
||||
|
||||
Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false);
|
||||
ASSERT_TRUE(!input.empty());
|
||||
@ -415,6 +415,8 @@ TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy)
|
||||
int targetId = GetParam();
|
||||
if(targetId == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if(targetId == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt"),
|
||||
findDataFile("dnn/squeezenet_v1.1.caffemodel", false));
|
||||
net.setPreferableBackend(DNN_BACKEND_OPENCV);
|
||||
@ -509,7 +511,7 @@ TEST_P(Test_Caffe_nets, Colorization)
|
||||
|
||||
// Reference output values are in range [-29.1, 69.5]
|
||||
double l1 = 4e-4, lInf = 3e-3;
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 0.25;
|
||||
lInf = 5.3;
|
||||
@ -566,7 +568,7 @@ TEST_P(Test_Caffe_nets, DenseNet_121)
|
||||
{
|
||||
l1 = 0.11; lInf = 0.5;
|
||||
}
|
||||
else if (target == DNN_TARGET_CUDA_FP16)
|
||||
else if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 0.04; lInf = 0.2;
|
||||
}
|
||||
@ -635,6 +637,8 @@ TEST_P(opencv_face_detector, Accuracy)
|
||||
|
||||
if (targetId == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (targetId == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
|
||||
Net net = readNetFromCaffe(proto, model);
|
||||
Mat img = imread(findDataFile("gpu/lbpcascade/er.png"));
|
||||
@ -665,6 +669,8 @@ TEST_P(opencv_face_detector, issue_15106)
|
||||
|
||||
if (targetId == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (targetId == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
|
||||
Net net = readNetFromCaffe(proto, model);
|
||||
Mat img = imread(findDataFile("cv/shared/lena.png"));
|
||||
@ -768,6 +774,8 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
|
||||
if (target == DNN_TARGET_CUDA_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
|
||||
0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
|
||||
0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
|
||||
@ -783,7 +791,7 @@ TEST_P(Test_Caffe_nets, RFCN)
|
||||
);
|
||||
|
||||
float scoreDiff = default_l1, iouDiff = default_lInf;
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
|
||||
if (backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16))
|
||||
{
|
||||
scoreDiff = 4e-3;
|
||||
iouDiff = 8e-2;
|
||||
|
@ -21,6 +21,7 @@
|
||||
#define CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND "dnn_skip_opencv_backend"
|
||||
#define CV_TEST_TAG_DNN_SKIP_HALIDE "dnn_skip_halide"
|
||||
#define CV_TEST_TAG_DNN_SKIP_CPU "dnn_skip_cpu"
|
||||
#define CV_TEST_TAG_DNN_SKIP_CPU_FP16 "dnn_skip_cpu_fp16"
|
||||
#define CV_TEST_TAG_DNN_SKIP_OPENCL "dnn_skip_ocl"
|
||||
#define CV_TEST_TAG_DNN_SKIP_OPENCL_FP16 "dnn_skip_ocl_fp16"
|
||||
#define CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER "dnn_skip_ie_nn_builder"
|
||||
@ -164,7 +165,7 @@ public:
|
||||
|
||||
static void getDefaultThresholds(int backend, int target, double* l1, double* lInf)
|
||||
{
|
||||
if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
{
|
||||
*l1 = 4e-3;
|
||||
*lInf = 2e-2;
|
||||
|
@ -49,6 +49,7 @@ void PrintTo(const cv::dnn::Target& v, std::ostream* os)
|
||||
case DNN_TARGET_CUDA: *os << "CUDA"; return;
|
||||
case DNN_TARGET_CUDA_FP16: *os << "CUDA_FP16"; return;
|
||||
case DNN_TARGET_NPU: *os << "NPU"; return;
|
||||
case DNN_TARGET_CPU_FP16: *os << "CPU_FP16"; return;
|
||||
} // don't use "default:" to emit compiler warnings
|
||||
*os << "DNN_TARGET_UNKNOWN(" << (int)v << ")";
|
||||
}
|
||||
@ -439,7 +440,7 @@ void initDNNTests()
|
||||
|
||||
registerGlobalSkipTag(
|
||||
CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND,
|
||||
CV_TEST_TAG_DNN_SKIP_CPU,
|
||||
CV_TEST_TAG_DNN_SKIP_CPU, CV_TEST_TAG_DNN_SKIP_CPU_FP16,
|
||||
CV_TEST_TAG_DNN_SKIP_OPENCL, CV_TEST_TAG_DNN_SKIP_OPENCL_FP16
|
||||
);
|
||||
#if defined(HAVE_HALIDE)
|
||||
|
@ -360,9 +360,9 @@ TEST_P(Test_Darknet_nets, YoloVoc)
|
||||
1, 6, 0.667770f, 0.446555f, 0.453578f, 0.499986f, 0.519167f, // a car
|
||||
1, 6, 0.844947f, 0.637058f, 0.460398f, 0.828508f, 0.66427f); // a car
|
||||
|
||||
double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.397 : 0.4;
|
||||
double nmsThreshold = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.397 : 0.4;
|
||||
double scoreDiff = 8e-5, iouDiff = 3e-4;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 1e-2;
|
||||
iouDiff = 0.018;
|
||||
@ -451,7 +451,7 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc)
|
||||
1, 6, 0.928758f, 0.651024f, 0.463539f, 0.823784f, 0.654998f); // a car
|
||||
|
||||
double scoreDiff = 8e-5, iouDiff = 3e-4;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 8e-3;
|
||||
iouDiff = 0.018;
|
||||
@ -636,7 +636,7 @@ TEST_P(Test_Darknet_nets, YOLOv3)
|
||||
Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
|
||||
|
||||
double scoreDiff = 8e-5, iouDiff = 3e-4;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2022010000)
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
|
||||
@ -725,8 +725,8 @@ TEST_P(Test_Darknet_nets, YOLOv4)
|
||||
};
|
||||
Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
|
||||
|
||||
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.006 : 8e-5;
|
||||
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.042 : 3e-4;
|
||||
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.006 : 8e-5;
|
||||
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.042 : 3e-4;
|
||||
if (target == DNN_TARGET_CUDA_FP16)
|
||||
{
|
||||
scoreDiff = 0.008;
|
||||
@ -847,7 +847,7 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
|
||||
Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
|
||||
|
||||
double scoreDiff = 0.012f;
|
||||
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.15 : 0.01f;
|
||||
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.15 : 0.01f;
|
||||
if (target == DNN_TARGET_CUDA_FP16)
|
||||
iouDiff = 0.02;
|
||||
|
||||
@ -930,7 +930,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
|
||||
double scoreDiff = 8e-5;
|
||||
double iouDiff = 3e-4;
|
||||
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.006;
|
||||
iouDiff = 0.042;
|
||||
@ -1093,6 +1093,8 @@ TEST_P(Test_Darknet_layers, connected)
|
||||
{
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
testDarknetLayer("connected", true);
|
||||
}
|
||||
|
||||
|
@ -58,6 +58,8 @@ TEST_P(Reproducibility_GoogLeNet, Batching)
|
||||
const int targetId = GetParam();
|
||||
if (targetId == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (targetId == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
|
||||
findDataFile("dnn/bvlc_googlenet.caffemodel", false));
|
||||
net.setPreferableBackend(DNN_BACKEND_OPENCV);
|
||||
@ -89,6 +91,8 @@ TEST_P(Reproducibility_GoogLeNet, IntermediateBlobs)
|
||||
const int targetId = GetParam();
|
||||
if (targetId == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (targetId == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
|
||||
findDataFile("dnn/bvlc_googlenet.caffemodel", false));
|
||||
net.setPreferableBackend(DNN_BACKEND_OPENCV);
|
||||
@ -120,6 +124,8 @@ TEST_P(Reproducibility_GoogLeNet, SeveralCalls)
|
||||
const int targetId = GetParam();
|
||||
if (targetId == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (targetId == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
|
||||
findDataFile("dnn/bvlc_googlenet.caffemodel", false));
|
||||
net.setPreferableBackend(DNN_BACKEND_OPENCV);
|
||||
|
@ -212,6 +212,8 @@ TEST_P(Test_Caffe_layers, InnerProduct)
|
||||
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
|
||||
testLayerUsingCaffeModels("layer_inner_product", true);
|
||||
}
|
||||
@ -378,7 +380,7 @@ TEST_P(Test_Caffe_layers, Eltwise)
|
||||
|
||||
TEST_P(Test_Caffe_layers, PReLU)
|
||||
{
|
||||
double lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.021 : 0.0;
|
||||
double lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) ? 0.021 : 0.0;
|
||||
testLayerUsingCaffeModels("layer_prelu", true, true, 0.0, lInf);
|
||||
}
|
||||
|
||||
@ -2459,7 +2461,7 @@ TEST_P(ConvolutionActivationFusion, Accuracy)
|
||||
std::vector<int> expectedFusedLayers;
|
||||
if (backendId == DNN_BACKEND_OPENCV)
|
||||
{
|
||||
if (targetId == DNN_TARGET_CPU)
|
||||
if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
|
||||
expectedFusedLayers.push_back(activId); // all activations are fused
|
||||
else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
@ -2594,7 +2596,7 @@ TEST_P(ConvolutionEltwiseActivationFusion, Accuracy)
|
||||
std::vector<int> expectedFusedLayers;
|
||||
if (backendId == DNN_BACKEND_OPENCV)
|
||||
{
|
||||
if (targetId == DNN_TARGET_CPU)
|
||||
if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
|
||||
expectedFusedLayers.push_back(activId); // activation is fused with eltwise layer
|
||||
else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
@ -2683,7 +2685,7 @@ TEST_P(ConvolutionActivationEltwiseFusion, Accuracy)
|
||||
std::vector<int> expectedFusedLayers;
|
||||
if (backendId == DNN_BACKEND_OPENCV)
|
||||
{
|
||||
if (targetId == DNN_TARGET_CPU)
|
||||
if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
|
||||
expectedFusedLayers.push_back(activId); // activation fused with convolution
|
||||
else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
|
@ -332,7 +332,7 @@ TEST_P(Test_Model, DetectRegion)
|
||||
double confThreshold = 0.24;
|
||||
double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.397 : 0.4;
|
||||
double scoreDiff = 8e-5, iouDiff = 1e-5;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 1e-2;
|
||||
iouDiff = 1.6e-2;
|
||||
@ -392,7 +392,7 @@ TEST_P(Test_Model, DetectRegionWithNmsAcrossClasses)
|
||||
double confThreshold = 0.24;
|
||||
double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.15: 0.15;
|
||||
double scoreDiff = 8e-5, iouDiff = 1e-5;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 1e-2;
|
||||
iouDiff = 1.6e-2;
|
||||
@ -443,7 +443,7 @@ TEST_P(Test_Model, DetectionOutput)
|
||||
double scoreDiff = default_l1, iouDiff = 1e-5;
|
||||
float confThreshold = 0.8;
|
||||
double nmsThreshold = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
if (backend == DNN_BACKEND_OPENCV)
|
||||
scoreDiff = 4e-3;
|
||||
@ -495,7 +495,7 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
|
||||
Size size{300, 300};
|
||||
|
||||
double scoreDiff = 1e-5, iouDiff = 1e-5;
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 1.7e-2;
|
||||
iouDiff = 6.91e-2;
|
||||
@ -522,6 +522,8 @@ TEST_P(Test_Model, Keypoints_pose)
|
||||
{
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
#ifdef HAVE_INF_ENGINE
|
||||
if (target == DNN_TARGET_MYRIAD)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
|
||||
@ -569,7 +571,7 @@ TEST_P(Test_Model, Keypoints_face)
|
||||
|
||||
// Ref. Range: [-1.1784188, 1.7758257]
|
||||
float norm = 1e-4;
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
norm = 5e-3;
|
||||
if (target == DNN_TARGET_MYRIAD)
|
||||
{
|
||||
@ -605,7 +607,7 @@ TEST_P(Test_Model, Detection_normalized)
|
||||
scoreDiff = 3e-4;
|
||||
iouDiff = 0.018;
|
||||
}
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 5e-3;
|
||||
iouDiff = 0.09;
|
||||
@ -654,7 +656,7 @@ TEST_P(Test_Model, Segmentation)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
|
||||
#endif
|
||||
|
||||
if ((backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
|
||||
if ((backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16))
|
||||
|| (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16))
|
||||
{
|
||||
norm = 2.0f; // l1 = 0.01 lInf = 2
|
||||
@ -741,6 +743,8 @@ TEST_P(Test_Model, TextDetectionByDB)
|
||||
{
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
|
||||
std::string imgPath = _tf("text_det_test1.png");
|
||||
std::string weightPathDB = _tf("onnx/models/DB_TD500_resnet50.onnx", false);
|
||||
@ -801,7 +805,7 @@ TEST_P(Test_Model, TextDetectionByEAST)
|
||||
double eps_size = 5/*pixels*/;
|
||||
double eps_angle = 1;
|
||||
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
eps_center = 10;
|
||||
eps_size = 25;
|
||||
|
@ -957,7 +957,7 @@ public:
|
||||
backend = get<0>(get<1>(GetParam()));
|
||||
target = get<1>(get<1>(GetParam()));
|
||||
|
||||
if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
default_l1 = 7e-3;
|
||||
default_lInf = 2e-2;
|
||||
|
@ -2179,7 +2179,7 @@ TEST_P(Test_ONNX_nets, TinyYolov2)
|
||||
|
||||
// output range: [-11; 8]
|
||||
double l1 = default_l1, lInf = default_lInf;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 0.02;
|
||||
lInf = 0.2;
|
||||
|
@ -486,6 +486,11 @@ TEST_P(Test_TensorFlow_layers, slim_batch_norm)
|
||||
l1 = 0.005;
|
||||
lInf = 0.33;
|
||||
}
|
||||
else if (target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 0.041;
|
||||
lInf = 0.37;
|
||||
}
|
||||
|
||||
runTensorFlowNet("slim_batch_norm", false, l1, lInf);
|
||||
}
|
||||
@ -710,6 +715,9 @@ TEST_P(Test_TensorFlow_layers, matmul)
|
||||
{
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
|
||||
runTensorFlowNet("matmul");
|
||||
runTensorFlowNet("nhwc_transpose_reshape_matmul");
|
||||
// Reference output values are in range [-5.688, 4.484]
|
||||
@ -723,6 +731,8 @@ TEST_P(Test_TensorFlow_layers, batch_matmul)
|
||||
{
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
runTensorFlowNet("batch_matmul");
|
||||
}
|
||||
|
||||
@ -730,6 +740,8 @@ TEST_P(Test_TensorFlow_layers, square)
|
||||
{
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
runTensorFlowNet("square");
|
||||
}
|
||||
|
||||
@ -924,7 +936,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
|
||||
Mat out = net.forward();
|
||||
|
||||
double scoreDiff = default_l1, iouDiff = default_lInf;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.01;
|
||||
iouDiff = 0.1;
|
||||
@ -971,7 +983,7 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
|
||||
0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
|
||||
|
||||
double scoreDiff = default_l1, iouDiff = default_lInf;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.0097;
|
||||
iouDiff = 0.09;
|
||||
@ -1004,7 +1016,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
|
||||
Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy"));
|
||||
float scoreDiff = 1.5e-5, iouDiff = 1e-3;
|
||||
float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.011;
|
||||
iouDiff = 0.012;
|
||||
@ -1053,6 +1065,8 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_inception_v2_coco_2018_01_28)
|
||||
|
||||
if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
|
||||
checkBackend();
|
||||
|
||||
@ -1085,6 +1099,9 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_inception_v2_coco_2018_01_28)
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
|
||||
normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff);
|
||||
}
|
||||
}
|
||||
@ -1164,6 +1181,9 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_resnet50_coco_2018_01_28)
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
|
||||
normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff);
|
||||
}
|
||||
}
|
||||
@ -1191,7 +1211,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
|
||||
Mat out = net.forward();
|
||||
|
||||
double scoreDiff = 1.1e-5, iouDiff = default_lInf;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 0.048;
|
||||
iouDiff = 0.058;
|
||||
@ -1230,7 +1250,7 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
|
||||
0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
|
||||
0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
|
||||
double scoreDiff = 3.4e-3, iouDiff = 1e-2;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
scoreDiff = 4e-3;
|
||||
iouDiff = 0.024;
|
||||
@ -1317,6 +1337,11 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
|
||||
lInf_scores = 0.1;
|
||||
l1_geometry = 0.3; lInf_geometry = 7;
|
||||
}
|
||||
else if (target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
lInf_scores = 0.1;
|
||||
l1_geometry = 0.28; lInf_geometry = 5.94;
|
||||
}
|
||||
else
|
||||
{
|
||||
l1_geometry = 1e-4, lInf_geometry = 4.3e-3;
|
||||
@ -1360,6 +1385,10 @@ TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_pad_and_concat)
|
||||
TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_padding_valid)
|
||||
{
|
||||
float l1 = 0.00078, lInf = 0.012;
|
||||
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
l1 = 0.00083;
|
||||
|
||||
runTensorFlowNet("fp16_padding_valid", false, l1, lInf);
|
||||
}
|
||||
TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_max_pool_even)
|
||||
@ -1407,8 +1436,13 @@ TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_max_pool_odd_valid)
|
||||
|
||||
TEST_P(Test_TensorFlow_layers, fp16_padding_same)
|
||||
{
|
||||
float l1 = 7e-4, lInf = 4e-3;
|
||||
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
lInf = 5e-3;
|
||||
|
||||
// Reference output values are in range [-3.504, -0.002]
|
||||
runTensorFlowNet("fp16_padding_same", false, 7e-4, 4e-3);
|
||||
runTensorFlowNet("fp16_padding_same", false, l1, lInf);
|
||||
}
|
||||
|
||||
TEST_P(Test_TensorFlow_layers, defun)
|
||||
@ -1450,6 +1484,9 @@ TEST_P(Test_TensorFlow_layers, lstm)
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
|
||||
runTensorFlowNet("lstm", true);
|
||||
runTensorFlowNet("lstm", true, 0.0, 0.0, true);
|
||||
}
|
||||
@ -1771,8 +1808,8 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
|
||||
Mat outDetections = outs[0];
|
||||
Mat outMasks = outs[1];
|
||||
|
||||
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.2 : 2e-5;
|
||||
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.018 : default_lInf;
|
||||
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.2 : 2e-5;
|
||||
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.018 : default_lInf;
|
||||
normAssertDetections(refDetections, outDetections, "", /*threshold for zero confidence*/1e-5, scoreDiff, iouDiff);
|
||||
|
||||
// Output size of masks is NxCxHxW where
|
||||
@ -1805,7 +1842,7 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
|
||||
|
||||
double inter = cv::countNonZero(masks & refMasks);
|
||||
double area = cv::countNonZero(masks | refMasks);
|
||||
EXPECT_GE(inter / area, (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.98 : 0.99);
|
||||
EXPECT_GE(inter / area, (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.98 : 0.99);
|
||||
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
|
||||
expectNoFallbacks(net);
|
||||
@ -1815,6 +1852,7 @@ TEST_P(Test_TensorFlow_nets, EfficientDet)
|
||||
{
|
||||
if (target != DNN_TARGET_CPU)
|
||||
{
|
||||
if (target == DNN_TARGET_CPU_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
|
||||
if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
|
||||
|
@ -113,7 +113,7 @@ TEST_P(Test_Torch_layers, run_convolution)
|
||||
{
|
||||
// Output reference values are in range [23.4018, 72.0181]
|
||||
double l1 = default_l1, lInf = default_lInf;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 0.08;
|
||||
lInf = 0.43;
|
||||
@ -132,6 +132,8 @@ TEST_P(Test_Torch_layers, run_pool_max)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (target == DNN_TARGET_CUDA_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
double l1 = 0.0, lInf = 0.0;
|
||||
runTorchNet("net_pool_max", "", true, false, true, l1, lInf);
|
||||
}
|
||||
@ -158,7 +160,7 @@ TEST_P(Test_Torch_layers, run_reshape_single_sample)
|
||||
{
|
||||
// Reference output values in range [14.4586, 18.4492].
|
||||
double l1 = default_l1, lInf = default_lInf;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 0.033;
|
||||
lInf = 0.05;
|
||||
@ -175,6 +177,8 @@ TEST_P(Test_Torch_layers, run_linear)
|
||||
{
|
||||
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
runTorchNet("net_linear_2d");
|
||||
}
|
||||
|
||||
@ -186,7 +190,7 @@ TEST_P(Test_Torch_layers, run_concat)
|
||||
TEST_P(Test_Torch_layers, run_depth_concat)
|
||||
{
|
||||
double lInf = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
lInf = 0.032;
|
||||
}
|
||||
@ -252,7 +256,7 @@ TEST_P(Test_Torch_layers, net_conv_gemm_lrn)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
|
||||
#endif
|
||||
double l1 = 0.0, lInf = 0.0;
|
||||
if (target == DNN_TARGET_OPENCL_FP16)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 0.046;
|
||||
lInf = 0.023;
|
||||
@ -369,7 +373,7 @@ TEST_P(Test_Torch_nets, OpenFace_accuracy)
|
||||
// Reference output values are in range [-0.17212, 0.263492]
|
||||
// on Myriad problem layer: l4_Pooling - does not use pads_begin
|
||||
float l1 = 1e-5, lInf = 1e-3;
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
|
||||
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
l1 = 2e-3;
|
||||
lInf = 5e-3;
|
||||
@ -431,6 +435,8 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
|
||||
throw SkipTestException("");
|
||||
if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
|
||||
if (target == DNN_TARGET_CPU_FP16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
|
||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020010000)
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
|
||||
@ -562,6 +568,10 @@ TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
|
||||
{
|
||||
normAssert(out, refBlob, "", 0.6, 25);
|
||||
}
|
||||
else if (target == DNN_TARGET_CPU_FP16)
|
||||
{
|
||||
normAssert(out, refBlob, "", 0.62, 25);
|
||||
}
|
||||
else
|
||||
normAssert(out, refBlob, "", 0.5, 1.1);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user