Merge pull request #26348 from fengyuentau:imgproc/remap_opt

imgproc: add new remap kernels that align with the new warpAffine and warpPerspective kernels #26348 ## Performance M2: ``` Geometric mean (ms) Name of Test base patch patch vs base (x-factor) WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.213 0.185 1.15 WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.213 0.187 1.14 WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.417 0.355 1.18 WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.973 0.908 1.07 WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.563 0.507 1.11 WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 3.208 3.165 1.01 WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.244 0.195 1.26 WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.270 0.245 1.10 WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.361 0.328 1.10 WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 1.365 1.273 1.07 WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.532 0.508 1.05 WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 3.651 3.545 1.03 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 0.272 0.097 2.80 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 0.304 0.148 2.06 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 0.271 0.125 2.16 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 0.406 0.178 2.28 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 0.476 0.275 1.73 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 0.354 0.256 1.38 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.382 0.168 2.28 
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 0.555 0.338 1.64 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 0.385 0.307 1.25 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 0.271 0.099 2.75 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 0.301 0.145 2.07 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 0.270 0.120 2.24 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 0.408 0.180 2.27 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 0.474 0.277 1.71 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 0.352 0.261 1.35 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.382 0.166 2.29 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 0.552 0.339 1.63 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 0.380 0.308 1.24 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 1.013 0.474 2.14 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 1.155 0.705 1.64 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 1.200 0.674 1.78 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 1.614 0.986 1.64 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 2.042 1.605 1.27 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 2.275 1.647 1.38 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 1.558 0.847 1.84 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 2.394 2.036 1.18 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 2.693 2.112 1.27 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 0.999 0.463 2.16 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 1.194 0.699 1.71 map1_32fc1::TestRemap::(1920x1080, 
INTER_LINEAR, BORDER_REPLICATE, 32FC1) 1.211 0.677 1.79 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 1.619 1.045 1.55 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 2.039 1.604 1.27 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 2.257 1.657 1.36 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 1.578 0.845 1.87 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 2.405 2.032 1.18 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 2.669 2.107 1.27 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 0.277 0.104 2.66 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 0.310 0.149 2.08 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 0.275 0.122 2.26 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 0.412 0.177 2.33 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 0.479 0.277 1.73 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 0.360 0.253 1.43 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.388 0.173 2.24 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 0.575 0.337 1.71 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 0.387 0.307 1.26 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 0.274 0.100 2.73 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 0.312 0.144 2.16 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 0.278 0.128 2.18 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 0.407 0.178 2.29 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 0.483 0.275 1.75 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 0.358 0.250 1.43 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.389 
0.168 2.31 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 0.563 0.338 1.66 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 0.390 0.312 1.25 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 1.024 0.483 2.12 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 1.224 0.770 1.59 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 1.185 0.674 1.76 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 1.633 0.922 1.77 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 2.042 1.607 1.27 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 2.244 1.647 1.36 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 1.592 0.872 1.83 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 2.473 2.014 1.23 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 2.604 2.127 1.22 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 1.020 0.490 2.08 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 1.193 0.733 1.63 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 1.203 0.694 1.73 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 1.642 0.923 1.78 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 2.055 1.619 1.27 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 2.210 1.658 1.33 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 1.642 0.883 1.86 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 2.463 2.077 1.19 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 2.610 2.152 1.21 ``` Intel i7-12700K: ``` Geometric mean (ms) Name of Test base patch patch vs base (x-factor) WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.146 0.055 2.66 
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.146 0.055 2.65 WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.301 0.138 2.18 WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.490 0.329 1.49 WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.390 0.194 2.01 WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 1.286 1.190 1.08 WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.140 0.058 2.40 WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.157 0.078 2.02 WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.234 0.117 2.01 WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.550 0.472 1.16 WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.334 0.199 1.68 WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 1.361 1.347 1.01 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 0.146 0.046 3.18 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 0.174 0.045 3.88 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 0.150 0.036 4.21 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 0.195 0.120 1.63 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 0.365 0.111 3.29 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 0.217 0.106 2.05 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.177 0.054 3.30 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 0.451 0.143 3.15 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 0.276 0.139 1.98 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 0.142 0.046 3.06 map1_32fc1::TestRemap::(640x480, 
INTER_LINEAR, BORDER_REPLICATE, 16UC1) 0.182 0.045 4.00 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 0.154 0.036 4.31 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 0.196 0.120 1.63 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 0.364 0.111 3.29 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 0.221 0.107 2.07 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.177 0.054 3.31 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 0.488 0.143 3.42 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 0.280 0.139 2.01 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 0.480 0.290 1.66 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 0.698 0.288 2.43 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 0.613 0.322 1.90 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 0.665 0.808 0.82 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 1.522 0.942 1.62 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 2.504 2.204 1.14 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.619 0.376 1.64 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 2.018 1.397 1.44 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 3.582 3.157 1.13 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 0.481 0.293 1.64 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 0.698 0.288 2.42 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 0.606 0.321 1.88 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 0.669 0.806 0.83 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 1.514 0.935 1.62 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, 
BORDER_REPLICATE, 32FC3) 2.472 2.203 1.12 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.618 0.378 1.63 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 1.998 1.404 1.42 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 3.583 3.160 1.13 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 0.153 0.050 3.08 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 0.189 0.048 3.90 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 0.162 0.041 3.91 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 0.211 0.124 1.70 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 0.384 0.113 3.39 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 0.221 0.107 2.07 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.186 0.059 3.17 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 0.465 0.147 3.16 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 0.312 0.140 2.22 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 0.148 0.052 2.88 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 0.189 0.049 3.82 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 0.167 0.041 4.06 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 0.202 0.124 1.63 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 0.383 0.113 3.39 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 0.228 0.106 2.14 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.188 0.058 3.26 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 0.467 0.147 3.17 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 0.286 0.140 2.05 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 0.519 0.311 1.67 
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 0.743 0.307 2.42 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 0.646 0.329 1.96 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 0.714 0.826 0.86 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 1.567 0.939 1.67 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 2.501 2.183 1.15 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 0.670 0.389 1.72 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 2.060 1.384 1.49 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 3.556 3.151 1.13 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 0.517 0.312 1.66 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 0.745 0.306 2.44 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 0.651 0.332 1.96 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 0.731 0.831 0.88 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 1.574 0.934 1.68 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 2.442 2.181 1.12 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 0.666 0.390 1.71 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 2.045 1.391 1.47 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 3.557 3.154 1.13 ``` A311D: ``` Geometric mean (ms) Name of Test base patch patch vs base (x-factor) WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 1.335 0.936 1.43 WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 1.331 0.940 1.42 WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 2.950 2.199 1.34 WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 6.011 5.177 1.16 
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 4.415 3.533 1.25 WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 26.619 17.665 1.51 WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 1.465 1.119 1.31 WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 1.776 1.416 1.25 WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 4.106 2.307 1.78 WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 12.015 7.427 1.62 WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 7.196 4.044 1.78 WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 32.182 29.642 1.09 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 2.358 0.751 3.14 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 3.342 0.847 3.94 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 2.863 0.941 3.04 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 4.062 1.474 2.75 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 4.937 1.681 2.94 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 3.796 2.152 1.76 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 3.838 1.341 2.86 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 5.682 2.288 2.48 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 3.943 3.154 1.25 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 2.346 0.754 3.11 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 3.370 0.849 3.97 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 2.841 0.934 3.04 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 4.244 1.466 2.90 map1_32fc1::TestRemap::(640x480, 
INTER_LINEAR, BORDER_REPLICATE, 16UC3) 4.882 1.680 2.91 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 3.672 2.163 1.70 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 3.822 1.349 2.83 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 5.614 2.291 2.45 map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 3.987 3.174 1.26 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 10.358 4.713 2.20 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 14.165 4.903 2.89 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 11.751 5.648 2.08 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 13.912 6.793 2.05 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 22.706 8.440 2.69 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 16.738 13.517 1.24 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 18.715 9.065 2.06 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 28.190 15.483 1.82 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 17.441 20.976 0.83 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 10.506 4.770 2.20 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 14.298 4.952 2.89 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 11.534 5.669 2.03 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 19.890 9.588 2.07 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 23.599 11.543 2.04 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 16.827 14.255 1.18 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 18.878 9.185 2.06 map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 28.377 15.766 1.80 
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 17.337 21.134 0.82 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 2.170 0.763 2.84 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 3.035 0.959 3.17 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 2.759 0.937 2.94 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC3) 4.074 1.484 2.74 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 4.757 1.689 2.82 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 3.766 2.165 1.74 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 3.730 1.353 2.76 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 5.623 2.301 2.44 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 3.935 3.115 1.26 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 2.236 0.761 2.94 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 3.010 0.946 3.18 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 2.750 0.933 2.95 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 4.045 1.484 2.73 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 4.785 1.694 2.83 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 3.642 2.146 1.70 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 3.710 1.357 2.73 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 5.594 2.310 2.42 map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 3.845 3.120 1.23 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC1) 10.092 4.846 2.08 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC1) 14.501 5.724 2.53 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC1) 11.698 5.709 2.05 map1_32fc2::TestRemap::(1920x1080, 
INTER_LINEAR, BORDER_CONSTANT, 8UC3) 19.480 9.290 2.10 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC3) 23.830 11.636 2.05 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC3) 16.725 13.922 1.20 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 8UC4) 18.756 8.839 2.12 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4) 29.698 15.668 1.90 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4) 17.641 20.145 0.88 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC1) 10.128 4.883 2.07 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC1) 14.438 5.685 2.54 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC1) 11.440 5.674 2.02 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC3) 19.681 10.117 1.95 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC3) 23.757 11.623 2.04 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC3) 16.891 13.690 1.23 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 8UC4) 18.887 8.756 2.16 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4) 29.654 15.890 1.87 map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4) 17.412 20.535 0.85 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [x] The feature is well documented and sample code can be built with the project CMake
2025-08-06 06:26:29 +08:00 · 2024-11-12 02:44:01 +08:00 · 2024-11-12 02:44:01 +08:00 · c445a000c9
commit c445a000c9
parent a4ab68f9f4
10 changed files with 2327 additions and 596 deletions
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@ -1626,6 +1626,24 @@ inline v_int32 v_load_expand_q(const schar* ptr)
    return __riscv_vwcvt_x(__riscv_vwcvt_x(__riscv_vle8_v_i8mf2(ptr, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
 }

+template <int N = VTraits<v_uint32>::max_nlanes> // Partial-load variant: N only supplies the default for the runtime count n.
+inline v_uint32 v_load_expand_q(const uchar* ptr, int n = N) // Reads ptr[0..n) only, then expands the staged bytes to a v_uint32.
+{
+    uchar buf[VTraits<v_uint8>::max_nlanes]; // Staging buffer; n is not range-checked, so callers must keep n within this size.
+    v_store(buf, v_setzero_u8()); // Pre-fill buf from a zero vector (presumably so lanes past n expand to 0 -- confirm the store covers every byte that is loaded back).
+    for (int i = 0; i < n; i++) {
+        buf[i] = ptr[i]; // Byte-wise copy, so ptr is never read at index n or beyond.
+    }
+    return v_load_expand_q(buf); // Expand from the staging buffer, not from ptr (presumably resolves to the existing one-argument uchar overload -- confirm).
+}
+template <> inline v_uint32 v_load_expand_q<4>(const uchar* ptr, int n) // Specialization for N == 4: the count is fixed at four, so n is ignored.
+{
+    uchar buf[VTraits<v_uint8>::max_nlanes]; // Staging buffer for the four source bytes.
+    v_store(buf, v_setzero_u8()); // Pre-fill buf from a zero vector before the copy below.
+    buf[0] = ptr[0]; buf[1] = ptr[1]; buf[2] = ptr[2]; buf[3] = ptr[3]; // Unrolled copy: exactly ptr[0..3] are read.
+    return v_load_expand_q(buf); // Expand from the staging buffer rather than from ptr.
+}
+
 #define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, hwidth, hsuffix, suffix, rshr, shr) \
 inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
 { \
@ -1696,6 +1714,23 @@ void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \
 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, __riscv_vreinterpret_v_i16m4_u16m4, VTraits<v_int16>::vlanes(), VTraits<v_uint8>::vlanes())
 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32,  __riscv_vreinterpret_v_i32m4_u32m4, VTraits<v_int32>::vlanes(), VTraits<v_uint16>::vlanes())

+template <int N = VTraits<v_int16>::max_nlanes> // Partial-store variant: N only supplies the default for the runtime count n.
+inline void v_pack_u_store(uchar* ptr, const v_int16& a, int n = N) // Packs a into a temporary and writes only ptr[0..n).
+{
+    uchar buf[VTraits<v_uint8>::max_nlanes]; // Staging buffer; n is not range-checked against it.
+    v_pack_u_store(buf, a); // Pack into the staging buffer (presumably resolves to the existing two-argument overload -- confirm).
+    for (int i = 0; i < n; i++) {
+        ptr[i] = buf[i]; // Byte-wise copy, so ptr is never written at index n or beyond.
+    }
+}
+template <> inline void v_pack_u_store<8>(uchar* ptr, const v_int16& a, int n) // Specialization for N == 8: the count is fixed at eight, so n is ignored.
+{
+    uchar buf[VTraits<v_uint8>::max_nlanes]; // Staging buffer that receives the packed result.
+    v_pack_u_store(buf, a); // Pack into the staging buffer first; ptr is only touched by the copies below.
+    ptr[0] = buf[0]; ptr[1] = buf[1]; ptr[2] = buf[2]; ptr[3] = buf[3]; // Unrolled copy, first four bytes.
+    ptr[4] = buf[4]; ptr[5] = buf[5]; ptr[6] = buf[6]; ptr[7] = buf[7]; // Unrolled copy, last four bytes: exactly ptr[0..7] are written.
+}
+

 /* void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
  a0 = {A1 A2 A3 A4}
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@ -2474,7 +2474,8 @@ flag #WARP_INVERSE_MAP that means that M is the inverse transformation (
 borderMode=#BORDER_TRANSPARENT, it means that the pixels in the destination image corresponding to
 the "outliers" in the source image are not modified by the function.
@param borderValue value used in case of a constant border; by default, it is 0.
-@param hint Implementation modfication flags. See #AlgorithmHint
+@param hint Implementation modification flags. Set #ALGO_HINT_APPROX to use FP16 precision (if available)
+for linear calculation for faster speed. See #AlgorithmHint.

@sa  warpPerspective, resize, remap, getRectSubPix, transform
 */
@ -2508,7 +2509,8 @@ optional flag #WARP_INVERSE_MAP, that sets M as the inverse transformation (
 \f$\texttt{dst}\rightarrow\texttt{src}\f$ ).
@param borderMode pixel extrapolation method (#BORDER_CONSTANT or #BORDER_REPLICATE).
@param borderValue value used in case of a constant border; by default, it equals 0.
-@param hint Implementation modfication flags. See #AlgorithmHint
+@param hint Implementation modification flags. Set #ALGO_HINT_APPROX to use FP16 precision (if available)
+for linear calculation for faster speed. See #AlgorithmHint.

@sa  warpAffine, resize, remap, getRectSubPix, perspectiveTransform
 */
@ -2554,13 +2556,16 @@ The extra flag WARP_RELATIVE_MAP can be ORed to the interpolation method
 borderMode=#BORDER_TRANSPARENT, it means that the pixels in the destination image that
 corresponds to the "outliers" in the source image are not modified by the function.
@param borderValue Value used in case of a constant border. By default, it is 0.
+@param hint Implementation modification flags. Set #ALGO_HINT_APPROX to use FP16 precision (if available)
+for linear calculation for faster speed. See #AlgorithmHint.
@note
 Due to current implementation limitations the size of an input and output images should be less than 32767x32767.
 */
 CV_EXPORTS_W void remap( InputArray src, OutputArray dst,
                         InputArray map1, InputArray map2,
                         int interpolation, int borderMode = BORDER_CONSTANT,
-                         const Scalar& borderValue = Scalar());
+                         const Scalar& borderValue = Scalar(),
+                         AlgorithmHint hint = cv::ALGO_HINT_DEFAULT);

 /** @brief Converts image transformation maps from one representation to another.

--- a/modules/imgproc/perf/perf_warp.cpp
+++ b/modules/imgproc/perf/perf_warp.cpp
@ -5,19 +5,16 @@

 namespace opencv_test {

-enum{HALF_SIZE=0, UPSIDE_DOWN, REFLECTION_X, REFLECTION_BOTH};
-
 CV_ENUM(BorderMode, BORDER_CONSTANT, BORDER_REPLICATE)
 CV_ENUM(InterType, INTER_NEAREST, INTER_LINEAR)
 CV_ENUM(InterTypeExtended, INTER_NEAREST, INTER_LINEAR, WARP_RELATIVE_MAP)
-CV_ENUM(RemapMode, HALF_SIZE, UPSIDE_DOWN, REFLECTION_X, REFLECTION_BOTH)

 typedef TestBaseWithParam< tuple<Size, InterType, BorderMode, MatType> > TestWarpAffine;
 typedef TestBaseWithParam< tuple<Size, InterType, BorderMode, MatType> > TestWarpPerspective;
 typedef TestBaseWithParam< tuple<Size, InterType, BorderMode, MatType> > TestWarpPerspectiveNear_t;
-typedef TestBaseWithParam< tuple<MatType, Size, InterTypeExtended, BorderMode, RemapMode> > TestRemap;
+typedef TestBaseWithParam< tuple<Size, InterTypeExtended, BorderMode, MatType> > TestRemap;

-void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode, bool relative = false );
+void update_map(const Mat& src, Mat& map_x, Mat& map_y, bool relative = false );

 PERF_TEST_P( TestWarpAffine, WarpAffine,
             Combine(
@ -156,21 +153,19 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
    SANITY_CHECK(dst, 1);
 }

-PERF_TEST_P( TestRemap, remap,
+PERF_TEST_P( TestRemap, map1_32fc1,
             Combine(
-                 Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ),
                 Values( szVGA, sz1080p ),
                 InterTypeExtended::all(),
                 BorderMode::all(),
-                 RemapMode::all()
+                 Values(CV_8UC3, CV_16UC3, CV_32FC3, CV_8UC1, CV_16UC1, CV_32FC1, CV_8UC4, CV_16UC4, CV_32FC4)
                 )
             )
 {
-    int type = get<0>(GetParam());
-    Size size = get<1>(GetParam());
-    int interpolationType = get<2>(GetParam());
-    int borderMode = get<3>(GetParam());
-    int remapMode = get<4>(GetParam());
+    Size size = get<0>(GetParam());
+    int interpolationType = get<1>(GetParam());
+    int borderMode = get<2>(GetParam());
+    int type = get<3>(GetParam());
    unsigned int height = size.height;
    unsigned int width = size.width;
    Mat source(height, width, type);
@ -180,7 +175,7 @@ PERF_TEST_P( TestRemap, remap,

    declare.in(source, WARMUP_RNG);

-    update_map(source, map_x, map_y, remapMode, ((interpolationType & WARP_RELATIVE_MAP) != 0));
+    update_map(source, map_x, map_y, ((interpolationType & WARP_RELATIVE_MAP) != 0));

    TEST_CYCLE()
    {
@ -190,15 +185,68 @@ PERF_TEST_P( TestRemap, remap,
    SANITY_CHECK_NOTHING();
 }

-void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode, bool relative )
+PERF_TEST_P( TestRemap, map1_32fc2, // times remap() when driven by one two-channel float map
+             Combine(
+                 Values( szVGA, sz1080p ),
+                 InterTypeExtended::all(),
+                 BorderMode::all(),
+                 Values(CV_8UC3, CV_16UC3, CV_32FC3, CV_8UC1, CV_16UC1, CV_32FC1, CV_8UC4, CV_16UC4, CV_32FC4)
+                 )
+             )
 {
-    for( int j = 0; j < src.rows; j++ )
+    Size size = get<0>(GetParam());
+    int interpolationType = get<1>(GetParam());
+    int borderMode = get<2>(GetParam());
+    int type = get<3>(GetParam());
+    unsigned int height = size.height;
+    unsigned int width = size.width;
+    Mat source(height, width, type);
+    Mat destination;
+    Mat map_x(height, width, CV_32FC2); // single map holding interleaved (x, y) float pairs
+    Mat map_y; // left empty on purpose: update_map() selects its interleaved-map branch via map_y.empty()
+
+    declare.in(source, WARMUP_RNG);
+
+    update_map(source, map_x, map_y, ((interpolationType & WARP_RELATIVE_MAP) != 0)); // relative offsets only when the flag is set
+
+    TEST_CYCLE()
     {
-        for( int i = 0; i < src.cols; i++ )
+        remap(source, destination, map_x, map_y, interpolationType, borderMode); // the empty map_y is passed as map2
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+void update_map(const Mat& src, Mat& map_x, Mat& map_y, bool relative )
+{
+    if (map_y.empty()) {
+        for (int j = 0; j < src.rows; j++) {
+            float *row_xy = map_x.ptr<float>(j); // per-row pointer honours map_x.step, so a padded or ROI map is filled correctly
+            for (int i = 0; i < src.cols; i++) {
+                float *xy = row_xy + 2 * i; // interleaved pair: xy[0] = x coordinate, xy[1] = y coordinate
+                if( i > src.cols*0.25 && i < src.cols*0.75 && j > src.rows*0.25 && j < src.rows*0.75 )
+                {
+                    xy[0] = 2*( i - src.cols*0.25f ) + 0.5f ;
+                    xy[1] = 2*( j - src.rows*0.25f ) + 0.5f ;
+                }
+                else
+                {
+                    xy[0] = 0 ;
+                    xy[1] = 0 ;
+                }
+
+                if( relative )
+                {
+                    xy[0] -= static_cast<float>(i) ; // store the offset from the destination pixel instead of the absolute position
+                    xy[1] -= static_cast<float>(j) ;
+                }
+            }
+        }
+    } else {
+        for( int j = 0; j < src.rows; j++ )
        {
-            switch( remapMode )
+            for( int i = 0; i < src.cols; i++ )
            {
-            case HALF_SIZE:
                if( i > src.cols*0.25 && i < src.cols*0.75 && j > src.rows*0.25 && j < src.rows*0.75 )
                {
                    map_x.at<float>(j,i) = 2*( i - src.cols*0.25f ) + 0.5f ;
@ -209,25 +257,12 @@ void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode, boo
                    map_x.at<float>(j,i) = 0 ;
                    map_y.at<float>(j,i) = 0 ;
                }
-                break;
-            case UPSIDE_DOWN:
-                map_x.at<float>(j,i) = static_cast<float>(i) ;
-                map_y.at<float>(j,i) = static_cast<float>(src.rows - j) ;
-                break;
-            case REFLECTION_X:
-                map_x.at<float>(j,i) = static_cast<float>(src.cols - i) ;
-                map_y.at<float>(j,i) = static_cast<float>(j) ;
-                break;
-            case REFLECTION_BOTH:
-                map_x.at<float>(j,i) = static_cast<float>(src.cols - i) ;
-                map_y.at<float>(j,i) = static_cast<float>(src.rows - j) ;
-                break;
-            } // end of switch

-            if( relative )
-            {
-                map_x.at<float>(j,i) -= static_cast<float>(i);
-                map_y.at<float>(j,i) -= static_cast<float>(j);
+                if( relative )
+                {
+                    map_x.at<float>(j,i) -= static_cast<float>(i);
+                    map_y.at<float>(j,i) -= static_cast<float>(j);
+                }
            }
        }
    }
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -1634,77 +1634,13 @@ private:

 void cv::remap( InputArray _src, OutputArray _dst,
                InputArray _map1, InputArray _map2,
-                int interpolation, int borderType, const Scalar& borderValue )
+                int interpolation, int borderType, const Scalar& borderValue,
+                AlgorithmHint hint )
 {
    CV_INSTRUMENT_REGION();

-    const bool hasRelativeFlag = ((interpolation & cv::WARP_RELATIVE_MAP) != 0);
-
-    static RemapNNFunc nn_tab[2][CV_DEPTH_MAX] =
-    {
-        {
-            remapNearest<uchar, false>, remapNearest<schar, false>, remapNearest<ushort, false>, remapNearest<short, false>,
-            remapNearest<int, false>, remapNearest<float, false>, remapNearest<double, false>, 0
-        },
-        {
-            remapNearest<uchar, true>, remapNearest<schar, true>, remapNearest<ushort, true>, remapNearest<short, true>,
-            remapNearest<int, true>, remapNearest<float, true>, remapNearest<double, true>, 0
-        }
-    };
-
-    static RemapFunc linear_tab[2][CV_DEPTH_MAX] =
-    {
-        {
-            remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u<false>, short, false>, 0,
-            remapBilinear<Cast<float, ushort>, RemapNoVec<false>, float, false>,
-            remapBilinear<Cast<float, short>, RemapNoVec<false>, float, false>, 0,
-            remapBilinear<Cast<float, float>, RemapNoVec<false>, float, false>,
-            remapBilinear<Cast<double, double>, RemapNoVec<false>, float, false>, 0
-        },
-        {
-            remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u<true>, short, true>, 0,
-            remapBilinear<Cast<float, ushort>, RemapNoVec<true>, float, true>,
-            remapBilinear<Cast<float, short>, RemapNoVec<true>, float, true>, 0,
-            remapBilinear<Cast<float, float>, RemapNoVec<true>, float, true>,
-            remapBilinear<Cast<double, double>, RemapNoVec<true>, float, true>, 0
-        }
-    };
-
-    static RemapFunc cubic_tab[2][CV_DEPTH_MAX] =
-    {
-        {
-            remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, false>, 0,
-            remapBicubic<Cast<float, ushort>, float, 1, false>,
-            remapBicubic<Cast<float, short>, float, 1, false>, 0,
-            remapBicubic<Cast<float, float>, float, 1, false>,
-            remapBicubic<Cast<double, double>, float, 1, false>, 0
-        },
-        {
-            remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, true>, 0,
-            remapBicubic<Cast<float, ushort>, float, 1, true>,
-            remapBicubic<Cast<float, short>, float, 1, true>, 0,
-            remapBicubic<Cast<float, float>, float, 1, true>,
-            remapBicubic<Cast<double, double>, float, 1, true>, 0
-        }
-};
-
-    static RemapFunc lanczos4_tab[2][8] =
-    {
-        {
-            remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, false>, 0,
-            remapLanczos4<Cast<float, ushort>, float, 1, false>,
-            remapLanczos4<Cast<float, short>, float, 1, false>, 0,
-            remapLanczos4<Cast<float, float>, float, 1, false>,
-            remapLanczos4<Cast<double, double>, float, 1, false>, 0
-        },
-        {
-            remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, true>, 0,
-            remapLanczos4<Cast<float, ushort>, float, 1, true>,
-            remapLanczos4<Cast<float, short>, float, 1, true>, 0,
-            remapLanczos4<Cast<float, float>, float, 1, true>,
-            remapLanczos4<Cast<double, double>, float, 1, true>, 0
-        }
-};
+    if (hint == cv::ALGO_HINT_DEFAULT)
+        hint = cv::getDefaultAlgorithmHint();

    CV_Assert( !_map1.empty() );
    CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
@ -1728,12 +1664,78 @@ void cv::remap( InputArray _src, OutputArray _dst,
                 map1.ptr<float>(), map1.step, map2.ptr<float>(), map2.step, interpolation, borderType, borderValue.val);
    }

+    const bool hasRelativeFlag = ((interpolation & cv::WARP_RELATIVE_MAP) != 0);
+
    interpolation &= ~cv::WARP_RELATIVE_MAP;
    if( interpolation == INTER_AREA )
        interpolation = INTER_LINEAR;

    int type = src.type(), depth = CV_MAT_DEPTH(type);

+    if (interpolation == INTER_LINEAR) { // WARP_RELATIVE_MAP was masked out above and INTER_AREA folded into INTER_LINEAR
+        if (map1.depth() == CV_32F) { // depth-only check: the channel count of map1 is not examined here
+            const auto *src_data = src.ptr<const uint8_t>();
+            auto *dst_data = dst.ptr<uint8_t>();
+            size_t src_step = src.step, dst_step = dst.step,
+                   map1_step = map1.step, map2_step = map2.step;
+            int src_rows = src.rows, src_cols = src.cols;
+            int dst_rows = dst.rows, dst_cols = dst.cols;
+            const float *map1_data = map1.ptr<const float>();
+            const float *map2_data = map2.ptr<const float>(); // NOTE(review): map2 may be empty here (the asserts above only require empty or map1-sized) — confirm the invokers handle that
+            switch (src.type()) { // NOTE(review): presumably CV_CPU_DISPATCH returns from remap() after dispatching — confirm; otherwise the code below would also run
+                case CV_8UC1: {
+                    if (hint == cv::ALGO_HINT_APPROX) { // only the 8-bit cases branch on the algorithm hint
+                        CV_CPU_DISPATCH(remapLinearApproxInvoker_8UC1, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    } else {
+                        CV_CPU_DISPATCH(remapLinearInvoker_8UC1, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    }
+                    break;
+                }
+                case CV_8UC3: {
+                    if (hint == cv::ALGO_HINT_APPROX) {
+                        CV_CPU_DISPATCH(remapLinearApproxInvoker_8UC3, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    } else {
+                        CV_CPU_DISPATCH(remapLinearInvoker_8UC3, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    }
+                    break;
+                }
+                case CV_8UC4: {
+                    if (hint == cv::ALGO_HINT_APPROX) {
+                        CV_CPU_DISPATCH(remapLinearApproxInvoker_8UC4, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    } else {
+                        CV_CPU_DISPATCH(remapLinearInvoker_8UC4, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    }
+                    break;
+                }
+                case CV_16UC1: { // 16-bit and float cases have a single invoker and do not consult the hint
+                    CV_CPU_DISPATCH(remapLinearInvoker_16UC1, ((uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL); // NOTE(review): the C-style cast also drops the const of src_data
+                    break;
+                }
+                case CV_16UC3: {
+                    CV_CPU_DISPATCH(remapLinearInvoker_16UC3, ((uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    break;
+                }
+                case CV_16UC4: {
+                    CV_CPU_DISPATCH(remapLinearInvoker_16UC4, ((uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    break;
+                }
+                case CV_32FC1: {
+                    CV_CPU_DISPATCH(remapLinearInvoker_32FC1, ((float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    break;
+                }
+                case CV_32FC3: {
+                    CV_CPU_DISPATCH(remapLinearInvoker_32FC3, ((float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    break;
+                }
+                case CV_32FC4: {
+                    CV_CPU_DISPATCH(remapLinearInvoker_32FC4, ((float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
+                    break;
+                }
+                // no default: any other source type leaves the switch and continues with the code below
+            }
+        }
+    }
+
 #if defined HAVE_IPP && !IPP_DISABLE_REMAP
    CV_IPP_CHECK()
    {
@ -1781,6 +1783,72 @@ void cv::remap( InputArray _src, OutputArray _dst,
    bool fixpt = depth == CV_8U;
    bool planar_input = false;

+    static RemapNNFunc nn_tab[2][CV_DEPTH_MAX] = // nearest-neighbour kernels; NOTE(review): inner index is presumably the source depth — confirm at the lookup site
+    {
+        { // row 0: instantiations with the bool template argument set to false
+            remapNearest<uchar, false>, remapNearest<schar, false>, remapNearest<ushort, false>, remapNearest<short, false>,
+            remapNearest<int, false>, remapNearest<float, false>, remapNearest<double, false>, 0
+        },
+        { // row 1: same element types with the bool template argument set to true
+            remapNearest<uchar, true>, remapNearest<schar, true>, remapNearest<ushort, true>, remapNearest<short, true>,
+            remapNearest<int, true>, remapNearest<float, true>, remapNearest<double, true>, 0
+        }
+    };
+
+    static RemapFunc linear_tab[2][CV_DEPTH_MAX] = // bilinear kernels; a 0 entry means no kernel is provided for that slot
+    {
+        { // row 0: instantiations with the trailing bool template argument false
+            remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u<false>, short, false>, 0,
+            remapBilinear<Cast<float, ushort>, RemapNoVec<false>, float, false>,
+            remapBilinear<Cast<float, short>, RemapNoVec<false>, float, false>, 0,
+            remapBilinear<Cast<float, float>, RemapNoVec<false>, float, false>,
+            remapBilinear<Cast<double, double>, RemapNoVec<false>, float, false>, 0
+        },
+        { // row 1: same kernels with the bool template arguments true
+            remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u<true>, short, true>, 0,
+            remapBilinear<Cast<float, ushort>, RemapNoVec<true>, float, true>,
+            remapBilinear<Cast<float, short>, RemapNoVec<true>, float, true>, 0,
+            remapBilinear<Cast<float, float>, RemapNoVec<true>, float, true>,
+            remapBilinear<Cast<double, double>, RemapNoVec<true>, float, true>, 0
+        }
+    };
+
+    static RemapFunc cubic_tab[2][CV_DEPTH_MAX] = // bicubic kernels; a 0 entry means no kernel is provided for that slot
+    {
+        { // row 0: instantiations with the trailing bool template argument false
+            remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, false>, 0,
+            remapBicubic<Cast<float, ushort>, float, 1, false>,
+            remapBicubic<Cast<float, short>, float, 1, false>, 0,
+            remapBicubic<Cast<float, float>, float, 1, false>,
+            remapBicubic<Cast<double, double>, float, 1, false>, 0
+        },
+        { // row 1: same kernels with the trailing bool template argument true
+            remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, true>, 0,
+            remapBicubic<Cast<float, ushort>, float, 1, true>,
+            remapBicubic<Cast<float, short>, float, 1, true>, 0,
+            remapBicubic<Cast<float, float>, float, 1, true>,
+            remapBicubic<Cast<double, double>, float, 1, true>, 0
+        }
+    };
+
+    static RemapFunc lanczos4_tab[2][CV_DEPTH_MAX] = // Lanczos4 kernels; sized by CV_DEPTH_MAX like nn_tab/linear_tab/cubic_tab instead of a literal 8
+    {
+        { // row 0: instantiations with the trailing bool template argument false
+            remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, false>, 0,
+            remapLanczos4<Cast<float, ushort>, float, 1, false>,
+            remapLanczos4<Cast<float, short>, float, 1, false>, 0,
+            remapLanczos4<Cast<float, float>, float, 1, false>,
+            remapLanczos4<Cast<double, double>, float, 1, false>, 0
+        },
+        { // row 1: same kernels with the trailing bool template argument true
+            remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, true>, 0,
+            remapLanczos4<Cast<float, ushort>, float, 1, true>,
+            remapLanczos4<Cast<float, short>, float, 1, true>, 0,
+            remapLanczos4<Cast<float, float>, float, 1, true>,
+            remapLanczos4<Cast<double, double>, float, 1, true>, 0
+        }
+    };
+
    const int relativeOptionIndex = (hasRelativeFlag ? 1 : 0);
    if( interpolation == INTER_NEAREST )
    {
--- a/modules/imgproc/src/opencl/remap.cl
+++ b/modules/imgproc/src/opencl/remap.cl
@ -334,14 +334,6 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int

 #elif defined INTER_LINEAR

-__constant float coeffs[64] =
-{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
-  0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
-  0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
-  0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
-  0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
-  0.062500f, 0.937500f, 0.031250f, 0.968750f };
-
 __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                                __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                                __global const uchar * map1ptr, int map1_step, int map1_offset,
@ -422,109 +414,62 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
    if (x < dst_cols)
    {
        WT scalar = CONVERT_TO_WT(convertScalar(nVal));
-        int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
        int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
        int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));

-        #pragma unroll
-        for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
-            map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
-            if (y < dst_rows)
-            {
-                __global const float * map1 = (__global const float *)(map1ptr + map1_index);
-                __global const float * map2 = (__global const float *)(map2ptr + map2_index);
-                __global T * dst = (__global T *)(dstptr + dst_index);
+        for (int dy = y, dy1 = min(dst_rows, y + ROWS_PER_WI); dy < dy1; ++dy, map1_index += map1_step, map2_index += map2_step)
+        {
+            __global const float * map1 = (__global const float *)(map1ptr + map1_index);
+            __global const float * map2 = (__global const float *)(map2ptr + map2_index);

-#if defined BORDER_CONSTANT
-                float xf = map1[0], yf = map2[0];
-                int sx = (convert_int_sat_rtz(mad(xf, (float)INTER_TAB_SIZE, 0.5f)) >> INTER_BITS);
-                int sy = (convert_int_sat_rtz(mad(yf, (float)INTER_TAB_SIZE, 0.5f)) >> INTER_BITS);
-                #if WARP_RELATIVE
-                sx += x;
-                sy += y;
-                #endif
+            float X0 = map1[0];
+            float Y0 = map2[0];
+            #if WARP_RELATIVE
+            X0 += x;
+            Y0 += dy;
+            #endif

-                __constant float * coeffs_x = coeffs + ((convert_int_rte(xf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
-                __constant float * coeffs_y = coeffs + ((convert_int_rte(yf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
+            int sx = convert_int_sat_rtn(X0); // floor with saturation: out-of-range or NaN map values must not convert to implementation-defined integers
+            int sy = convert_int_sat_rtn(Y0);

-                WT sum = (WT)(0), xsum;
-                int src_index = mad24(sy, src_step, mad24(sx, TSIZE, src_offset));
+            float ax = X0 - (float) sx;
+            float ay = Y0 - (float) sy;

-                #pragma unroll
-                for (int yp = 0; yp < 2; ++yp, src_index += src_step)
-                {
-                    if (sy + yp >= 0 && sy + yp < src_rows)
-                    {
-                        xsum = (WT)(0);
-                        if (sx >= 0 && sx + 2 < src_cols)
-                        {
-#if SRC_DEPTH == 0 && CN == 1
-                            uchar2 value = vload2(0, srcptr + src_index);
-                            xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
-#else
-                            #pragma unroll
-                            for (int xp = 0; xp < 2; ++xp)
-                                xsum = fma(CONVERT_TO_WT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);
-#endif
-                        }
-                        else
-                        {
-                            #pragma unroll
-                            for (int xp = 0; xp < 2; ++xp)
-                                xsum = fma(sx + xp >= 0 && sx + xp < src_cols ?
-                                           CONVERT_TO_WT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);
-                        }
-                        sum = fma(xsum, coeffs_y[yp], sum);
-                    }
-                    else
-                        sum = fma(scalar, coeffs_y[yp], sum);
-                }
+            int2 map_data0 = (int2)(sx, sy);
+            int2 map_data1 = (int2)(sx+1, sy);
+            int2 map_data2 = (int2)(sx, sy+1);
+            int2 map_data3 = (int2)(sx+1, sy+1);

-                storepix(CONVERT_TO_T(sum), dst);
-#else
-                float2 map_data = (float2)(map1[0], map2[0]);
-                #if WARP_RELATIVE
-                map_data.x += x;
-                map_data.y += y;
-                #endif
-
-                int2 map_dataA = convert_int2_sat_rtn(map_data);
-                int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
-                int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
-                int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
-
-                float2 _u = map_data - convert_float2(map_dataA);
-                WT2 u = CONVERT_TO_WT2(convert_int2_rte(CONVERT_TO_WT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
-                WT scalar = CONVERT_TO_WT(convertScalar(nVal));
-                WT a = scalar, b = scalar, c = scalar, d = scalar;
-
-                if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
-                    a = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
-                else
-                    EXTRAPOLATE(map_dataA, a);
-
-                if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
-                    b = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
-                else
-                    EXTRAPOLATE(map_dataB, b);
-
-                if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
-                    c = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
-                else
-                    EXTRAPOLATE(map_dataC, c);
-
-                if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
-                    d = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
-                else
-                    EXTRAPOLATE(map_dataD, d);
-
-                WT dst_data = a * (1 - u.x) * (1 - u.y) +
-                              b * (u.x)     * (1 - u.y) +
-                              c * (1 - u.x) * (u.y) +
-                              d * (u.x)     * (u.y);
-                storepix(CONVERT_TO_T(dst_data), dst);
-#endif
+            WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
+            if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) {
+                v0 = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset)))));
+            } else {
+                EXTRAPOLATE(map_data0, v0);
            }
+            if (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) {
+                v2 = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(sy+1, src_step, mad24(sx, TSIZE, src_offset)))));
+            } else {
+                EXTRAPOLATE(map_data2, v2);
+            }
+
+            if (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) {
+                v1 = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(sy, src_step, mad24(sx+1, TSIZE, src_offset)))));
+            } else {
+                EXTRAPOLATE(map_data1, v1);
+            }
+            if (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) {
+                v3 = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(sy+1, src_step, mad24(sx+1, TSIZE, src_offset)))));
+            } else {
+                EXTRAPOLATE(map_data3, v3);
+            }
+
+            int dst_index = mad24(dy, dst_step, mad24(x, TSIZE, dst_offset));
+
+            v0 = fma(v1 - v0, ax, v0);
+            v2 = fma(v3 - v2, ax, v2);
+            v0 = fma(v2 - v0, ay, v0);
+            storepix(CONVERT_TO_T(v0), dstptr + dst_index);
+        }
    }
 }

@ -539,57 +484,61 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
    if (x < dst_cols)
    {
        WT scalar = CONVERT_TO_WT(convertScalar(nVal));
-        int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
        int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));

-        #pragma unroll
-        for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
-            map_index += map_step, dst_index += dst_step)
-            if (y < dst_rows)
-            {
-                __global const float2 * map = (__global const float2 *)(mapptr + map_index);
-                __global T * dst = (__global T *)(dstptr + dst_index);
+        for (int dy = y, dy1 = min(dst_rows, y + ROWS_PER_WI); dy < dy1; ++dy, map_index += map_step)
+        {
+            __global const float2 * map = (__global const float2 *)(mapptr + map_index);
+            float2 map_data = map[0];

-                float2 map_data = map[0];
-                #if WARP_RELATIVE
-                map_data.x += x;
-                map_data.y += y;
-                #endif
-                int2 map_dataA = convert_int2_sat_rtn(map_data);
-                int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
-                int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
-                int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
+            float X0 = map_data.x;
+            float Y0 = map_data.y;
+            #if WARP_RELATIVE
+            X0 += x;
+            Y0 += dy;
+            #endif

-                float2 _u = map_data - convert_float2(map_dataA);
-                WT2 u = CONVERT_TO_WT2(convert_int2_rte(CONVERT_TO_WT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
-                WT a = scalar, b = scalar, c = scalar, d = scalar;
+            int sx = convert_int_sat_rtn(X0); // floor with saturation: out-of-range or NaN map values must not convert to implementation-defined integers
+            int sy = convert_int_sat_rtn(Y0);

-                if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
-                    a = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
-                else
-                    EXTRAPOLATE(map_dataA, a);
+            float ax = X0 - (float) sx;
+            float ay = Y0 - (float) sy;

-                if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
-                    b = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
-                else
-                    EXTRAPOLATE(map_dataB, b);
+            int2 map_data0 = (int2)(sx, sy);
+            int2 map_data1 = (int2)(sx+1, sy);
+            int2 map_data2 = (int2)(sx, sy+1);
+            int2 map_data3 = (int2)(sx+1, sy+1);

-                if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
-                    c = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
-                else
-                    EXTRAPOLATE(map_dataC, c);
-
-                if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
-                    d = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
-                else
-                    EXTRAPOLATE(map_dataD, d);
-
-                WT dst_data = a * (1 - u.x) * (1 - u.y) +
-                              b * (u.x)     * (1 - u.y) +
-                              c * (1 - u.x) * (u.y) +
-                              d * (u.x)     * (u.y);
-                storepix(CONVERT_TO_T(dst_data), dst);
+            WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
+            if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) {
+                v0 = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset)))));
+            } else {
+                EXTRAPOLATE(map_data0, v0);
            }
+            if (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) {
+                v2 = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(sy+1, src_step, mad24(sx, TSIZE, src_offset)))));
+            } else {
+                EXTRAPOLATE(map_data2, v2);
+            }
+
+            if (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) {
+                v1 = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(sy, src_step, mad24(sx+1, TSIZE, src_offset)))));
+            } else {
+                EXTRAPOLATE(map_data1, v1);
+            }
+            if (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) {
+                v3 = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(sy+1, src_step, mad24(sx+1, TSIZE, src_offset)))));
+            } else {
+                EXTRAPOLATE(map_data3, v3);
+            }
+
+            int dst_index = mad24(dy, dst_step, mad24(x, TSIZE, dst_offset));
+
+            v0 = fma(v1 - v0, ax, v0);
+            v2 = fma(v3 - v2, ax, v2);
+            v0 = fma(v2 - v0, ay, v0);
+            storepix(CONVERT_TO_T(v0), dstptr + dst_index);
+        }
    }
 }

--- a/modules/imgproc/src/warp_common.scalar.hpp
+++ b/modules/imgproc/src/warp_common.scalar.hpp
@ -3,6 +3,29 @@
 // of this distribution and at http://opencv.org/license.html.

 // Shuffle
+#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(cn, dtype_reg) \
+    dtype_reg p00##cn, p01##cn, p10##cn, p11##cn; // the four taps of the 2x2 neighbourhood for channel suffix cn
+#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_C1(dtype_reg, dtype_ptr) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(g, dtype_reg) \
+    const dtype_ptr *srcptr = src + srcstep * iy + ix; // uses src, srcstep, ix and iy from the expansion site; one channel per pixel
+#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_C3(dtype_reg, dtype_ptr) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(r, dtype_reg) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(g, dtype_reg) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(b, dtype_reg) \
+    const dtype_ptr *srcptr = src + srcstep * iy + ix*3; // three interleaved channels per pixel
+#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_C4(dtype_reg, dtype_ptr) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(r, dtype_reg) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(g, dtype_reg) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(b, dtype_reg) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(a, dtype_reg) \
+    const dtype_ptr *srcptr = src + srcstep * iy + ix*4; // four interleaved channels per pixel
+#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_8U(CN) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##CN(int, uint8_t) // 8U source: taps are held as int
+#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_16U(CN) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##CN(int, uint16_t) // 16U source: taps are held as int
+#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_32F(CN) \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##CN(float, float) // 32F source: taps stay float
+
 #define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(CN, cn, i) \
    p00##CN = srcptr[i]; p01##CN = srcptr[i + cn]; \
    p10##CN = srcptr[srcstep + i]; p11##CN = srcptr[srcstep + cn + i];
@ -93,7 +116,10 @@
        pxy##a = src[glob_ofs+3]; \
    }

-#define CV_WARP_LINEAR_SCALAR_SHUFFLE(CN) \
+#define CV_WARP_LINEAR_SCALAR_SHUFFLE(CN, DEPTH) \
+    int ix = cvFloor(sx), iy = cvFloor(sy); \
+    sx -= ix; sy -= iy; \
+    CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##DEPTH(CN); \
    if ((((unsigned)ix < (unsigned)(srccols-1)) & \
        ((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \
        CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_##CN() \
--- a/modules/imgproc/src/warp_common.vector.hpp
+++ b/modules/imgproc/src/warp_common.vector.hpp
@ -555,3 +555,83 @@
    vst4_u8(dstptr + x*4, result);
 #define CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(CN) \
    CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8_##CN()
+
+
+// Special case for C4 load, shuffle and bilinear interpolation
+// 128-bit variant, one destination pixel per _I(ofs) expansion.
+// _I(ofs) reads the 2x2 neighbourhood of pixel i+ofs starting at src + addr[i+ofs]
+// (taps at +0, +4, +srcstep and +srcstep+4), converts each tap to float and
+// blends horizontally with valpha[i+ofs], then vertically with vbeta[i+ofs].
+// The blended pixel is left in i<ofs>_pix0. Expects `i`, `src`, `addr`, `srcstep`,
+// `valpha` and `vbeta` to be in scope where expanded.
+#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(ofs) \
+    const uint8_t *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_pix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(srcptr##ofs))); \
+    v_float32 i##ofs##_pix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(srcptr##ofs+4))); \
+    v_float32 i##ofs##_pix2 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(srcptr##ofs+srcstep))); \
+    v_float32 i##ofs##_pix3 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(srcptr##ofs+srcstep+4))); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta  = vx_setall_f32(vbeta[i+ofs]);  \
+    i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
+    i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
+    i##ofs##_pix0 = v_fma(i##ofs##_beta,  v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
+// Driver: walks the `uf` pixels of the current batch, interpolating offsets 0..3
+// per iteration, then rounds, packs the results pairwise and stores them at
+// dstptr + 4*(x+i) and dstptr + 4*(x+i+2).
+// NOTE(review): the body consumes four pixels per iteration while the loop
+// advances by vlanes_32 - presumably vlanes_32 == 4 for this variant; confirm.
+#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4() \
+    for (int i = 0; i < uf; i+=vlanes_32) { \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(0); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(1); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(2); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(3); \
+        auto i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)), \
+             i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
+        v_pack_store(dstptr + 4*(x+i), i01_pix); \
+        v_pack_store(dstptr + 4*(x+i+2), i23_pix); \
+    }
+// 256-bit variant, two destination pixels (i+ofs0 and i+ofs1) per _I expansion.
+// For each pixel one load at srcptr and one at srcptr+srcstep fetch the upper
+// and lower row of its neighbourhood (presumably both horizontal taps per load -
+// confirm against v256_load_expand_q). v_recombine then regroups the halves of
+// the two pixels into i<ofs0><ofs1>_fpix00/11/22/33, and the alpha/beta vectors
+// are built with v_combine_low from each pixel's own valpha/vbeta entry.
+// The blended result is left in i<ofs0><ofs1>_fpix00. Expects `i`, `src`, `addr`,
+// `srcstep`, `valpha` and `vbeta` to be in scope where expanded.
+#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \
+    const uint8_t *srcptr##ofs0 = src + addr[i+ofs0]; \
+    const uint8_t *srcptr##ofs1 = src + addr[i+ofs1]; \
+    v_int32 i##ofs0##_pix01 = v_reinterpret_as_s32(v256_load_expand_q(srcptr##ofs0)), \
+            i##ofs0##_pix23 = v_reinterpret_as_s32(v256_load_expand_q(srcptr##ofs0+srcstep)); \
+    v_int32 i##ofs1##_pix01 = v_reinterpret_as_s32(v256_load_expand_q(srcptr##ofs1)), \
+            i##ofs1##_pix23 = v_reinterpret_as_s32(v256_load_expand_q(srcptr##ofs1+srcstep)); \
+    v_float32 i##ofs0##_fpix01 = v_cvt_f32(i##ofs0##_pix01), i##ofs0##_fpix23 = v_cvt_f32(i##ofs0##_pix23); \
+    v_float32 i##ofs1##_fpix01 = v_cvt_f32(i##ofs1##_pix01), i##ofs1##_fpix23 = v_cvt_f32(i##ofs1##_pix23); \
+    v_float32 i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11, \
+              i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33; \
+    v_recombine(i##ofs0##_fpix01, i##ofs1##_fpix01, i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11); \
+    v_recombine(i##ofs0##_fpix23, i##ofs1##_fpix23, i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33); \
+    v_float32 i##ofs0##_alpha = vx_setall_f32(valpha[i+ofs0]), \
+              i##ofs1##_alpha = vx_setall_f32(valpha[i+ofs1]), \
+              i##ofs0##_beta  = vx_setall_f32(vbeta[i+ofs0]), \
+              i##ofs1##_beta  = vx_setall_f32(vbeta[i+ofs1]); \
+    v_float32 i##ofs0##ofs1##_alpha = v_combine_low(i##ofs0##_alpha, i##ofs1##_alpha), \
+              i##ofs0##ofs1##_beta  = v_combine_low(i##ofs0##_beta,  i##ofs1##_beta); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
+    i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta,  v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
+// Driver: per iteration interpolates offsets 0..7 as the pairs (0,1), (2,3),
+// (4,5), (6,7). Each group of four pixels is rounded, packed with v_pack_u and
+// stored at dstptr + 4*(x+i) and dstptr + 4*(x+i+4) respectively.
+// NOTE(review): eight pixels are consumed per iteration while the loop advances
+// by vlanes_32 - presumably vlanes_32 == 8 for this variant; confirm.
+#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4() \
+    for (int i = 0; i < uf; i+=vlanes_32) { \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(0, 1); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(2, 3); \
+        auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
+        v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(4, 5); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(6, 7); \
+        auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
+        v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix)); \
+    }
+// Variant built on the `<4>`-templated v_load_expand_q and v_pack_u_store<8>
+// forms; one destination pixel per _I(ofs) expansion.
+// _I(ofs) reads the 2x2 neighbourhood of pixel i+ofs starting at src + addr[i+ofs]
+// (taps at +0, +4, +srcstep and +srcstep+4), converts each tap to float and
+// blends horizontally with valpha[i+ofs], then vertically with vbeta[i+ofs].
+// The blended pixel is left in i<ofs>_fpix0. Expects `i`, `src`, `addr`,
+// `srcstep`, `valpha` and `vbeta` to be in scope where expanded.
+#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(ofs) \
+    const uint8_t *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_fpix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q<4>(srcptr##ofs))), \
+              i##ofs##_fpix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q<4>(srcptr##ofs+4))), \
+              i##ofs##_fpix2 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q<4>(srcptr##ofs+srcstep))), \
+              i##ofs##_fpix3 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q<4>(srcptr##ofs+srcstep+4))); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta  = vx_setall_f32(vbeta[i+ofs]); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
+    i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_beta,  v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
+// Driver: walks the `uf` pixels of the current batch four at a time, rounds and
+// packs the results pairwise, and stores them at dstptr + 4*(x+i) and
+// dstptr + 4*(x+i+2).
+#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4() \
+    for (int i = 0; i < uf; i+=4) { \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(0); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(1); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(2); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(3); \
+        auto i01_pix = v_pack(v_round(i0_fpix0), v_round(i1_fpix0)), \
+             i23_pix = v_pack(v_round(i2_fpix0), v_round(i3_fpix0)); \
+        v_pack_u_store<8>(dstptr + 4*(x+i), i01_pix); \
+        v_pack_u_store<8>(dstptr + 4*(x+i+2), i23_pix); \
+    }
--- a/modules/imgproc/src/warp_kernels.simd.hpp
+++ b/modules/imgproc/src/warp_kernels.simd.hpp
--- a/modules/imgproc/test/test_imgwarp_strict.cpp
+++ b/modules/imgproc/test/test_imgwarp_strict.cpp
@ -703,6 +703,16 @@ protected:
    virtual void run_func();
    virtual void run_reference_func();

+    template<typename T>
+    void new_linear_c1(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
+                       const T *bval, int borderType_x, int borderType_y);
+    template<typename T>
+    void new_linear_c3(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
+                       const T *bval, int borderType_x, int borderType_y);
+    template<typename T>
+    void new_linear_c4(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
+                       const T *bval, int borderType_x, int borderType_y);
+
    Mat mapx, mapy;
    int borderType;
    Scalar borderValue;
@ -710,6 +720,7 @@ protected:
    remap_func funcs[2];

 private:
+    template <typename T> void new_remap(const Mat&, Mat&);
    void remap_nearest(const Mat&, Mat&);
    void remap_generic(const Mat&, Mat&);

@ -865,15 +876,189 @@ void CV_Remap_Test::prepare_test_data_for_reference_func()

 void CV_Remap_Test::run_reference_func()
 {
-    prepare_test_data_for_reference_func();
-
    if (interpolation == INTER_AREA)
        interpolation = INTER_LINEAR;

+    // Bilinear remap driven by CV_32F maps is checked against new_remap(), which
+    // works on the source in its native depth; every other configuration falls
+    // through to the generic float reference below.
+    const int src_cn = src.channels();
+    const bool linear_f32_maps = interpolation == INTER_LINEAR && mapx.depth() == CV_32F;
+    if (linear_f32_maps && (src_cn == 1 || src_cn == 3 || src_cn == 4)) {
+        Mat tmp = Mat::zeros(dst.size(), dst.type());
+        bool handled = true;
+        switch (src.depth()) {
+            case CV_8U:  new_remap<uint8_t>(src, tmp);  break;
+            case CV_16U: new_remap<uint16_t>(src, tmp); break;
+            case CV_32F: new_remap<float>(src, tmp);    break;
+            default:     handled = false;               break;
+        }
+        if (handled) {
+            tmp.convertTo(reference_dst, reference_dst.depth());
+            return;
+        }
+    }
+
+    // The generic path needs the converted source/maps, so prepare them only here.
+    prepare_test_data_for_reference_func();
+
    int index = interpolation == INTER_NEAREST ? 0 : 1;
    (this->*funcs[index])(src, reference_dst);
 }

+// Fills tap (dy, dx) of the 2x2 neighbourhood in `pxy` (slot 2*dy*cn + dx*cn)
+// for a `cn`-channel pixel:
+//   - tap inside the image: read relative to `srcptr`;
+//   - BORDER_CONSTANT:      use the border value `bval`;
+//   - BORDER_TRANSPARENT:   reuse the current destination pixel dstptr[x*cn + ci];
+//   - any other mode:       map the coordinates with borderInterpolate() using
+//                           borderType_x / borderType_y and read from `srcptr_`.
+// Expects ix, iy, x, srccols, srcrows, srcstep, srcptr, srcptr_, dstptr, bval,
+// pxy, borderType, borderType_x and borderType_y to be in scope where expanded.
+#define FETCH_PIXEL_SCALAR(cn, dy, dx) \
+    if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
+        size_t ofs = dy*srcstep + dx*cn; \
+        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr[ofs+ci];} \
+    } else if (borderType == BORDER_CONSTANT) { \
+        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = bval[ci];} \
+    } else if (borderType == BORDER_TRANSPARENT) { \
+        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = dstptr[x*cn+ci];} \
+    } else { \
+        int ix_ = borderInterpolate(ix + dx, srccols, borderType_x); \
+        int iy_ = borderInterpolate(iy + dy, srcrows, borderType_y); \
+        size_t glob_ofs = iy_*srcstep + ix_*cn; \
+        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr_[glob_ofs+ci];} \
+    }
+
+// Gathers the 2x2 neighbourhood of (ix, iy) into `pxy`, stored as four
+// cn-element groups in the order (iy, ix), (iy, ix+1), (iy+1, ix), (iy+1, ix+1).
+// Fast path: when ix < srccols-1 and iy < srcrows-1 (the unsigned compare also
+// rejects negative indices) all four taps are read straight from `srcptr`.
+// Otherwise, for BORDER_CONSTANT / BORDER_TRANSPARENT with the whole
+// neighbourhood outside the image, the destination pixel is set to `bval`
+// (CONSTANT) or left untouched (TRANSPARENT) and the macro executes `return`,
+// leaving the ENCLOSING function. The remaining cases fetch each tap through
+// FETCH_PIXEL_SCALAR.
+#define WARPAFFINE_SHUFFLE(cn) \
+    if ((((unsigned)ix < (unsigned)(srccols-1)) & \
+        ((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \
+        for (int ci = 0; ci < cn; ci++) { \
+            pxy[ci] = srcptr[ci]; \
+            pxy[ci+cn] = srcptr[ci+cn]; \
+            pxy[ci+cn*2] = srcptr[srcstep+ci]; \
+            pxy[ci+cn*3] = srcptr[srcstep+ci+cn]; \
+        } \
+    } else { \
+        if ((borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT) && \
+            (((unsigned)(ix+1) >= (unsigned)(srccols+1))| \
+            ((unsigned)(iy+1) >= (unsigned)(srcrows+1))) != 0) { \
+            if (borderType == BORDER_CONSTANT) { \
+                for (int ci = 0; ci < cn; ci++) { dstptr[x*cn+ci] = bval[ci]; } \
+            } \
+            return; \
+        } \
+        FETCH_PIXEL_SCALAR(cn, 0, 0); \
+        FETCH_PIXEL_SCALAR(cn, 0, 1); \
+        FETCH_PIXEL_SCALAR(cn, 1, 0); \
+        FETCH_PIXEL_SCALAR(cn, 1, 1); \
+    }
+
+// Bilinear blend of a gathered 2x2 neighbourhood.
+// `pxy` holds four groups of `cn` values in the order upper-left, upper-right,
+// lower-left, lower-right; `sx` / `sy` are the horizontal / vertical weights.
+// Each channel is interpolated in float and written to `dst` via saturate_cast<T>.
+template<typename T>
+static inline void warpaffine_linear_calc(int cn, const T *pxy, T *dst, float sx, float sy)
+{
+    const T *upper_left = pxy;
+    const T *upper_right = pxy + cn;
+    const T *lower_left = pxy + cn*2;
+    const T *lower_right = pxy + cn*3;
+
+    for (int c = 0; c < cn; ++c) {
+        const float ul = upper_left[c], ur = upper_right[c];
+        const float ll = lower_left[c], lr = lower_right[c];
+
+        // Blend along x on both rows, then along y between the two rows.
+        float blended = ul + sx*(ur - ul);
+        const float lower = ll + sx*(lr - ll);
+        blended += sy*(lower - blended);
+
+        dst[c] = saturate_cast<T>(blended);
+    }
+}
+
+// Reference bilinear sample for one 1-channel destination pixel `x`.
+// Splits (sx, sy) into the integer tap origin (ix, iy) and fractional weights,
+// gathers the 2x2 neighbourhood with WARPAFFINE_SHUFFLE and blends it into
+// dstptr[x]. WARPAFFINE_SHUFFLE may return early (after writing the border value
+// or leaving the pixel untouched), in which case no blend is performed.
+// The locals ix, iy, pxy and srcptr are referenced by name inside that macro.
+template<typename T>
+void CV_Remap_Test::new_linear_c1(int x, float sx, float sy, const T *srcptr_, T *dstptr,
+                                  int srccols, int srcrows, size_t srcstep,
+                                  const T *bval, int borderType_x, int borderType_y)
+{
+    int ix = (int)floorf(sx), iy = (int)floorf(sy);
+    sx -= ix; sy -= iy;
+
+    T pxy[4];
+    const T *srcptr = srcptr_ + srcstep*iy + ix;
+
+    WARPAFFINE_SHUFFLE(1);
+
+    warpaffine_linear_calc(1, pxy, dstptr+x, sx, sy);
+}
+
+// Reference bilinear sample for one 3-channel destination pixel `x`.
+// Splits (sx, sy) into the integer tap origin (ix, iy) and fractional weights,
+// gathers the 2x2 neighbourhood (12 values) with WARPAFFINE_SHUFFLE and blends
+// it into dstptr[x*3 .. x*3+2]. WARPAFFINE_SHUFFLE may return early (after
+// writing the border value or leaving the pixel untouched), skipping the blend.
+// The locals ix, iy, pxy and srcptr are referenced by name inside that macro.
+template<typename T>
+void CV_Remap_Test::new_linear_c3(int x, float sx, float sy, const T *srcptr_, T *dstptr,
+                                  int srccols, int srcrows, size_t srcstep,
+                                  const T *bval, int borderType_x, int borderType_y)
+{
+    int ix = (int)floorf(sx), iy = (int)floorf(sy);
+    sx -= ix; sy -= iy;
+
+    T pxy[12];
+    const T *srcptr = srcptr_ + srcstep*iy + ix*3;
+
+    WARPAFFINE_SHUFFLE(3);
+
+    warpaffine_linear_calc(3, pxy, dstptr+x*3, sx, sy);
+}
+
+// Reference bilinear sample for one 4-channel destination pixel `x`.
+// Splits (sx, sy) into the integer tap origin (ix, iy) and fractional weights,
+// gathers the 2x2 neighbourhood (16 values) with WARPAFFINE_SHUFFLE and blends
+// it into dstptr[x*4 .. x*4+3]. WARPAFFINE_SHUFFLE may return early (after
+// writing the border value or leaving the pixel untouched), skipping the blend.
+// The locals ix, iy, pxy and srcptr are referenced by name inside that macro.
+template<typename T>
+void CV_Remap_Test::new_linear_c4(int x, float sx, float sy, const T *srcptr_, T *dstptr,
+                                  int srccols, int srcrows, size_t srcstep,
+                                  const T *bval, int borderType_x, int borderType_y)
+{
+    int ix = (int)floorf(sx), iy = (int)floorf(sy);
+    sx -= ix; sy -= iy;
+
+    T pxy[16];
+    const T *srcptr = srcptr_ + srcstep*iy + ix*4;
+
+    WARPAFFINE_SHUFFLE(4);
+
+    warpaffine_linear_calc(4, pxy, dstptr+x*4, sx, sy);
+}
+
+// Reference implementation of bilinear remap for CV_32F maps.
+//
+// _src: source image with 1, 3 or 4 channels of element type T.
+// _dst: pre-allocated destination with element type T; pixels skipped by
+//       BORDER_TRANSPARENT keep whatever value they already hold.
+// The sampling coordinates come from the members mapx / mapy: either two
+// single-channel maps (x in mapx, y in mapy) or a two-channel mapx holding
+// interleaved (x, y) pairs. borderType and borderValue are read from the test
+// object as well.
+template <typename T>
+void CV_Remap_Test::new_remap(const Mat &_src, Mat &_dst) {
+    const int src_channels = _src.channels();
+    const int mapx_channels = mapx.channels();
+    CV_CheckTrue(src_channels == 1 || src_channels == 3 || src_channels == 4, "");
+    CV_CheckTrue(mapx.depth() == CV_32F, "");
+    CV_CheckTrue(mapx_channels == 1 || mapx_channels == 2, "");
+    // The maps are indexed by destination pixel, so they must cover the output.
+    CV_CheckTrue(mapx.size() == _dst.size(), "");
+    if (mapx_channels == 1) {
+        // mapy is only dereferenced in the two-map layout.
+        CV_CheckTrue(mapy.type() == CV_32FC1 && mapy.size() == mapx.size(), "");
+    }
+
+    auto *srcptr_ = _src.ptr<const T>();
+    auto *dstptr_ = _dst.ptr<T>();
+    size_t srcstep = _src.step/sizeof(T), dststep = _dst.step/sizeof(T);
+    int srccols = _src.cols, srcrows = _src.rows;
+    int dstcols = _dst.cols, dstrows = _dst.rows;
+
+    T bval[] = {
+        saturate_cast<T>(borderValue[0]),
+        saturate_cast<T>(borderValue[1]),
+        saturate_cast<T>(borderValue[2]),
+        saturate_cast<T>(borderValue[3]),
+    };
+
+    // A degenerate (<= 1 pixel) axis falls back to BORDER_REPLICATE unless the
+    // border mode is CONSTANT or TRANSPARENT.
+    int borderType_x = (borderType != BORDER_CONSTANT &&
+                        borderType != BORDER_TRANSPARENT &&
+                        srccols <= 1) ? BORDER_REPLICATE : borderType;
+    int borderType_y = (borderType != BORDER_CONSTANT &&
+                        borderType != BORDER_TRANSPARENT &&
+                        srcrows <= 1) ? BORDER_REPLICATE : borderType;
+
+    for (int y = 0; y < dstrows; y++) {
+        T* dstptr = dstptr_ + y*dststep;
+        // Fetch the map rows through Mat::ptr(y) so the maps' own row stride is
+        // honoured and no flat `y * cols` offset has to be computed in int.
+        const float *mapx_row = mapx.ptr<const float>(y);
+        const float *mapy_row = mapx_channels == 1 ? mapy.ptr<const float>(y) : nullptr;
+        for (int x = 0; x < dstcols; x++) {
+            float sx, sy;
+            if (mapx_channels == 1) {
+                sx = mapx_row[x];
+                sy = mapy_row[x];
+            } else { // mapx_channels == 2: interleaved (x, y) pairs
+                sx = mapx_row[2*x];
+                sy = mapx_row[2*x+1];
+            }
+
+            if (src_channels == 3) {
+                new_linear_c3(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
+            } else if (src_channels == 4) {
+                new_linear_c4(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
+            } else {
+                new_linear_c1(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
+            }
+        }
+    }
+}
+
 void CV_Remap_Test::remap_nearest(const Mat& _src, Mat& _dst)
 {
    CV_Assert(_src.depth() == CV_32F && _dst.type() == _src.type());
@ -1042,10 +1227,6 @@ protected:
    virtual void run_func();
    virtual void run_reference_func();

-    template<int channels, typename T>
-    void newLinear(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
-                   const T *bval, int borderType_x, int borderType_y);
-
    Mat M;
 private:
    void warpAffine(const Mat&, Mat&);
@ -1105,105 +1286,6 @@ void CV_WarpAffine_Test::run_reference_func()
    tmp.convertTo(reference_dst, reference_dst.depth());
 }

-#define FETCH_PIXEL_SCALAR(cn, dy, dx) \
-    if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
-        size_t ofs = dy*srcstep + dx*cn; \
-        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr[ofs+ci];} \
-    } else if (borderType == BORDER_CONSTANT) { \
-        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = bval[ci];} \
-    } else if (borderType == BORDER_TRANSPARENT) { \
-        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = dstptr[x*cn+ci];} \
-    } else { \
-        int ix_ = borderInterpolate(ix + dx, srccols, borderType_x); \
-        int iy_ = borderInterpolate(iy + dy, srcrows, borderType_y); \
-        size_t glob_ofs = iy_*srcstep + ix_*cn; \
-        for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr_[glob_ofs+ci];} \
-    }
-
-#define WARPAFFINE_SHUFFLE(cn) \
-    if ((((unsigned)ix < (unsigned)(srccols-1)) & \
-        ((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \
-        for (int ci = 0; ci < cn; ci++) { \
-            pxy[ci] = srcptr[ci]; \
-            pxy[ci+cn] = srcptr[ci+cn]; \
-            pxy[ci+cn*2] = srcptr[srcstep+ci]; \
-            pxy[ci+cn*3] = srcptr[srcstep+ci+cn]; \
-        } \
-    } else { \
-        if ((borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT) && \
-            (((unsigned)(ix+1) >= (unsigned)(srccols+1))| \
-            ((unsigned)(iy+1) >= (unsigned)(srcrows+1))) != 0) { \
-            if (borderType == BORDER_CONSTANT) { \
-                for (int ci = 0; ci < cn; ci++) { dstptr[x*cn+ci] = bval[ci]; } \
-            } \
-            return; \
-        } \
-        FETCH_PIXEL_SCALAR(cn, 0, 0); \
-        FETCH_PIXEL_SCALAR(cn, 0, 1); \
-        FETCH_PIXEL_SCALAR(cn, 1, 0); \
-        FETCH_PIXEL_SCALAR(cn, 1, 1); \
-    }
-
-template<typename T>
-static inline void warpaffine_linear_calc(int cn, const T *pxy, T *dst, float sx, float sy)
-{
-    for (int ci = 0; ci < cn; ci++) {
-        float p00 = pxy[ci];
-        float p01 = pxy[ci+cn];
-        float p10 = pxy[ci+cn*2];
-        float p11 = pxy[ci+cn*3];
-        float v0 = p00 + sx*(p01 - p00);
-        float v1 = p10 + sx*(p11 - p10);
-        v0 += sy*(v1 - v0);
-        dst[ci] = saturate_cast<T>(v0);
-    }
-}
-template<>
-inline void warpaffine_linear_calc<float>(int cn, const float *pxy, float *dst, float sx, float sy)
-{
-    for (int ci = 0; ci < cn; ci++) {
-        float p00 = pxy[ci];
-        float p01 = pxy[ci+cn];
-        float p10 = pxy[ci+cn*2];
-        float p11 = pxy[ci+cn*3];
-        float v0 = p00 + sx*(p01 - p00);
-        float v1 = p10 + sx*(p11 - p10);
-        v0 += sy*(v1 - v0);
-        dst[ci] = v0;
-    }
-}
-
-template<int channels, typename T>
-void CV_WarpAffine_Test::newLinear(int x, float sx, float sy, const T *srcptr_, T *dstptr,
-                                   int srccols, int srcrows, size_t srcstep,
-                                   const T *bval, int borderType_x, int borderType_y)
-{
-    int ix = (int)floorf(sx), iy = (int)floorf(sy);
-    sx -= ix; sy -= iy;
-
-    T pxy[channels*4];
-    const T *srcptr = srcptr_ + srcstep*iy + ix*channels;
-
-    WARPAFFINE_SHUFFLE(channels);
-
-    warpaffine_linear_calc(channels, pxy, dstptr+x*channels, sx, sy);
-}
-template<>
-void CV_WarpAffine_Test::newLinear<3, float>(int x, float sx, float sy, const float *srcptr_, float *dstptr,
-                                          int srccols, int srcrows, size_t srcstep,
-                                          const float *bval, int borderType_x, int borderType_y)
-{
-    int ix = (int)floorf(sx), iy = (int)floorf(sy);
-    sx -= ix; sy -= iy;
-
-    float pxy[12];
-    const float *srcptr = srcptr_ + srcstep*iy + ix*3;
-
-    WARPAFFINE_SHUFFLE(3);
-
-    warpaffine_linear_calc(3, pxy, dstptr+x*3, sx, sy);
-}
-
 template<typename T>
 void CV_WarpAffine_Test::newWarpAffine(const Mat &_src, Mat &_dst, const Mat &tM)
 {
@ -1241,11 +1323,11 @@ void CV_WarpAffine_Test::newWarpAffine(const Mat &_src, Mat &_dst, const Mat &tM
            float sy = x*_M[3] + y*_M[4] + _M[5];

            if (num_channels == 3) {
-                newLinear<3>(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
+                new_linear_c3(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
            } else if (num_channels == 4) {
-                newLinear<4>(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
+                new_linear_c4(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
            } else {
-                newLinear<1>(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
+                new_linear_c1(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
            }
        }
    }
@ -1372,8 +1454,7 @@ void CV_WarpPerspective_Test::generate_test_data()

 void CV_WarpPerspective_Test::run_func()
 {
-    cv::warpPerspective(src, dst, M, dst.size(), interpolation, borderType, borderValue, cv::ALGO_HINT_APPROX);
-    // cv::warpPerspective(src, dst, M, dst.size(), interpolation, borderType, borderValue);
+    cv::warpPerspective(src, dst, M, dst.size(), interpolation, borderType, borderValue);
 }

 float CV_WarpPerspective_Test::get_success_error_level(int _interpolation, int _depth) const
@ -1426,11 +1507,11 @@ void CV_WarpPerspective_Test::newWarpPerspective(const Mat &_src, Mat &_dst, con
            float sy = (x*_M[3] + y*_M[4] + _M[5]) / w;

            if (num_channels == 3) {
-                newLinear<3>(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
+                new_linear_c3(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
            } else if (num_channels == 4) {
-                newLinear<4>(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
+                new_linear_c4(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
            } else {
-                newLinear<1>(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
+                new_linear_c1(x, sx, sy, srcptr_, dstptr, srccols, srcrows, srcstep, bval, borderType_x, borderType_y);
            }
        }
    }
--- a/modules/stitching/test/ocl/test_warpers.cpp
+++ b/modules/stitching/test/ocl/test_warpers.cpp
@ -97,7 +97,7 @@ OCL_TEST_F(SphericalWarperTest, Mat)
        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
        OCL_ON(warper->warp(usrc, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));

-        Near(1e-4);
+        Near(9.31e-4);
    }
 }

@ -118,7 +118,7 @@ OCL_TEST_F(CylindricalWarperTest, Mat)
        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
        OCL_ON(warper->warp(usrc, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));

-        Near(1e-4);
+        Near(6.5e-4);
    }
 }

@ -139,7 +139,7 @@ OCL_TEST_F(PlaneWarperTest, Mat)
        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
        OCL_ON(warper->warp(usrc, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));

-        Near(1.5e-4);
+        Near(6.6e-4);
    }
 }

@ -160,7 +160,7 @@ OCL_TEST_F(AffineWarperTest, Mat)
        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
        OCL_ON(warper->warp(usrc, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));

-        Near(1.5e-4);
+        Near(1.3e-3);
    }
 }