mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 14:13:15 +08:00
Optimize opencv dft by vectorizing radix2 and radix3.
This is useful for non-power-of-two sizes when WITH_IPP is not an option. This shows consistent improvement on OpenCV benchmarks, and we measure even larger improvements on our internal workloads. For example, for 320x480, `32FC*`, we can see a ~5% improvement, as `320=2^6*5` and `480=2^5*3*5`, so the improved radix3 version is used. `64FC*` is flat as expected, as we do not specialize the functors for `double` in this change. ``` dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, 0, false) 1.239 1.153 1.07 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, 0, true) 0.991 0.926 1.07 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_COMPLEX_OUTPUT, false) 1.367 1.281 1.07 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_COMPLEX_OUTPUT, true) 1.114 1.049 1.06 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_INVERSE, false) 1.313 1.254 1.05 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_INVERSE, true) 1.027 0.977 1.05 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 1.296 1.217 1.06 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 1.039 0.963 1.08 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_ROWS, false) 0.542 0.524 1.04 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_ROWS, true) 0.293 0.277 1.06 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_SCALE, false) 1.265 1.175 1.08 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC1, DFT_SCALE, true) 1.004 0.942 1.07 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, 0, false) 1.292 1.280 1.01 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, 0, true) 1.038 1.030 1.01 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, DFT_COMPLEX_OUTPUT, false) 1.484 1.488 1.00 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, DFT_COMPLEX_OUTPUT, true) 1.222 1.224 1.00 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, 
DFT_INVERSE, false) 1.380 1.355 1.02 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, DFT_INVERSE, true) 1.117 1.133 0.99 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 1.372 1.383 0.99 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 1.117 1.127 0.99 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, DFT_ROWS, false) 0.546 0.539 1.01 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, DFT_ROWS, true) 0.293 0.299 0.98 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, DFT_SCALE, false) 1.351 1.339 1.01 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 64FC1, DFT_SCALE, true) 1.099 1.092 1.01 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, 0, false) 2.235 2.123 1.05 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, 0, true) 1.843 1.727 1.07 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_COMPLEX_OUTPUT, false) 2.189 2.109 1.04 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_COMPLEX_OUTPUT, true) 1.827 1.754 1.04 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_INVERSE, false) 2.392 2.309 1.04 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_INVERSE, true) 1.951 1.865 1.05 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 2.391 2.293 1.04 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 1.954 1.882 1.04 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_ROWS, false) 0.811 0.815 0.99 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_ROWS, true) 0.426 0.437 0.98 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_SCALE, false) 2.268 2.152 1.05 dft::Size_MatType_FlagsType_NzeroRows::(320x480, 32FC2, DFT_SCALE, true) 1.893 1.788 1.06 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, 0, false) 4.546 4.395 1.03 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, 0, true) 3.616 
3.426 1.06 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_COMPLEX_OUTPUT, false) 4.843 4.668 1.04 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_COMPLEX_OUTPUT, true) 3.825 3.748 1.02 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_INVERSE, false) 4.720 4.525 1.04 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_INVERSE, true) 3.743 3.601 1.04 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 4.755 4.527 1.05 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 3.744 3.586 1.04 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_ROWS, false) 1.992 2.012 0.99 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_ROWS, true) 1.048 1.048 1.00 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_SCALE, false) 4.625 4.451 1.04 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC1, DFT_SCALE, true) 3.643 3.491 1.04 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, 0, false) 4.499 4.488 1.00 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, 0, true) 3.559 3.555 1.00 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_COMPLEX_OUTPUT, false) 5.155 5.165 1.00 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_COMPLEX_OUTPUT, true) 4.103 4.101 1.00 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_INVERSE, false) 5.484 5.474 1.00 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_INVERSE, true) 4.617 4.518 1.02 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 5.547 5.509 1.01 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 4.553 4.554 1.00 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_ROWS, false) 2.067 2.018 1.02 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_ROWS, true) 1.104 1.079 1.02 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_SCALE, 
false) 4.665 4.619 1.01 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 64FC1, DFT_SCALE, true) 3.698 3.681 1.00 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, 0, false) 8.774 8.275 1.06 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, 0, true) 6.975 6.527 1.07 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_COMPLEX_OUTPUT, false) 8.720 8.270 1.05 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_COMPLEX_OUTPUT, true) 6.928 6.532 1.06 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_INVERSE, false) 9.272 8.862 1.05 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_INVERSE, true) 7.323 6.946 1.05 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 9.262 8.768 1.06 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 7.298 6.871 1.06 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_ROWS, false) 3.766 3.639 1.03 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_ROWS, true) 1.932 1.889 1.02 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_SCALE, false) 8.865 8.417 1.05 dft::Size_MatType_FlagsType_NzeroRows::(800x600, 32FC2, DFT_SCALE, true) 7.067 6.643 1.06 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, 0, false) 10.014 10.141 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, 0, true) 7.600 7.632 1.00 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, DFT_COMPLEX_OUTPUT, false) 11.059 11.283 0.98 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, DFT_COMPLEX_OUTPUT, true) 8.475 8.552 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, DFT_INVERSE, false) 12.678 12.789 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, DFT_INVERSE, true) 10.445 10.359 1.01 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 12.626 12.925 0.98 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, 
DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 10.538 10.553 1.00 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, DFT_ROWS, false) 5.041 5.084 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, DFT_ROWS, true) 2.595 2.607 1.00 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, DFT_SCALE, false) 10.231 10.330 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC1, DFT_SCALE, true) 7.786 7.815 1.00 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, 0, false) 13.597 13.302 1.02 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, 0, true) 10.377 10.207 1.02 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_COMPLEX_OUTPUT, false) 15.940 15.545 1.03 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_COMPLEX_OUTPUT, true) 12.299 12.230 1.01 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_INVERSE, false) 15.270 15.181 1.01 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_INVERSE, true) 12.757 12.339 1.03 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 15.512 15.157 1.02 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 12.505 12.635 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_ROWS, false) 6.359 6.255 1.02 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_ROWS, true) 3.314 3.248 1.02 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_SCALE, false) 13.937 13.733 1.01 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 64FC1, DFT_SCALE, true) 10.782 10.495 1.03 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, 0, false) 18.985 18.926 1.00 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, 0, true) 14.256 14.509 0.98 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_COMPLEX_OUTPUT, false) 18.696 19.021 0.98 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_COMPLEX_OUTPUT, true) 14.290 14.429 0.99 
dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_INVERSE, false) 20.135 20.296 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_INVERSE, true) 15.390 15.512 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 20.121 20.354 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 15.341 15.605 0.98 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_ROWS, false) 8.932 9.084 0.98 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_ROWS, true) 4.539 4.649 0.98 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_SCALE, false) 19.137 19.303 0.99 dft::Size_MatType_FlagsType_NzeroRows::(1280x1024, 32FC2, DFT_SCALE, true) 14.565 14.808 0.98 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, 0, false) 22.553 21.171 1.07 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, 0, true) 17.850 16.390 1.09 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_COMPLEX_OUTPUT, false) 24.062 22.634 1.06 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_COMPLEX_OUTPUT, true) 19.342 17.932 1.08 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_INVERSE, false) 28.609 27.326 1.05 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_INVERSE, true) 24.591 23.289 1.06 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 28.667 27.467 1.04 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 24.671 23.309 1.06 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_ROWS, false) 9.458 9.077 1.04 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_ROWS, true) 4.709 4.566 1.03 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_SCALE, false) 22.791 21.583 1.06 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC1, DFT_SCALE, true) 18.029 16.691 1.08 
dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, 0, false) 25.238 24.427 1.03 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, 0, true) 19.636 19.270 1.02 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_COMPLEX_OUTPUT, false) 28.342 27.957 1.01 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_COMPLEX_OUTPUT, true) 22.413 22.477 1.00 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_INVERSE, false) 26.465 26.085 1.01 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_INVERSE, true) 21.972 21.704 1.01 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 26.497 26.127 1.01 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 22.010 21.523 1.02 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_ROWS, false) 11.188 10.774 1.04 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_ROWS, true) 6.094 5.916 1.03 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_SCALE, false) 25.728 24.934 1.03 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 64FC1, DFT_SCALE, true) 20.077 19.653 1.02 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, 0, false) 43.834 40.726 1.08 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, 0, true) 35.198 32.218 1.09 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_COMPLEX_OUTPUT, false) 43.743 40.897 1.07 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_COMPLEX_OUTPUT, true) 35.240 32.226 1.09 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_INVERSE, false) 46.022 42.612 1.08 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_INVERSE, true) 36.779 33.961 1.08 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 46.396 42.723 1.09 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 37.025 33.874 1.09 
dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_ROWS, false) 17.334 16.832 1.03 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_ROWS, true) 9.212 8.970 1.03 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_SCALE, false) 44.190 41.211 1.07 dft::Size_MatType_FlagsType_NzeroRows::(1920x1080, 32FC2, DFT_SCALE, true) 35.900 32.888 1.09 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, 0, false) 40.948 38.256 1.07 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, 0, true) 33.825 30.759 1.10 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_COMPLEX_OUTPUT, false) 53.210 53.584 0.99 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_COMPLEX_OUTPUT, true) 46.356 46.712 0.99 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_INVERSE, false) 47.471 47.213 1.01 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_INVERSE, true) 40.491 41.363 0.98 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 46.724 47.049 0.99 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 40.834 41.381 0.99 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_ROWS, false) 14.508 14.490 1.00 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_ROWS, true) 7.832 7.828 1.00 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_SCALE, false) 41.491 38.341 1.08 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC1, DFT_SCALE, true) 34.587 31.208 1.11 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, 0, false) 65.155 63.173 1.03 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, 0, true) 56.091 54.752 1.02 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, DFT_COMPLEX_OUTPUT, false) 71.549 70.626 1.01 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, DFT_COMPLEX_OUTPUT, true) 62.319 61.437 1.01 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, 
DFT_INVERSE, false) 61.480 59.540 1.03 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, DFT_INVERSE, true) 54.047 52.650 1.03 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 61.752 61.366 1.01 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 54.400 53.665 1.01 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, DFT_ROWS, false) 20.219 19.704 1.03 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, DFT_ROWS, true) 11.145 10.868 1.03 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, DFT_SCALE, false) 66.220 64.525 1.03 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 64FC1, DFT_SCALE, true) 57.389 56.114 1.02 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, 0, false) 86.761 88.128 0.98 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, 0, true) 75.528 76.725 0.98 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_COMPLEX_OUTPUT, false) 86.750 88.223 0.98 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_COMPLEX_OUTPUT, true) 75.830 76.809 0.99 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_INVERSE, false) 91.728 92.161 1.00 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_INVERSE, true) 78.797 79.876 0.99 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, false) 92.163 92.177 1.00 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_INVERSE|DFT_COMPLEX_OUTPUT, true) 78.957 79.863 0.99 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_ROWS, false) 24.781 25.576 0.97 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_ROWS, true) 13.226 13.695 0.97 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_SCALE, false) 87.990 89.324 0.99 dft::Size_MatType_FlagsType_NzeroRows::(2048x2048, 32FC2, DFT_SCALE, true) 76.732 77.869 0.99 ```
This commit is contained in:
parent
68f527267b
commit
da555a2c9b
@ -122,6 +122,33 @@ static const double DFTTab[][2] =
|
|||||||
{ 1.00000000000000000, 0.00000000292583616 }
|
{ 1.00000000000000000, 0.00000000292583616 }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
// Constants<T>: numeric constants for the radix-3 and radix-5 butterflies,
// each stored converted to the working type T. The out-of-class definitions
// of the static members follow the struct.
template <typename T>
|
||||||
|
struct Constants {
|
||||||
|
// Multiplier used by the radix-3 butterflies (DFT_R3 / DFT_VecR3<float>).
static const T sin_120;
|
||||||
|
// fft5_2 .. fft5_5: multipliers used by the radix-5 butterfly (DFT_R5).
static const T fft5_2;
|
||||||
|
static const T fft5_3;
|
||||||
|
static const T fft5_4;
|
||||||
|
static const T fft5_5;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Numerically sqrt(3)/2, i.e. sin(120 degrees).
template <typename T>
|
||||||
|
const T Constants<T>::sin_120 = (T)0.86602540378443864676372317075294;
|
||||||
|
|
||||||
|
// Numerically sqrt(5)/4.
template <typename T>
|
||||||
|
const T Constants<T>::fft5_2 = (T)0.559016994374947424102293417182819;
|
||||||
|
|
||||||
|
// Numerically -sin(72 degrees).
template <typename T>
|
||||||
|
const T Constants<T>::fft5_3 = (T)-0.951056516295153572116439333379382;
|
||||||
|
|
||||||
|
// Numerically -(sin(72 degrees) + sin(36 degrees)).
template <typename T>
|
||||||
|
const T Constants<T>::fft5_4 = (T)-1.538841768587626701285145288018455;
|
||||||
|
|
||||||
|
// Numerically sin(72 degrees) - sin(36 degrees).
template <typename T>
|
||||||
|
const T Constants<T>::fft5_5 = (T)0.363271264002680442947733378740309;
|
||||||
|
|
||||||
|
} //namespace
|
||||||
|
|
||||||
#define BitRev(i,shift) \
|
#define BitRev(i,shift) \
|
||||||
((int)((((unsigned)bitrevTab[(i)&255] << 24)+ \
|
((int)((((unsigned)bitrevTab[(i)&255] << 24)+ \
|
||||||
((unsigned)bitrevTab[((i)>> 8)&255] << 16)+ \
|
((unsigned)bitrevTab[((i)>> 8)&255] << 16)+ \
|
||||||
@ -372,6 +399,149 @@ DFTInit( int n0, int nf, const int* factors, int* itab, int elem_size, void* _wa
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reference radix-2 implementation.
// Scalar functor applying one radix-2 butterfly pass in place on dst.
//   dst  - complex array, read and overwritten in place
//   c_n  - upper bound of the outer loop (elements of dst covered by the pass)
//   n    - size of one butterfly group; each group is split in halves of nx = n/2
//   dw0  - step through the wave table between consecutive j
//   wave - table of complex factors; element j of the upper half is multiplied
//          by wave[j*dw0] before being combined with the lower half
|
||||||
|
template<typename T> struct DFT_R2
|
||||||
|
{
|
||||||
|
void operator()(Complex<T>* dst, const int c_n, const int n, const int dw0, const Complex<T>* wave) const {
|
||||||
|
const int nx = n/2;
|
||||||
|
for(int i = 0 ; i < c_n; i += n)
|
||||||
|
{
|
||||||
|
// j == 0 is handled separately: no multiplication by a wave entry,
// just the sum and difference of the two halves' first elements.
Complex<T>* v = dst + i;
|
||||||
|
T r0 = v[0].re + v[nx].re;
|
||||||
|
T i0 = v[0].im + v[nx].im;
|
||||||
|
T r1 = v[0].re - v[nx].re;
|
||||||
|
T i1 = v[0].im - v[nx].im;
|
||||||
|
v[0].re = r0; v[0].im = i0;
|
||||||
|
v[nx].re = r1; v[nx].im = i1;
|
||||||
|
|
||||||
|
// Remaining elements: dw advances by dw0 per j, so dw == j*dw0.
for( int j = 1, dw = dw0; j < nx; j++, dw += dw0 )
|
||||||
|
{
|
||||||
|
v = dst + i + j;
|
||||||
|
// (r1, i1) = v[nx] * wave[dw] (complex product).
r1 = v[nx].re*wave[dw].re - v[nx].im*wave[dw].im;
|
||||||
|
i1 = v[nx].im*wave[dw].re + v[nx].re*wave[dw].im;
|
||||||
|
r0 = v[0].re; i0 = v[0].im;
|
||||||
|
|
||||||
|
// Butterfly: lower half gets the sum, upper half the difference.
v[0].re = r0 + r1; v[0].im = i0 + i1;
|
||||||
|
v[nx].re = r0 - r1; v[nx].im = i0 - i1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Reference radix-3 implementation.
// Scalar functor applying one radix-3 butterfly pass in place on dst.
//   dst  - complex array, read and overwritten in place
//   c_n  - upper bound of the outer loop (elements of dst covered by the pass)
//   n    - size of one butterfly group; each group is split in thirds of nx = n/3
//   dw0  - step through the wave table between consecutive j
//   wave - table of complex factors; v[nx] is multiplied by wave[dw] and
//          v[2*nx] by wave[2*dw], with dw == j*dw0
|
||||||
|
template<typename T> struct DFT_R3
|
||||||
|
{
|
||||||
|
void operator()(Complex<T>* dst, const int c_n, const int n, const int dw0, const Complex<T>* wave) const {
|
||||||
|
const int nx = n / 3;
|
||||||
|
for(int i = 0; i < c_n; i += n )
|
||||||
|
{
|
||||||
|
// j == 0 is handled separately: the inputs are used as-is, without
// multiplication by a wave entry.
{
|
||||||
|
Complex<T>* v = dst + i;
|
||||||
|
// (r1, i1) = v[nx] + v[2*nx]
T r1 = v[nx].re + v[nx*2].re;
|
||||||
|
T i1 = v[nx].im + v[nx*2].im;
|
||||||
|
T r0 = v[0].re;
|
||||||
|
T i0 = v[0].im;
|
||||||
|
// (r2, i2) = sin_120 * (im1 - im2, re2 - re1): the difference of the two
// inputs with re/im swapped and signs arranged as shown.
T r2 = Constants<T>::sin_120*(v[nx].im - v[nx*2].im);
|
||||||
|
T i2 = Constants<T>::sin_120*(v[nx*2].re - v[nx].re);
|
||||||
|
v[0].re = r0 + r1; v[0].im = i0 + i1;
|
||||||
|
// (r0, i0) = v[0] - 0.5*(v[nx] + v[2*nx]), shared by the two remaining outputs.
r0 -= (T)0.5*r1; i0 -= (T)0.5*i1;
|
||||||
|
v[nx].re = r0 + r2; v[nx].im = i0 + i2;
|
||||||
|
v[nx*2].re = r0 - r2; v[nx*2].im = i0 - i2;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int j = 1, dw = dw0; j < nx; j++, dw += dw0 )
|
||||||
|
{
|
||||||
|
Complex<T>* v = dst + i + j;
|
||||||
|
// (r0, i0) = v[nx] * wave[dw] (complex product).
T r0 = v[nx].re*wave[dw].re - v[nx].im*wave[dw].im;
|
||||||
|
T i0 = v[nx].re*wave[dw].im + v[nx].im*wave[dw].re;
|
||||||
|
// NOTE: names are swapped here - i2 holds the REAL part and r2 the
// IMAGINARY part of v[2*nx] * wave[2*dw].
T i2 = v[nx*2].re*wave[dw*2].re - v[nx*2].im*wave[dw*2].im;
|
||||||
|
T r2 = v[nx*2].re*wave[dw*2].im + v[nx*2].im*wave[dw*2].re;
|
||||||
|
// (r1, i1) = sum of the two products.
T r1 = r0 + i2; T i1 = i0 + r2;
|
||||||
|
|
||||||
|
// After this line r2/i2 have their usual meaning again:
// r2 = sin_120*(im of first product - im of second product),
// i2 = sin_120*(re of second product - re of first product).
r2 = Constants<T>::sin_120*(i0 - r2); i2 = Constants<T>::sin_120*(i2 - r0);
|
||||||
|
r0 = v[0].re; i0 = v[0].im;
|
||||||
|
v[0].re = r0 + r1; v[0].im = i0 + i1;
|
||||||
|
r0 -= (T)0.5*r1; i0 -= (T)0.5*i1;
|
||||||
|
v[nx].re = r0 + r2; v[nx].im = i0 + i2;
|
||||||
|
v[nx*2].re = r0 - r2; v[nx*2].im = i0 - i2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Reference radix-5 implementation.
// Scalar functor applying one radix-5 butterfly pass in place on dst.
//   dst  - complex array, read and overwritten in place
//   c_n  - upper bound of the outer loop (elements of dst covered by the pass)
//   n    - size of one butterfly group; each group is split in fifths of nx = n/5
//   dw0  - step through the wave table between consecutive j
//   wave - table of complex factors; the inputs at offsets nx, 2*nx, 3*nx, 4*nx
//          are multiplied by wave[dw], wave[2*dw], wave[3*dw], wave[4*dw]
// Unlike DFT_R2/DFT_R3, j starts at 0 (with dw == 0), so the first element is
// not special-cased and goes through the same multiplications.
|
||||||
|
template<typename T> struct DFT_R5
|
||||||
|
{
|
||||||
|
void operator()(Complex<T>* dst, const int c_n, const int n, const int dw0, const Complex<T>* wave) const {
|
||||||
|
const int nx = n / 5;
|
||||||
|
for(int i = 0; i < c_n; i += n )
|
||||||
|
{
|
||||||
|
for(int j = 0, dw = 0; j < nx; j++, dw += dw0 )
|
||||||
|
{
|
||||||
|
// The five inputs/outputs are v0[0], v0[nx], v1[0], v1[nx], v2[0],
// i.e. offsets 0, nx, 2*nx, 3*nx, 4*nx from dst + i + j.
Complex<T>* v0 = dst + i + j;
|
||||||
|
Complex<T>* v1 = v0 + nx*2;
|
||||||
|
Complex<T>* v2 = v1 + nx*2;
|
||||||
|
|
||||||
|
T r0, i0, r1, i1, r2, i2, r3, i3, r4, i4, r5, i5;
|
||||||
|
|
||||||
|
// (r3, i3) = input[nx] * wave[dw]; (r2, i2) = input[4*nx] * wave[4*dw].
r3 = v0[nx].re*wave[dw].re - v0[nx].im*wave[dw].im;
|
||||||
|
i3 = v0[nx].re*wave[dw].im + v0[nx].im*wave[dw].re;
|
||||||
|
r2 = v2[0].re*wave[dw*4].re - v2[0].im*wave[dw*4].im;
|
||||||
|
i2 = v2[0].re*wave[dw*4].im + v2[0].im*wave[dw*4].re;
|
||||||
|
|
||||||
|
// Sum (r1, i1) and difference (r3, i3) of the pair above.
r1 = r3 + r2; i1 = i3 + i2;
|
||||||
|
r3 -= r2; i3 -= i2;
|
||||||
|
|
||||||
|
// (r4, i4) = input[3*nx] * wave[3*dw]; (r0, i0) = input[2*nx] * wave[2*dw].
r4 = v1[nx].re*wave[dw*3].re - v1[nx].im*wave[dw*3].im;
|
||||||
|
i4 = v1[nx].re*wave[dw*3].im + v1[nx].im*wave[dw*3].re;
|
||||||
|
r0 = v1[0].re*wave[dw*2].re - v1[0].im*wave[dw*2].im;
|
||||||
|
i0 = v1[0].re*wave[dw*2].im + v1[0].im*wave[dw*2].re;
|
||||||
|
|
||||||
|
// Sum (r2, i2) and difference (r4, i4) of the second pair.
r2 = r4 + r0; i2 = i4 + i0;
|
||||||
|
r4 -= r0; i4 -= i0;
|
||||||
|
|
||||||
|
// (r5, i5) = sum of all four products; output 0 = input 0 + that sum.
r0 = v0[0].re; i0 = v0[0].im;
|
||||||
|
r5 = r1 + r2; i5 = i1 + i2;
|
||||||
|
|
||||||
|
v0[0].re = r0 + r5; v0[0].im = i0 + i5;
|
||||||
|
|
||||||
|
// Combine the partial sums/differences with the fft5_* multipliers.
r0 -= (T)0.25*r5; i0 -= (T)0.25*i5;
|
||||||
|
r1 = Constants<T>::fft5_2*(r1 - r2); i1 = Constants<T>::fft5_2*(i1 - i2);
|
||||||
|
r2 = -Constants<T>::fft5_3*(i3 + i4); i2 = Constants<T>::fft5_3*(r3 + r4);
|
||||||
|
|
||||||
|
i3 *= -Constants<T>::fft5_5; r3 *= Constants<T>::fft5_5;
|
||||||
|
i4 *= -Constants<T>::fft5_4; r4 *= Constants<T>::fft5_4;
|
||||||
|
|
||||||
|
// Note the deliberate re/im cross-use: r5 takes i3 and i5 takes r3
// (likewise r2/i4 and i2/r4 on the next line).
r5 = r2 + i3; i5 = i2 + r3;
|
||||||
|
r2 -= i4; i2 -= r4;
|
||||||
|
|
||||||
|
r3 = r0 + r1; i3 = i0 + i1;
|
||||||
|
r0 -= r1; i0 -= i1;
|
||||||
|
|
||||||
|
// Outputs at offsets nx and 4*nx.
v0[nx].re = r3 + r2; v0[nx].im = i3 + i2;
|
||||||
|
v2[0].re = r3 - r2; v2[0].im = i3 - i2;
|
||||||
|
|
||||||
|
// Outputs at offsets 2*nx and 3*nx.
v1[0].re = r0 + r5; v1[0].im = i0 + i5;
|
||||||
|
v1[nx].re = r0 - r5; v1[nx].im = i0 - i5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Generic (non-specialized) DFT_VecR2: simply forwards to the scalar
// reference functor DFT_R2<T> with the same arguments.
template<typename T> struct DFT_VecR2
|
||||||
|
{
|
||||||
|
void operator()(Complex<T>* dst, const int c_n, const int n, const int dw0, const Complex<T>* wave) const {
|
||||||
|
// Returning a void expression from a void function is valid C++.
return DFT_R2<T>()(dst, c_n, n, dw0, wave);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Generic (non-specialized) DFT_VecR3: simply forwards to the scalar
// reference functor DFT_R3<T> with the same arguments.
template<typename T> struct DFT_VecR3
|
||||||
|
{
|
||||||
|
void operator()(Complex<T>* dst, const int c_n, const int n, const int dw0, const Complex<T>* wave) const {
|
||||||
|
// Returning a void expression from a void function is valid C++.
return DFT_R3<T>()(dst, c_n, n, dw0, wave);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template<typename T> struct DFT_VecR4
|
template<typename T> struct DFT_VecR4
|
||||||
{
|
{
|
||||||
int operator()(Complex<T>*, int, int, int&, const Complex<T>*) const { return 1; }
|
int operator()(Complex<T>*, int, int, int&, const Complex<T>*) const { return 1; }
|
||||||
@ -379,6 +549,98 @@ template<typename T> struct DFT_VecR4
|
|||||||
|
|
||||||
#if CV_SSE3
|
#if CV_SSE3
|
||||||
|
|
||||||
|
// multiplies *a and *b:
|
||||||
|
// r_re + i*r_im = (a_re + i*a_im)*(b_re + i*b_im)
|
||||||
|
// r_re and r_im are placed respectively in bits 31:0 and 63:32 of the resulting
|
||||||
|
// vector register.
// The upper 64 bits of the result are zero (they come from the horizontal add
// of the all-zero register z).
|
||||||
|
inline __m128 complexMul(const Complex<float>* const a, const Complex<float>* const b) {
|
||||||
|
const __m128 z = _mm_setzero_ps();
|
||||||
|
// Sign-bit mask for lane 0 only; XOR with it negates lane 0.
const __m128 neg_elem0 = _mm_set_ps(0.0f,0.0f,0.0f,-0.0f);
|
||||||
|
// v_a[31:0] is a->re and v_a[63:32] is a->im.
// The same layout holds for v_b; the upper 64 bits of both are taken from z (zero).
|
||||||
|
const __m128 v_a = _mm_loadl_pi(z, (const __m64*)a);
|
||||||
|
const __m128 v_b = _mm_loadl_pi(z, (const __m64*)b);
|
||||||
|
// Arrange the operands so that one vector multiply yields all four partial
// products. Lane order (lane 0 first):
//   v_a_riri = (a_im, a_re, a_im, a_re)   NOTE: despite the name, lane 0 is a_im
//   v_b_irri = (b_im, b_re, b_re, b_im)
|
||||||
|
const __m128 v_a_riri = _mm_shuffle_ps(v_a, v_a, _MM_SHUFFLE(0, 1, 0, 1));
|
||||||
|
const __m128 v_b_irri = _mm_shuffle_ps(v_b, v_b, _MM_SHUFFLE(1, 0, 0, 1));
|
||||||
|
// mul = (a_im*b_im, a_re*b_re, a_im*b_re, a_re*b_im)
const __m128 mul = _mm_mul_ps(v_a_riri, v_b_irri);
|
||||||
|
// Negate lane 0 so the pairwise add below computes a_re*b_re - a_im*b_im.
const __m128 xored = _mm_xor_ps(mul, neg_elem0);
|
||||||
|
// Horizontal add of adjacent lanes:
//   lane 0 = a_re*b_re - a_im*b_im, lane 1 = a_im*b_re + a_re*b_im.
return _mm_hadd_ps(xored, z);
|
||||||
|
}
|
||||||
|
|
||||||
|
// optimized radix-2 transform
// float specialization of DFT_VecR2 that uses SSE intrinsics (via complexMul)
// for the j >= 1 butterflies. Same parameters and in-place update pattern as
// the scalar DFT_R2: groups of n elements up to c_n, halves of nx = n/2,
// v[nx] multiplied by wave[j*dw0].
|
||||||
|
template<> struct DFT_VecR2<float> {
|
||||||
|
void operator()(Complex<float>* dst, const int c_n, const int n, const int dw0, const Complex<float>* wave) const {
|
||||||
|
const __m128 z = _mm_setzero_ps();
|
||||||
|
const int nx = n/2;
|
||||||
|
for(int i = 0 ; i < c_n; i += n)
|
||||||
|
{
|
||||||
|
// j == 0: plain scalar sum/difference, no multiplication by a wave entry.
{
|
||||||
|
Complex<float>* v = dst + i;
|
||||||
|
float r0 = v[0].re + v[nx].re;
|
||||||
|
float i0 = v[0].im + v[nx].im;
|
||||||
|
float r1 = v[0].re - v[nx].re;
|
||||||
|
float i1 = v[0].im - v[nx].im;
|
||||||
|
v[0].re = r0; v[0].im = i0;
|
||||||
|
v[nx].re = r1; v[nx].im = i1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for( int j = 1, dw = dw0; j < nx; j++, dw += dw0 )
|
||||||
|
{
|
||||||
|
Complex<float>* v = dst + i + j;
|
||||||
|
// x_1 = v[nx] * wave[dw], with (re, im) in the two low lanes.
const __m128 x_1 = complexMul(&v[nx], &wave[dw]);
|
||||||
|
// Load v[0] as (re, im) into the two low lanes; upper lanes come from z.
const __m128 v_0 = _mm_loadl_pi(z, (const __m64*)&v[0]);
|
||||||
|
// Only the low 64 bits (one complex value) are stored back.
_mm_storel_pi((__m64*)&v[0], _mm_add_ps(v_0, x_1));
|
||||||
|
_mm_storel_pi((__m64*)&v[nx], _mm_sub_ps(v_0, x_1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Optimized radix-3 implementation.
// float specialization of DFT_VecR3 that uses SSE intrinsics (via complexMul)
// for the j >= 1 butterflies. Same parameters and in-place update pattern as
// the scalar DFT_R3: groups of n elements up to c_n, thirds of nx = n/3,
// v[nx] multiplied by wave[dw] and v[2*nx] by wave[2*dw], dw == j*dw0.
|
||||||
|
template<> struct DFT_VecR3<float> {
|
||||||
|
void operator()(Complex<float>* dst, const int c_n, const int n, const int dw0, const Complex<float>* wave) const {
|
||||||
|
const int nx = n / 3;
|
||||||
|
const __m128 z = _mm_setzero_ps();
|
||||||
|
// Sign-bit mask for lane 1 only; XOR with it negates lane 1 (the imaginary lane).
const __m128 neg_elem1 = _mm_set_ps(0.0f,0.0f,-0.0f,0.0f);
|
||||||
|
const __m128 sin_120 = _mm_set1_ps(Constants<float>::sin_120);
|
||||||
|
const __m128 one_half = _mm_set1_ps(0.5f);
|
||||||
|
for(int i = 0; i < c_n; i += n )
|
||||||
|
{
|
||||||
|
// j == 0: scalar path, inputs used as-is without multiplication by a wave entry.
{
|
||||||
|
Complex<float>* v = dst + i;
|
||||||
|
|
||||||
|
float r1 = v[nx].re + v[nx*2].re;
|
||||||
|
float i1 = v[nx].im + v[nx*2].im;
|
||||||
|
float r0 = v[0].re;
|
||||||
|
float i0 = v[0].im;
|
||||||
|
float r2 = Constants<float>::sin_120*(v[nx].im - v[nx*2].im);
|
||||||
|
float i2 = Constants<float>::sin_120*(v[nx*2].re - v[nx].re);
|
||||||
|
v[0].re = r0 + r1; v[0].im = i0 + i1;
|
||||||
|
r0 -= (float)0.5*r1; i0 -= (float)0.5*i1;
|
||||||
|
v[nx].re = r0 + r2; v[nx].im = i0 + i2;
|
||||||
|
v[nx*2].re = r0 - r2; v[nx*2].im = i0 - i2;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int j = 1, dw = dw0; j < nx; j++, dw += dw0 )
|
||||||
|
{
|
||||||
|
Complex<float>* v = dst + i + j;
|
||||||
|
// x_0 = v[nx] * wave[dw], x_2 = v[2*nx] * wave[2*dw], x_1 = their sum;
// each holds (re, im) in the two low lanes.
const __m128 x_0 = complexMul(&v[nx], &wave[dw]);
|
||||||
|
const __m128 x_2 = complexMul(&v[nx*2], &wave[dw*2]);
|
||||||
|
const __m128 x_1 = _mm_add_ps(x_0, x_2);
|
||||||
|
|
||||||
|
// v_0 is loaded before v[0] is overwritten; it is reused for x_4 below.
const __m128 v_0 = _mm_loadl_pi(z, (const __m64*)&v[0]);
|
||||||
|
_mm_storel_pi((__m64*)&v[0], _mm_add_ps(v_0, x_1));
|
||||||
|
|
||||||
|
// x_3 = sin_120 * (re2 - re0, im0 - im2): difference with lane 1 negated.
const __m128 x_3 = _mm_mul_ps(sin_120, _mm_xor_ps(_mm_sub_ps(x_2, x_0), neg_elem1));
|
||||||
|
// Swap lanes 0 and 1 so x_3s = sin_120 * (im0 - im2, re2 - re0),
// matching (r2, i2) of the scalar DFT_R3.
const __m128 x_3s = _mm_shuffle_ps(x_3, x_3, _MM_SHUFFLE(0, 1, 0, 1));
|
||||||
|
// x_4 = v[0] - 0.5 * (x_0 + x_2), shared by the two remaining outputs.
const __m128 x_4 = _mm_sub_ps(v_0, _mm_mul_ps(one_half, x_1));
|
||||||
|
// Only the low 64 bits (one complex value) are stored back.
_mm_storel_pi((__m64*)&v[nx], _mm_add_ps(x_4, x_3s));
|
||||||
|
_mm_storel_pi((__m64*)&v[nx*2], _mm_sub_ps(x_4, x_3s));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// optimized radix-4 transform
|
// optimized radix-4 transform
|
||||||
template<> struct DFT_VecR4<float>
|
template<> struct DFT_VecR4<float>
|
||||||
{
|
{
|
||||||
@ -573,12 +835,6 @@ struct OcvDftOptions {
|
|||||||
template<typename T> static void
|
template<typename T> static void
|
||||||
DFT(const OcvDftOptions & c, const Complex<T>* src, Complex<T>* dst)
|
DFT(const OcvDftOptions & c, const Complex<T>* src, Complex<T>* dst)
|
||||||
{
|
{
|
||||||
static const T sin_120 = (T)0.86602540378443864676372317075294;
|
|
||||||
static const T fft5_2 = (T)0.559016994374947424102293417182819;
|
|
||||||
static const T fft5_3 = (T)-0.951056516295153572116439333379382;
|
|
||||||
static const T fft5_4 = (T)-1.538841768587626701285145288018455;
|
|
||||||
static const T fft5_5 = (T)0.363271264002680442947733378740309;
|
|
||||||
|
|
||||||
const Complex<T>* wave = (Complex<T>*)c.wave;
|
const Complex<T>* wave = (Complex<T>*)c.wave;
|
||||||
const int * itab = c.itab;
|
const int * itab = c.itab;
|
||||||
|
|
||||||
@ -775,30 +1031,18 @@ DFT(const OcvDftOptions & c, const Complex<T>* src, Complex<T>* dst)
|
|||||||
for( ; n < c.factors[0]; )
|
for( ; n < c.factors[0]; )
|
||||||
{
|
{
|
||||||
// do the remaining radix-2 transform
|
// do the remaining radix-2 transform
|
||||||
nx = n;
|
|
||||||
n *= 2;
|
n *= 2;
|
||||||
dw0 /= 2;
|
dw0 /= 2;
|
||||||
|
|
||||||
for( i = 0; i < c.n; i += n )
|
if(c.haveSSE3)
|
||||||
{
|
{
|
||||||
Complex<T>* v = dst + i;
|
DFT_VecR2<T> vr2;
|
||||||
T r0 = v[0].re + v[nx].re;
|
vr2(dst, c.n, n, dw0, wave);
|
||||||
T i0 = v[0].im + v[nx].im;
|
}
|
||||||
T r1 = v[0].re - v[nx].re;
|
else
|
||||||
T i1 = v[0].im - v[nx].im;
|
{
|
||||||
v[0].re = r0; v[0].im = i0;
|
DFT_R2<T> vr2;
|
||||||
v[nx].re = r1; v[nx].im = i1;
|
vr2(dst, c.n, n, dw0, wave);
|
||||||
|
|
||||||
for( j = 1, dw = dw0; j < nx; j++, dw += dw0 )
|
|
||||||
{
|
|
||||||
v = dst + i + j;
|
|
||||||
r1 = v[nx].re*wave[dw].re - v[nx].im*wave[dw].im;
|
|
||||||
i1 = v[nx].im*wave[dw].re + v[nx].re*wave[dw].im;
|
|
||||||
r0 = v[0].re; i0 = v[0].im;
|
|
||||||
|
|
||||||
v[0].re = r0 + r1; v[0].im = i0 + i1;
|
|
||||||
v[nx].re = r0 - r1; v[nx].im = i0 - i1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -813,94 +1057,21 @@ DFT(const OcvDftOptions & c, const Complex<T>* src, Complex<T>* dst)
|
|||||||
|
|
||||||
if( factor == 3 )
|
if( factor == 3 )
|
||||||
{
|
{
|
||||||
// radix-3
|
if(c.haveSSE3)
|
||||||
for( i = 0; i < c.n; i += n )
|
|
||||||
{
|
{
|
||||||
Complex<T>* v = dst + i;
|
DFT_VecR3<T> vr3;
|
||||||
|
vr3(dst, c.n, n, dw0, wave);
|
||||||
T r1 = v[nx].re + v[nx*2].re;
|
}
|
||||||
T i1 = v[nx].im + v[nx*2].im;
|
else
|
||||||
T r0 = v[0].re;
|
{
|
||||||
T i0 = v[0].im;
|
DFT_R3<T> vr3;
|
||||||
T r2 = sin_120*(v[nx].im - v[nx*2].im);
|
vr3(dst, c.n, n, dw0, wave);
|
||||||
T i2 = sin_120*(v[nx*2].re - v[nx].re);
|
|
||||||
v[0].re = r0 + r1; v[0].im = i0 + i1;
|
|
||||||
r0 -= (T)0.5*r1; i0 -= (T)0.5*i1;
|
|
||||||
v[nx].re = r0 + r2; v[nx].im = i0 + i2;
|
|
||||||
v[nx*2].re = r0 - r2; v[nx*2].im = i0 - i2;
|
|
||||||
|
|
||||||
for( j = 1, dw = dw0; j < nx; j++, dw += dw0 )
|
|
||||||
{
|
|
||||||
v = dst + i + j;
|
|
||||||
r0 = v[nx].re*wave[dw].re - v[nx].im*wave[dw].im;
|
|
||||||
i0 = v[nx].re*wave[dw].im + v[nx].im*wave[dw].re;
|
|
||||||
i2 = v[nx*2].re*wave[dw*2].re - v[nx*2].im*wave[dw*2].im;
|
|
||||||
r2 = v[nx*2].re*wave[dw*2].im + v[nx*2].im*wave[dw*2].re;
|
|
||||||
r1 = r0 + i2; i1 = i0 + r2;
|
|
||||||
|
|
||||||
r2 = sin_120*(i0 - r2); i2 = sin_120*(i2 - r0);
|
|
||||||
r0 = v[0].re; i0 = v[0].im;
|
|
||||||
v[0].re = r0 + r1; v[0].im = i0 + i1;
|
|
||||||
r0 -= (T)0.5*r1; i0 -= (T)0.5*i1;
|
|
||||||
v[nx].re = r0 + r2; v[nx].im = i0 + i2;
|
|
||||||
v[nx*2].re = r0 - r2; v[nx*2].im = i0 - i2;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if( factor == 5 )
|
else if( factor == 5 )
|
||||||
{
|
{
|
||||||
// radix-5
|
DFT_R5<T> vr5;
|
||||||
for( i = 0; i < c.n; i += n )
|
vr5(dst, c.n, n, dw0, wave);
|
||||||
{
|
|
||||||
for( j = 0, dw = 0; j < nx; j++, dw += dw0 )
|
|
||||||
{
|
|
||||||
Complex<T>* v0 = dst + i + j;
|
|
||||||
Complex<T>* v1 = v0 + nx*2;
|
|
||||||
Complex<T>* v2 = v1 + nx*2;
|
|
||||||
|
|
||||||
T r0, i0, r1, i1, r2, i2, r3, i3, r4, i4, r5, i5;
|
|
||||||
|
|
||||||
r3 = v0[nx].re*wave[dw].re - v0[nx].im*wave[dw].im;
|
|
||||||
i3 = v0[nx].re*wave[dw].im + v0[nx].im*wave[dw].re;
|
|
||||||
r2 = v2[0].re*wave[dw*4].re - v2[0].im*wave[dw*4].im;
|
|
||||||
i2 = v2[0].re*wave[dw*4].im + v2[0].im*wave[dw*4].re;
|
|
||||||
|
|
||||||
r1 = r3 + r2; i1 = i3 + i2;
|
|
||||||
r3 -= r2; i3 -= i2;
|
|
||||||
|
|
||||||
r4 = v1[nx].re*wave[dw*3].re - v1[nx].im*wave[dw*3].im;
|
|
||||||
i4 = v1[nx].re*wave[dw*3].im + v1[nx].im*wave[dw*3].re;
|
|
||||||
r0 = v1[0].re*wave[dw*2].re - v1[0].im*wave[dw*2].im;
|
|
||||||
i0 = v1[0].re*wave[dw*2].im + v1[0].im*wave[dw*2].re;
|
|
||||||
|
|
||||||
r2 = r4 + r0; i2 = i4 + i0;
|
|
||||||
r4 -= r0; i4 -= i0;
|
|
||||||
|
|
||||||
r0 = v0[0].re; i0 = v0[0].im;
|
|
||||||
r5 = r1 + r2; i5 = i1 + i2;
|
|
||||||
|
|
||||||
v0[0].re = r0 + r5; v0[0].im = i0 + i5;
|
|
||||||
|
|
||||||
r0 -= (T)0.25*r5; i0 -= (T)0.25*i5;
|
|
||||||
r1 = fft5_2*(r1 - r2); i1 = fft5_2*(i1 - i2);
|
|
||||||
r2 = -fft5_3*(i3 + i4); i2 = fft5_3*(r3 + r4);
|
|
||||||
|
|
||||||
i3 *= -fft5_5; r3 *= fft5_5;
|
|
||||||
i4 *= -fft5_4; r4 *= fft5_4;
|
|
||||||
|
|
||||||
r5 = r2 + i3; i5 = i2 + r3;
|
|
||||||
r2 -= i4; i2 -= r4;
|
|
||||||
|
|
||||||
r3 = r0 + r1; i3 = i0 + i1;
|
|
||||||
r0 -= r1; i0 -= i1;
|
|
||||||
|
|
||||||
v0[nx].re = r3 + r2; v0[nx].im = i3 + i2;
|
|
||||||
v2[0].re = r3 - r2; v2[0].im = i3 - i2;
|
|
||||||
|
|
||||||
v1[0].re = r0 + r5; v1[0].im = i0 + i5;
|
|
||||||
v1[nx].re = r0 - r5; v1[nx].im = i0 - i5;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user