Merge pull request #20933 from shubham-shahh:master

Improved overall readability of the code

* grid_nms.cu: minor fix-ups
* Update grid_stride_range.hpp
* Update tf_importer.cpp

parent 15073d63d9
commit b594ed99b8
grid_nms.cu

@@ -68,7 +68,7 @@ namespace raw {
  * to compute IOU(GROUP_B, GROUP_A). We still have to compute IOU(GROUP_A, GROUP_A) though since
  * each group has many boxes and we need IOUs amongst boxes within a group.
  *
- * We arbitarily choose a scheme to exit : exit if group_i is greater than group_j. This way we only
+ * We arbitrarily choose a scheme to exit : exit if group_i is greater than group_j. This way we only
  * compute IOUs between groups once. While nearly half the blocks are wasted, it's ok since they exit
  * early on and the working blocks are compute heavy.
  */
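The exit scheme described in this hunk is easy to miss in prose. Below is a minimal CUDA sketch of the block-pair early exit, under assumed names (group_pair_kernel, compute_group_pair_ious) rather than the actual OpenCV kernel: the launch grid is 2D over group indices, IOU(A, B) equals IOU(B, A), so blocks below the diagonal return immediately, while diagonal blocks still run because intra-group IOUs are needed.

    #include <cstdio>
    #include <cuda_runtime.h>

    __device__ void compute_group_pair_ious(int group_i, int group_j)
    {
        // stand-in for the real per-box IOU work
        if (threadIdx.x == 0)
            printf("computing IOUs for groups (%d, %d)\n", group_i, group_j);
    }

    __global__ void group_pair_kernel()
    {
        const int group_i = blockIdx.x;
        const int group_j = blockIdx.y;

        /* IOU(GROUP_i, GROUP_j) == IOU(GROUP_j, GROUP_i), so only the upper
         * triangle does any work; the diagonal (group_i == group_j) must stay
         * because each group needs IOUs amongst its own boxes. */
        if (group_i > group_j)
            return;

        compute_group_pair_ious(group_i, group_j);
    }

    int main()
    {
        group_pair_kernel<<<dim3(4, 4), 32>>>(); // 16 blocks launched, 10 do real work
        cudaDeviceSynchronize();
        return 0;
    }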
@@ -92,7 +92,7 @@ namespace raw {
  */

 /* The `j` box is fixed for each thread. All `i` boxes will be required for every thread.
- * We store the `i` boxes in shared memory to allow global memory coalesing.
+ * We store the `i` boxes in shared memory to allow global memory coalescing.
  */
 using vector_type = get_vector_type_t<T, 4>;
 __shared__ vector_type group_i_boxes[BLOCK_SIZE];
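The staging pattern this hunk documents, as a self-contained sketch (BLOCK_SIZE, the float4 box layout, and the stand-in arithmetic are assumptions): every thread needs every `i` box, so the block loads them cooperatively. Thread t loads element t, so consecutive threads touch consecutive addresses and the global loads coalesce; all subsequent reads hit fast shared memory.

    #include <cuda_runtime.h>

    constexpr int BLOCK_SIZE = 128;

    __global__ void stage_boxes(const float4* __restrict__ boxes, float* sums, int n)
    {
        __shared__ float4 group_i_boxes[BLOCK_SIZE];

        // cooperative, coalesced load from global memory
        const int load_idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
        group_i_boxes[threadIdx.x] = (load_idx < n) ? boxes[load_idx] : float4{};
        __syncthreads();

        // every thread now iterates over all staged boxes without touching global memory
        float sum = 0.f;
        for (int i = 0; i < BLOCK_SIZE; i++)
            sum += group_i_boxes[i].z * group_i_boxes[i].w; // stand-in for the IOU math

        if (load_idx < n)
            sums[load_idx] = sum;
    }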
@@ -162,7 +162,7 @@ namespace raw {
  * this loop has been highly tuned. Please profile and verify carefully before making changes.
  */
 /* UNROLL_SIZE is the number of boxes that must be processed per iteration. We manually unroll
- * the loop since the compiler cannot effectively unroll on its own preassumably due to presence
+ * the loop since the compiler cannot effectively unroll on its own presumably due to presence
  * of instructions forcing warp synchronization.
  */
 constexpr int UNROLL_SIZE = 4;
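Manual unrolling of the kind this comment describes, reduced to a toy partial-sums kernel (the names and the single-block launch are assumptions, not the tuned NMS loop): each thread owns chunks of UNROLL_SIZE consecutive elements, and the loop body is written out by hand so the four independent loads stay in flight together.

    #include <cuda_runtime.h>

    constexpr int UNROLL_SIZE = 4;

    __global__ void unrolled_partial_sums(const float* __restrict__ in, float* out, int n)
    {
        float acc0 = 0.f, acc1 = 0.f, acc2 = 0.f, acc3 = 0.f;

        // single-block toy: thread t owns chunks starting at (t + k * blockDim.x) * UNROLL_SIZE
        const int stride = blockDim.x * UNROLL_SIZE;
        int i = threadIdx.x * UNROLL_SIZE;
        for (; i + UNROLL_SIZE <= n; i += stride)
        {
            // hand-unrolled body: four independent loads per iteration
            acc0 += in[i + 0];
            acc1 += in[i + 1];
            acc2 += in[i + 2];
            acc3 += in[i + 3];
        }
        // at most one thread ends on a partial chunk; finish it element-wise
        for (int k = 0; k < UNROLL_SIZE && i + k < n; k++)
            acc0 += in[i + k];

        out[threadIdx.x] = acc0 + acc1 + acc2 + acc3; // host sums the per-thread partials
    }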
@@ -290,7 +290,7 @@ namespace raw {
 if (boxes == 0)
     return;

-/* We have a fixed number of threads and an arbitary number of boxes. We use an array of
+/* We have a fixed number of threads and an arbitrary number of boxes. We use an array of
  * bits to store which boxes haven't been eliminated and which are still active. We organize
  * the array of bits into a matrix of bits of the shape (num_rows, BLOCK_SIZE, 32) which
  * is equivalent to (num_rows, BLOCK_SIZE) where the type is a 32-bit unsigned integer.
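The bit-matrix bookkeeping in this hunk amounts to a small amount of index arithmetic. A sketch under invented names (word_for_box, eliminate_boxes): bit b of the 32-bit word at (row, col) records whether box (row * BLOCK_SIZE + col) * 32 + b is still active, and clearing a bit takes an atomic because several threads may hit different bits of the same word.

    #include <cuda_runtime.h>

    constexpr int BLOCK_SIZE = 128;

    // locate the word of the (num_rows, BLOCK_SIZE) bit matrix that holds this box's bit
    __device__ unsigned int* word_for_box(unsigned int* bits, int box)
    {
        const int bit_group = box / 32;         // which group of 32 boxes
        const int row = bit_group / BLOCK_SIZE; // which row of the matrix
        const int col = bit_group % BLOCK_SIZE; // which word within that row
        return &bits[row * BLOCK_SIZE + col];
    }

    __global__ void eliminate_boxes(unsigned int* bits, const int* to_kill, int count)
    {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= count)
            return;

        // mark one box as eliminated by clearing its bit
        const int box = to_kill[idx];
        atomicAnd(word_for_box(bits, box), ~(1u << (box % 32)));
    }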
@@ -464,4 +464,4 @@ std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::si
 template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<__half> bboxes, int, bool normalized_bbox, float nms_threshold);
 template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<float> bboxes, int, bool normalized_bbox, float nms_threshold);

 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
grid_stride_range.hpp

@@ -36,7 +36,7 @@ public:
 __device__ bool operator!=(const iterator& other) const {
     /* NOTE HACK
      * 'pos' can move in large steps (see operator++)
-     * expansion of range for loop uses != as the loop conditioion
+     * expansion of range for loop uses != as the loop condition
      * => operator!= must return false if 'pos' crosses the end
      */
     return pos < other.pos;
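To see why the hack is necessary, here is a stripped-down sketch of a grid-stride range (an illustration under assumptions; the real header differs in details). Each increment jumps by the total thread count of the grid, so `pos` usually overshoots `end` rather than landing on it exactly; an equality-based operator!= would never return false and the loop would not terminate.

    #include <cuda_runtime.h>

    class grid_stride_range {
    public:
        __device__ grid_stride_range(int to) : to(to) {}

        class iterator {
        public:
            __device__ iterator(int pos) : pos(pos) {}
            __device__ int operator*() const { return pos; }

            // advance by the total number of threads in the grid
            __device__ iterator& operator++() {
                pos += gridDim.x * blockDim.x;
                return *this;
            }

            /* range-for expands to `it != end`; `<` makes the comparison
             * return false as soon as 'pos' crosses the end */
            __device__ bool operator!=(const iterator& other) const {
                return pos < other.pos;
            }

        private:
            int pos;
        };

        __device__ iterator begin() const { return iterator(blockIdx.x * blockDim.x + threadIdx.x); }
        __device__ iterator end() const { return iterator(to); }

    private:
        int to;
    };

    __global__ void fill_ones(float* data, int n)
    {
        // thread t visits t, t + gridSize, t + 2 * gridSize, ... while below n
        for (auto i : grid_stride_range(n))
            data[i] = 1.f;
    }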
tf_importer.cpp

@@ -1154,7 +1154,7 @@ void TFImporter::parseExpandDims(tensorflow::GraphDef& net, const tensorflow::No
 // Convert OpenCV's NHC to NCH first.
 if(outShapeSize == 3)
 {
-    // If axis equal to outShapeSize, that mean we expand in Channel dimmension, and do not add permuteLayer.
+    // If axis equal to outShapeSize, that mean we expand in Channel dimension, and do not add permuteLayer.
     if(axis != outShapeSize)
     {
         int order[] = {0, 2, 1}; // From OpenCV's NHC to NCH.
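The order {0, 2, 1} line is a plain axis shuffle; a minimal host-side sketch of that arithmetic follows (illustrative shapes only, not the importer's layer plumbing): output axis i takes its extent from input axis order[i], which swaps H and C and turns an NHC shape into NCH.

    #include <array>
    #include <cstdio>

    int main()
    {
        const std::array<int, 3> nhc = {1, 5, 3};   // N, H, C
        const std::array<int, 3> order = {0, 2, 1}; // From OpenCV's NHC to NCH.

        std::array<int, 3> nch{};
        for (int i = 0; i < 3; i++)
            nch[i] = nhc[order[i]]; // output axis i comes from input axis order[i]

        std::printf("NHC {%d,%d,%d} -> NCH {%d,%d,%d}\n",
                    nhc[0], nhc[1], nhc[2], nch[0], nch[1], nch[2]);
        return 0;
    }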