Add human parsing demo

2025-07-20 19:17:36 +08:00 · 2019-12-23 15:47:20 +03:00 · 2019-12-23 15:47:20 +03:00 · 6e33769e56
commit 6e33769e56
parent 5e2bcc9149
2 changed files with 265 additions and 20 deletions
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@ -1935,34 +1935,114 @@ void TFImporter::populateNet(Net dstNet)
            Mat indices = getTensorContent(getConstBlob(layer, value_id, 1));
            CV_Assert(indices.type() == CV_32SC1);

-            if (indices.total() != 2 || indices.at<int>(0) != 1 || indices.at<int>(1) != 2)
-                CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean operation.");
-
-            layerParams.set("pool", "ave");
-            layerParams.set("global_pooling", true);
-
-            int id = dstNet.addLayer(name, "Pooling", layerParams);
-            layer_id[name] = id;
-
-            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-
-            // There are two attributes, "keepdims" and a deprecated "keep_dims".
-            bool keepDims = false;
-            if (hasLayerAttr(layer, "keepdims"))
-                keepDims = getLayerAttr(layer, "keepdims").b();
-            else if (hasLayerAttr(layer, "keep_dims"))
-                keepDims = getLayerAttr(layer, "keep_dims").b();
-
-            if (!keepDims)
+            if (indices.total() == 1 && indices.at<int>(0) == 0)
            {
                LayerParams flattenLp;
                std::string flattenName = name + "/flatten";
                CV_Assert(layer_id.find(flattenName) == layer_id.end());
                int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
                layer_id[flattenName] = flattenId;
-                connect(layer_id, dstNet, Pin(name), flattenId, 0);
+                connect(layer_id, dstNet, parsePin(layer.input(0)), flattenId, 0);
+
+
+                LayerParams reshapeLp;
+                std::string reshapeName = name + "/reshape";
+                CV_Assert(layer_id.find(reshapeName) == layer_id.end());
+                reshapeLp.set("axis", 0);
+                reshapeLp.set("num_axes", 1);
+                std::vector<int> newShape = {1, 1, -1};
+                reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], newShape.size()));
+
+                int reshapeId = dstNet.addLayer(reshapeName, "Reshape", reshapeLp);
+                layer_id[reshapeName] = reshapeId;
+                connect(layer_id, dstNet, Pin(flattenName), reshapeId, 0);
+
+                LayerParams avgLp;
+                std::string avgName = name + "/avg";
+                CV_Assert(layer_id.find(avgName) == layer_id.end());
+                avgLp.set("pool", "ave");
+                avgLp.set("kernel_h", 3); // TODO: node.shape[0]
+                avgLp.set("kernel_w", 1);
+                int avgId = dstNet.addLayer(avgName, "Pooling", avgLp);
+                layer_id[avgName] = avgId;
+                // one input only
+                connect(layer_id, dstNet, Pin(reshapeName), avgId, 0);
+
+                LayerParams reshapeLp2;
+                std::string reshapeName2 = name;
+                CV_Assert(layer_id.find(reshapeName2) == layer_id.end());
+                newShape = {2, 20, 314, 253}; // TODO: remove out shapes
+
+                reshapeLp2.set("dim", DictValue::arrayInt<int*>(&newShape[0], newShape.size()));
+
+                int reshapeId2 = dstNet.addLayer(reshapeName2, "Reshape", reshapeLp2);
+                layer_id[reshapeName2] = reshapeId2;
+                connect(layer_id, dstNet, Pin(avgName), reshapeId2, 0);
+            } else {
+                if (indices.total() != 2 || indices.at<int>(0) != 1 || indices.at<int>(1) != 2)
+                    CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean operation.");
+
+                layerParams.set("pool", "ave");
+                layerParams.set("global_pooling", true);
+
+                int id = dstNet.addLayer(name, "Pooling", layerParams);
+                layer_id[name] = id;
+
+                connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+
+                // There are two attributes, "keepdims" and a deprecated "keep_dims".
+                bool keepDims = false;
+                if (hasLayerAttr(layer, "keepdims"))
+                    keepDims = getLayerAttr(layer, "keepdims").b();
+                else if (hasLayerAttr(layer, "keep_dims"))
+                    keepDims = getLayerAttr(layer, "keep_dims").b();
+
+                if (!keepDims)
+                {
+                    LayerParams flattenLp;
+                    std::string flattenName = name + "/flatten";
+                    CV_Assert(layer_id.find(flattenName) == layer_id.end());
+                    int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
+                    layer_id[flattenName] = flattenId;
+                    connect(layer_id, dstNet, Pin(name), flattenId, 0);
+                }
            }
        }
+        else if (type == "Pack")
+        {
+            CV_Assert(hasLayerAttr(layer, "axis"));
+            int dim = (int)getLayerAttr(layer, "axis").i();
+            if (dim != 0)
+                CV_Error(Error::StsNotImplemented, "Unsupported mode of pack operation.");
+
+            CV_Assert(hasLayerAttr(layer, "N"));
+            int num = (int)getLayerAttr(layer, "N").i();
+            CV_Assert(layer.input_size() == num);
+            std::string base_name = name + "/reshape_";
+            std::vector<std::string> reshape_names;
+            for (int i = 0; i < num; i++) {
+                std::string reshape_name = base_name + std::to_string(i);
+                reshape_names.push_back(reshape_name);
+                LayerParams reshapeLP;
+                reshapeLP.set("axis", dim);
+                reshapeLP.set("num_axes", 1);
+                std::vector<int> outShape = {1, -1};
+                reshapeLP.set("dim", DictValue::arrayInt(&outShape[0], outShape.size()));
+                int id = dstNet.addLayer(reshape_name, "Reshape", reshapeLP);
+                layer_id[reshape_name] = id;
+                connect(layer_id, dstNet, parsePin(layer.input(i)), id, 0);
+            }
+
+            layerParams.set("axis", dim);
+            int id = dstNet.addLayer(name, "Concat", layerParams);
+            layer_id[name] = id;
+
+            for (int li = 0; li < num; li++) {
+                Pin inp = parsePin(reshape_names[li]);
+                connect(layer_id, dstNet, inp, id, li);
+            }
+
+        }
        else if (type == "ClipByValue")
        {
            // op: "ClipByValue"
--- a/samples/dnn/human_parsing.py
+++ b/samples/dnn/human_parsing.py
@ -0,0 +1,165 @@
+import cv2 as cv
+import numpy as np
+import argparse
+
+
+# Backend ids accepted by --backend. The tuple order must match the order of the
+# "%d" placeholders in the --backend help text below, because the tuple is
+# substituted into that text positionally.
+backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE,
+            cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
+# Target ids accepted by --target, in the order used by the --target help text.
+targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD)
+
+parser = argparse.ArgumentParser(description='Use this script to run human parsing using JPPNet',
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+# The script only processes a single image file, so the input path is mandatory.
+parser.add_argument('--input', '-i', required=True, help='Path to input image.')
+parser.add_argument('--model', '-m', required=True, help='Path to pb model.')
+parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
+                    help="Choose one of computation backends: "
+                         "%d: automatically (by default), "
+                         "%d: Halide language (http://halide-lang.org/), "
+                         "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+                         "%d: OpenCV implementation" % backends)
+parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
+                    help='Choose one of target computation devices: '
+                         '%d: CPU target (by default), '
+                         '%d: OpenCL, '
+                         '%d: OpenCL fp16 (half-float precision), '
+                         '%d: VPU' % targets)
+
+# To get the pre-trained model, download https://drive.google.com/file/d/1BFVXgeln-bek8TCbRjN6utPAgRE0LJZg/view
+# To correctly convert the .meta model to a .pb model, download the original repository https://github.com/Engineering-Course/LIP_JPPNet
+# and change the script evaluate_parsing_JPPNet-s2.py for human parsing as follows:
+# 1. Remove preprocessing to create image_batch_origin:
+# - with tf.name_scope("create_inputs"):
+#     ...
+# Add
+# -    image_batch_origin = tf.placeholder(tf.float32, shape=(2, None, None, 3), name='input')
+#
+# 2. Create input
+#     image = cv2.imread(path/to/image)
+#     image_rev = np.flip(image, axis=1)
+#     image_h, image_w = image.shape[:2]
+#     input = np.stack([image, image_rev], axis=0)
+#
+# 3. Hardcode image_h and image_w shapes to determine output shapes
+# -   parsing_out1 = tf.reduce_mean(tf.stack([tf.image.resize_images(parsing_out1_100, [image_h, image_w]),
+#                                        tf.image.resize_images(parsing_out1_075, [image_h, image_w]),
+#                                        tf.image.resize_images(parsing_out1_125, [image_h, image_w])]), axis=0)
+#     Do similarly with parsing_out2, parsing_out3
+# 4. Remove postprocessing
+# -    parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
+#
+# 5. To save model after sess.run(...) add:
+# -    input_graph_def = tf.get_default_graph().as_graph_def()
+# -    output_node = "Mean_3"
+# -    output_graph_def = tf.graph_util.convert_variables_to_constants(sess, input_graph_def, output_node)
+# -
+# -    output_graph = "LIP_JPPNet.pb"
+# -    with tf.gfile.GFile(output_graph, "wb") as f:
+# -        f.write(output_graph_def.SerializeToString())
+
+
+
+def preprocess(image_path):
+    """
+    Create 4-dimensional blob from image and flip image
+    :param image_path: path to input image
+    :return: blob built from the image and its copy flipped along axis 1
+    :raises IOError: if the image cannot be read
+    """
+    image = cv.imread(image_path)
+    # cv.imread reports failure by returning None rather than raising, so fail
+    # here with a clear message instead of an obscure error further down.
+    if image is None:
+        raise IOError("Failed to read image: %s" % image_path)
+    image_rev = np.flip(image, axis=1)
+    # The mean values are subtracted per channel by blobFromImages.
+    blob = cv.dnn.blobFromImages([image, image_rev], mean=(104.00698793, 116.66876762, 122.67891434))
+    return blob
+
+
+def run_net(input, model_path, backend, target):
+    """
+    Load the network from disk and run a single forward pass
+    :param input: blob to feed to the network
+    :param model_path: path to JPPNet model
+    :param backend: preferable computation backend to set on the network
+    :param target: preferable computation target to set on the network
+    :return: result of the forward pass
+    """
+    network = cv.dnn.readNet(model_path)
+    network.setPreferableBackend(backend)
+    network.setPreferableTarget(target)
+    network.setInput(input)
+    return network.forward()
+
+
+def postprocess(out):
+    """
+    Create a grayscale human segmentation
+    :param out: network output of shape (2, 20, H, W): per-class scores for the
+                image (index 0) and for its flipped copy (index 1)
+    :return: integer array of shape (H, W, 1) with the winning class per pixel
+    :raises ValueError: if the output does not have shape (2, 20, H, W)
+    """
+    # LIP classes
+    # 0 Background
+    # 1 Hat
+    # 2 Hair
+    # 3 Glove
+    # 4 Sunglasses
+    # 5 UpperClothes
+    # 6 Dress
+    # 7 Coat
+    # 8 Socks
+    # 9 Pants
+    # 10 Jumpsuits
+    # 11 Scarf
+    # 12 Skirt
+    # 13 Face
+    # 14 LeftArm
+    # 15 RightArm
+    # 16 LeftLeg
+    # 17 RightLeg
+    # 18 LeftShoe
+    # 19 RightShoe
+    out = np.asarray(out)
+    if out.ndim != 4 or out.shape[0] != 2 or out.shape[1] != 20:
+        raise ValueError("Expected network output of shape (2, 20, H, W), got %s" % (out.shape,))
+    head_output, tail_output = out[0], out[1]
+    # In the flipped image left and right body parts trade places, so swap the
+    # paired channels (14<->15, 16<->17, 18<->19) before flipping the width back.
+    channel_order = list(range(14)) + [15, 14, 17, 16, 19, 18]
+    tail_output_rev = np.flip(tail_output[channel_order], axis=2)
+    # Average the two predictions, then pick the best class for every pixel.
+    raw_output_all = np.mean(np.stack([head_output, tail_output_rev], axis=0), axis=0, keepdims=False)
+    labels = np.argmax(raw_output_all, axis=0)
+    # Keep a trailing singleton channel so the result is (H, W, 1).
+    return labels[:, :, np.newaxis]
+
+
+def decode_labels(gray_image):
+    """
+    Colorize image according to labels
+    :param gray_image: grayscale human segmentation result, an integer array of
+                       shape (height, width, 1) holding class indices
+    :return: uint8 color image of shape (height, width, 3)
+    """
+    height, width, _ = gray_image.shape
+    colors = [(0, 0, 0), (128, 0, 0), (255, 0, 0), (0, 85, 0), (170, 0, 51), (255, 85, 0),
+              (0, 0, 85), (0, 119, 221), (85, 85, 0), (0, 85, 85), (85, 51, 0), (52, 86, 128),
+              (0, 128, 0), (0, 0, 255), (51, 170, 221), (0, 255, 255),(85, 255, 170),
+              (170, 255, 85), (255, 255, 0), (255, 170, 0)]
+
+    # Look all labels up in the palette with one vectorized indexing operation
+    # instead of a Python-level loop over every pixel.
+    palette = np.array(colors, dtype=np.uint8)
+    segm = palette[gray_image.reshape(height, width)]
+    # Swap the first and last color channels of the palette colors.
+    segm = cv.cvtColor(segm, cv.COLOR_BGR2RGB)
+    return segm
+
+
+def parse_human(image_path, model_path, backend, target):
+    """
+    Full pipeline: build the input blob, run the network, turn its output into
+    a label map and colorize that map.
+    :param image_path: path to input image
+    :param model_path: path to JPPNet model
+    :param backend: computation backend forwarded to run_net
+    :param target: computation target forwarded to run_net
+    :return: colorized segmentation produced by decode_labels
+    """
+    blob = preprocess(image_path)
+    net_out = run_net(blob, model_path, backend, target)
+    label_map = postprocess(net_out)
+    return decode_labels(label_map)
+
+
+if __name__ == '__main__':
+    # parse_known_args tolerates unrecognized command-line arguments; they are ignored.
+    args, _ = parser.parse_known_args()
+    segmentation = parse_human(args.input, args.model, args.backend, args.target)
+    window_title = 'Deep learning human parsing in OpenCV'
+    cv.namedWindow(window_title, cv.WINDOW_AUTOSIZE)
+    cv.imshow(window_title, segmentation)
+    # Keep the window open until a key is pressed.
+    cv.waitKey()