diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index b1d7178798..6fbaf98f96 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -1935,34 +1935,114 @@ void TFImporter::populateNet(Net dstNet)
             Mat indices = getTensorContent(getConstBlob(layer, value_id, 1));
             CV_Assert(indices.type() == CV_32SC1);
 
-            if (indices.total() != 2 || indices.at<int>(0) != 1 || indices.at<int>(1) != 2)
-                CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean operation.");
-
-            layerParams.set("pool", "ave");
-            layerParams.set("global_pooling", true);
-
-            int id = dstNet.addLayer(name, "Pooling", layerParams);
-            layer_id[name] = id;
-
-            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-
-            // There are two attributes, "keepdims" and a deprecated "keep_dims".
-            bool keepDims = false;
-            if (hasLayerAttr(layer, "keepdims"))
-                keepDims = getLayerAttr(layer, "keepdims").b();
-            else if (hasLayerAttr(layer, "keep_dims"))
-                keepDims = getLayerAttr(layer, "keep_dims").b();
-
-            if (!keepDims)
+            if (indices.total() == 1 && indices.at<int>(0) == 0)
             {
                 LayerParams flattenLp;
                 std::string flattenName = name + "/flatten";
                 CV_Assert(layer_id.find(flattenName) == layer_id.end());
                 int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
                 layer_id[flattenName] = flattenId;
-                connect(layer_id, dstNet, Pin(name), flattenId, 0);
+                connect(layer_id, dstNet, parsePin(layer.input(0)), flattenId, 0);
+
+
+                LayerParams reshapeLp;
+                std::string reshapeName = name + "/reshape";
+                CV_Assert(layer_id.find(reshapeName) == layer_id.end());
+                reshapeLp.set("axis", 0);
+                reshapeLp.set("num_axes", 1);
+                std::vector<int> newShape = {1, 1, -1};
+                reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], newShape.size()));
+
+                int reshapeId = dstNet.addLayer(reshapeName, "Reshape", reshapeLp);
+                layer_id[reshapeName] = reshapeId;
+                connect(layer_id, dstNet, Pin(flattenName), reshapeId, 0);
+
+                LayerParams avgLp;
+                std::string avgName = name + "/avg";
+                CV_Assert(layer_id.find(avgName) == layer_id.end());
+                avgLp.set("pool", "ave");
+                avgLp.set("kernel_h", 3);  // TODO: take the stack size from node.shape[0] instead of hardcoding it
+                avgLp.set("kernel_w", 1);
+                int avgId = dstNet.addLayer(avgName, "Pooling", avgLp);
+                layer_id[avgName] = avgId;
+                // one input only
+                connect(layer_id, dstNet, Pin(reshapeName), avgId, 0);
+
+                LayerParams reshapeLp2;
+                std::string reshapeName2 = name;
+                CV_Assert(layer_id.find(reshapeName2) == layer_id.end());
+                newShape = {2, 20, 314, 253};  // TODO: infer the output shape instead of hardcoding it
+
+                reshapeLp2.set("dim", DictValue::arrayInt(&newShape[0], newShape.size()));
+
+                int reshapeId2 = dstNet.addLayer(reshapeName2, "Reshape", reshapeLp2);
+                layer_id[reshapeName2] = reshapeId2;
+                connect(layer_id, dstNet, Pin(avgName), reshapeId2, 0);
+            } else {
+                if (indices.total() != 2 || indices.at<int>(0) != 1 || indices.at<int>(1) != 2)
+                    CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean operation.");
+
+                layerParams.set("pool", "ave");
+                layerParams.set("global_pooling", true);
+
+                int id = dstNet.addLayer(name, "Pooling", layerParams);
+                layer_id[name] = id;
+
+                connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+
+                // There are two attributes, "keepdims" and a deprecated "keep_dims".
+                bool keepDims = false;
+                if (hasLayerAttr(layer, "keepdims"))
+                    keepDims = getLayerAttr(layer, "keepdims").b();
+                else if (hasLayerAttr(layer, "keep_dims"))
+                    keepDims = getLayerAttr(layer, "keep_dims").b();
+
+                if (!keepDims)
+                {
+                    LayerParams flattenLp;
+                    std::string flattenName = name + "/flatten";
+                    CV_Assert(layer_id.find(flattenName) == layer_id.end());
+                    int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
+                    layer_id[flattenName] = flattenId;
+                    connect(layer_id, dstNet, Pin(name), flattenId, 0);
+                }
             }
         }
+        else if (type == "Pack")
+        {
+            CV_Assert(hasLayerAttr(layer, "axis"));
+            int dim = (int)getLayerAttr(layer, "axis").i();
+            if (dim != 0)
+                CV_Error(Error::StsNotImplemented, "Unsupported mode of pack operation.");
+
+            CV_Assert(hasLayerAttr(layer, "N"));
+            int num = (int)getLayerAttr(layer, "N").i();
+            CV_Assert(layer.input_size() == num);
+            std::string base_name = name + "/reshape_";
+            std::vector<std::string> reshape_names;
+            for (int i = 0; i < num; i++) {
+                std::string reshape_name = base_name + std::to_string(i);
+                reshape_names.push_back(reshape_name);
+                LayerParams reshapeLP;
+                reshapeLP.set("axis", dim);
+                reshapeLP.set("num_axes", 1);
+                std::vector<int> outShape = {1, -1};
+                reshapeLP.set("dim", DictValue::arrayInt(&outShape[0], outShape.size()));
+                int id = dstNet.addLayer(reshape_name, "Reshape", reshapeLP);
+                layer_id[reshape_name] = id;
+                connect(layer_id, dstNet, parsePin(layer.input(i)), id, 0);
+            }
+
+            layerParams.set("axis", dim);
+            int id = dstNet.addLayer(name, "Concat", layerParams);
+            layer_id[name] = id;
+
+            for (int li = 0; li < num; li++) {
+                Pin inp = parsePin(reshape_names[li]);
+                connect(layer_id, dstNet, inp, id, li);
+            }
+
+        }
         else if (type == "ClipByValue")
         {
             // op: "ClipByValue"
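Aside: the new Mean-over-axis-0 path decomposes the reduction into Flatten -> Reshape -> AveragePooling -> Reshape, and the Pack handler rewrites tf.stack as per-input Reshapes plus a Concat. A minimal NumPy sketch of both equivalences, with the stack size (3) and the output shape (2, 20, 314, 253) assumed from the hardcoded TODO values in the patch:

    import numpy as np

    # reduce_mean over axis 0 == flatten to rows + Kx1 average pooling + reshape back
    x = np.random.rand(3, 2, 20, 314, 253).astype(np.float32)  # 3 stacked multi-scale outputs
    reference = x.mean(axis=0)
    rows = x.reshape(1, 1, 3, -1)               # Flatten + Reshape: one row per stacked tensor
    pooled = rows.mean(axis=2)                  # stands in for the 3x1 average pooling layer
    restored = pooled.reshape(2, 20, 314, 253)  # final Reshape to the hardcoded output shape
    assert np.allclose(reference, restored, atol=1e-6)

    # Pack(axis=0) == give each input a leading unit axis, then concatenate
    inputs = [np.random.rand(20, 314, 253) for _ in range(3)]
    assert np.array_equal(np.stack(inputs, axis=0),
                          np.concatenate([a[None, ...] for a in inputs], axis=0))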
diff --git a/samples/dnn/human_parsing.py b/samples/dnn/human_parsing.py
new file mode 100644
index 0000000000..84d0663871
--- /dev/null
+++ b/samples/dnn/human_parsing.py
@@ -0,0 +1,165 @@
+import cv2 as cv
+import numpy as np
+import argparse
+
+
+backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE,
+            cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
+targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD)
+
+parser = argparse.ArgumentParser(description='Use this script to run human parsing using JPPNet',
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--input', '-i', required=True, help='Path to input image.')
+parser.add_argument('--model', '-m', required=True, help='Path to pb model.')
+parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
+                    help="Choose one of computation backends: "
+                         "%d: automatically (by default), "
+                         "%d: Halide language (http://halide-lang.org/), "
+                         "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+                         "%d: OpenCV implementation" % backends)
+parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
+                    help='Choose one of target computation devices: '
+                         '%d: CPU target (by default), '
+                         '%d: OpenCL, '
+                         '%d: OpenCL fp16 (half-float precision), '
+                         '%d: VPU' % targets)
+
+# To get the pre-trained model, download https://drive.google.com/file/d/1BFVXgeln-bek8TCbRjN6utPAgRE0LJZg/view
+# To convert the .meta checkpoint to a .pb model, clone the original repository https://github.com/Engineering-Course/LIP_JPPNet
+# and change the script evaluate_parsing_JPPNet-s2.py as follows:
+#
+# 1. Remove the preprocessing that creates image_batch_origin:
+#    - with tf.name_scope("create_inputs"):
+#      ...
+#    and add instead:
+#    - image_batch_origin = tf.placeholder(tf.float32, shape=(2, None, None, 3), name='input')
+#
+# 2. Create the input batch:
+#    image = cv2.imread(path/to/image)
+#    image_rev = np.flip(image, axis=1)
+#    image_h, image_w = image.shape[:2]
+#    input = np.stack([image, image_rev], axis=0)
+#
+# 3. Hardcode image_h and image_w so that the output shapes are fixed:
+#    - parsing_out1 = tf.reduce_mean(tf.stack([tf.image.resize_images(parsing_out1_100, [image_h, image_w]),
+#                                              tf.image.resize_images(parsing_out1_075, [image_h, image_w]),
+#                                              tf.image.resize_images(parsing_out1_125, [image_h, image_w])]), axis=0)
+#    Do the same for parsing_out2 and parsing_out3.
+#
+# 4. Remove the postprocessing and run the network directly:
+#    - parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
+#
+# 5. To save the model, add after sess.run(...):
+#    - input_graph_def = tf.get_default_graph().as_graph_def()
+#    - output_node = ["Mean_3"]
+#    - output_graph_def = tf.graph_util.convert_variables_to_constants(sess, input_graph_def, output_node)
+#    -
+#    - output_graph = "LIP_JPPNet.pb"
+#    - with tf.gfile.GFile(output_graph, "wb") as f:
+#    -     f.write(output_graph_def.SerializeToString())
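+#
+# Steps 1-5 gathered into a single sketch (TF 1.x API; the image path and the
+# graph/checkpoint restoring code are assumed to match the original script):
+#
+#   import numpy as np
+#   import cv2
+#   import tensorflow as tf
+#
+#   image = cv2.imread('path/to/image')
+#   input = np.stack([image, np.flip(image, axis=1)], axis=0)
+#   # ... build the JPPNet graph on the 'input' placeholder and restore the
+#   # checkpoint exactly as evaluate_parsing_JPPNet-s2.py does ...
+#   parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
+#   graph_def = tf.get_default_graph().as_graph_def()
+#   frozen = tf.graph_util.convert_variables_to_constants(sess, graph_def, ["Mean_3"])
+#   with tf.gfile.GFile("LIP_JPPNet.pb", "wb") as f:
+#       f.write(frozen.SerializeToString())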
+
+
+def preprocess(image_path):
+    """
+    Create a 4-dimensional blob from the image and its horizontally flipped copy.
+    :param image_path: path to input image
+    """
+    image = cv.imread(image_path)
+    image_rev = np.flip(image, axis=1)
+    input = cv.dnn.blobFromImages([image, image_rev], mean=(104.00698793, 116.66876762, 122.67891434))
+    return input
+
+
+def run_net(input, model_path, backend, target):
+    """
+    Read the network and run inference.
+    :param input: blob produced by preprocess()
+    :param model_path: path to JPPNet model
+    :param backend: computation backend
+    :param target: computation target device
+    """
+    net = cv.dnn.readNet(model_path)
+    net.setPreferableBackend(backend)
+    net.setPreferableTarget(target)
+    net.setInput(input)
+    out = net.forward()
+    return out
+
+
+def postprocess(out):
+    """
+    Create a grayscale human segmentation map.
+    :param out: network output
+    """
+    # LIP classes
+    # 0 Background
+    # 1 Hat
+    # 2 Hair
+    # 3 Glove
+    # 4 Sunglasses
+    # 5 UpperClothes
+    # 6 Dress
+    # 7 Coat
+    # 8 Socks
+    # 9 Pants
+    # 10 Jumpsuits
+    # 11 Scarf
+    # 12 Skirt
+    # 13 Face
+    # 14 LeftArm
+    # 15 RightArm
+    # 16 LeftLeg
+    # 17 RightLeg
+    # 18 LeftShoe
+    # 19 RightShoe
+    head_output, tail_output = np.split(out, indices_or_sections=[1], axis=0)
+    head_output = head_output.squeeze(0)
+    tail_output = tail_output.squeeze(0)
+    tail_list = np.split(tail_output, indices_or_sections=list(range(1, 20)), axis=0)
+    tail_list = [arr.squeeze(0) for arr in tail_list]
+    tail_list_rev = [tail_list[i] for i in range(14)]
+    # The second network input was mirrored, so swap the left/right paired classes
+    # (arms, legs, shoes) before flipping the scores back and merging.
+    tail_list_rev.extend([tail_list[15], tail_list[14], tail_list[17], tail_list[16], tail_list[19], tail_list[18]])
+    tail_output_rev = np.stack(tail_list_rev, axis=0)
+    tail_output_rev = np.flip(tail_output_rev, axis=2)
+    raw_output_all = np.mean(np.stack([head_output, tail_output_rev], axis=0), axis=0, keepdims=False)
+    raw_output_all = np.expand_dims(raw_output_all, axis=0)
+    raw_output_all = np.argmax(raw_output_all, axis=1)
+    raw_output_all = raw_output_all.transpose(1, 2, 0)
+    return raw_output_all
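+
+# Toy-shaped check of postprocess() (a sketch; the 20 classes and the left/right
+# pairs 14/15, 16/17, 18/19 come from the LIP label list above):
+#
+#   out = np.random.rand(2, 20, 4, 3)  # scores for [image, flipped image]
+#   labels = postprocess(out)
+#   assert labels.shape == (4, 3, 1)   # per-pixel class ids, HxWx1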
+
+
+def decode_labels(gray_image):
+    """
+    Colorize image according to labels
+    :param gray_image: grayscale human segmentation result
+    """
+    height, width, _ = gray_image.shape
+    colors = [(0, 0, 0), (128, 0, 0), (255, 0, 0), (0, 85, 0), (170, 0, 51), (255, 85, 0),
+              (0, 0, 85), (0, 119, 221), (85, 85, 0), (0, 85, 85), (85, 51, 0), (52, 86, 128),
+              (0, 128, 0), (0, 0, 255), (51, 170, 221), (0, 255, 255), (85, 255, 170),
+              (170, 255, 85), (255, 255, 0), (255, 170, 0)]
+
+    segm = np.stack([colors[idx] for idx in gray_image.flatten()])
+    segm = segm.reshape(height, width, 3).astype(np.uint8)
+    segm = cv.cvtColor(segm, cv.COLOR_BGR2RGB)
+    return segm
+
+
+def parse_human(image_path, model_path, backend, target):
+    """
+    Prepare input for execution, run net and postprocess output to parse human.
+    :param image_path: path to input image
+    :param model_path: path to JPPNet model
+    :param backend: name of computation backend
+    :param target: name of computation target
+    """
+    input = preprocess(image_path)
+    output = run_net(input, model_path, backend, target)
+    grayscale_out = postprocess(output)
+    segmentation = decode_labels(grayscale_out)
+    return segmentation
+
+
+if __name__ == '__main__':
+    args, _ = parser.parse_known_args()
+    output = parse_human(args.input, args.model, args.backend, args.target)
+    winName = 'Deep learning human parsing in OpenCV'
+    cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
+    cv.imshow(winName, output)
+    cv.waitKey()
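+
+# Example invocation (the model file name follows the conversion steps above):
+#   python human_parsing.py --model LIP_JPPNet.pb --input image.jpg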