mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 17:44:04 +08:00
Add human parsing demo
This commit is contained in:
parent
5e2bcc9149
commit
6e33769e56
@ -1935,34 +1935,114 @@ void TFImporter::populateNet(Net dstNet)
|
||||
Mat indices = getTensorContent(getConstBlob(layer, value_id, 1));
|
||||
CV_Assert(indices.type() == CV_32SC1);
|
||||
|
||||
if (indices.total() != 2 || indices.at<int>(0) != 1 || indices.at<int>(1) != 2)
|
||||
CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean operation.");
|
||||
|
||||
layerParams.set("pool", "ave");
|
||||
layerParams.set("global_pooling", true);
|
||||
|
||||
int id = dstNet.addLayer(name, "Pooling", layerParams);
|
||||
layer_id[name] = id;
|
||||
|
||||
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
|
||||
|
||||
// There are two attributes, "keepdims" and a deprecated "keep_dims".
|
||||
bool keepDims = false;
|
||||
if (hasLayerAttr(layer, "keepdims"))
|
||||
keepDims = getLayerAttr(layer, "keepdims").b();
|
||||
else if (hasLayerAttr(layer, "keep_dims"))
|
||||
keepDims = getLayerAttr(layer, "keep_dims").b();
|
||||
|
||||
if (!keepDims)
|
||||
if (indices.total() == 1 && indices.at<int>(0) == 0)
|
||||
{
|
||||
LayerParams flattenLp;
|
||||
std::string flattenName = name + "/flatten";
|
||||
CV_Assert(layer_id.find(flattenName) == layer_id.end());
|
||||
int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
|
||||
layer_id[flattenName] = flattenId;
|
||||
connect(layer_id, dstNet, Pin(name), flattenId, 0);
|
||||
connect(layer_id, dstNet, parsePin(layer.input(0)), flattenId, 0);
|
||||
|
||||
|
||||
LayerParams reshapeLp;
|
||||
std::string reshapeName = name + "/reshape";
|
||||
CV_Assert(layer_id.find(reshapeName) == layer_id.end());
|
||||
reshapeLp.set("axis", 0);
|
||||
reshapeLp.set("num_axes", 1);
|
||||
std::vector<int> newShape = {1, 1, -1};
|
||||
reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], newShape.size()));
|
||||
|
||||
int reshapeId = dstNet.addLayer(reshapeName, "Reshape", reshapeLp);
|
||||
layer_id[reshapeName] = reshapeId;
|
||||
connect(layer_id, dstNet, Pin(flattenName), reshapeId, 0);
|
||||
|
||||
LayerParams avgLp;
|
||||
std::string avgName = name + "/avg";
|
||||
CV_Assert(layer_id.find(avgName) == layer_id.end());
|
||||
avgLp.set("pool", "ave");
|
||||
avgLp.set("kernel_h", 3); // TODO: node.shape[0]
|
||||
avgLp.set("kernel_w", 1);
|
||||
int avgId = dstNet.addLayer(avgName, "Pooling", avgLp);
|
||||
layer_id[avgName] = avgId;
|
||||
// one input only
|
||||
connect(layer_id, dstNet, Pin(reshapeName), avgId, 0);
|
||||
|
||||
LayerParams reshapeLp2;
|
||||
std::string reshapeName2 = name;
|
||||
CV_Assert(layer_id.find(reshapeName2) == layer_id.end());
|
||||
newShape = {2, 20, 314, 253}; // TODO: remove out shapes
|
||||
|
||||
reshapeLp2.set("dim", DictValue::arrayInt<int*>(&newShape[0], newShape.size()));
|
||||
|
||||
int reshapeId2 = dstNet.addLayer(reshapeName2, "Reshape", reshapeLp2);
|
||||
layer_id[reshapeName2] = reshapeId2;
|
||||
connect(layer_id, dstNet, Pin(avgName), reshapeId2, 0);
|
||||
} else {
|
||||
if (indices.total() != 2 || indices.at<int>(0) != 1 || indices.at<int>(1) != 2)
|
||||
CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean operation.");
|
||||
|
||||
layerParams.set("pool", "ave");
|
||||
layerParams.set("global_pooling", true);
|
||||
|
||||
int id = dstNet.addLayer(name, "Pooling", layerParams);
|
||||
layer_id[name] = id;
|
||||
|
||||
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
|
||||
|
||||
// There are two attributes, "keepdims" and a deprecated "keep_dims".
|
||||
bool keepDims = false;
|
||||
if (hasLayerAttr(layer, "keepdims"))
|
||||
keepDims = getLayerAttr(layer, "keepdims").b();
|
||||
else if (hasLayerAttr(layer, "keep_dims"))
|
||||
keepDims = getLayerAttr(layer, "keep_dims").b();
|
||||
|
||||
if (!keepDims)
|
||||
{
|
||||
LayerParams flattenLp;
|
||||
std::string flattenName = name + "/flatten";
|
||||
CV_Assert(layer_id.find(flattenName) == layer_id.end());
|
||||
int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
|
||||
layer_id[flattenName] = flattenId;
|
||||
connect(layer_id, dstNet, Pin(name), flattenId, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (type == "Pack")
|
||||
{
|
||||
CV_Assert(hasLayerAttr(layer, "axis"));
|
||||
int dim = (int)getLayerAttr(layer, "axis").i();
|
||||
if (dim != 0)
|
||||
CV_Error(Error::StsNotImplemented, "Unsupported mode of pack operation.");
|
||||
|
||||
CV_Assert(hasLayerAttr(layer, "N"));
|
||||
int num = (int)getLayerAttr(layer, "N").i();
|
||||
CV_Assert(layer.input_size() == num);
|
||||
std::string base_name = name + "/reshape_";
|
||||
std::vector<std::string> reshape_names;
|
||||
for (int i = 0; i < num; i++) {
|
||||
std::string reshape_name = base_name + std::to_string(i);
|
||||
reshape_names.push_back(reshape_name);
|
||||
LayerParams reshapeLP;
|
||||
reshapeLP.set("axis", dim);
|
||||
reshapeLP.set("num_axes", 1);
|
||||
std::vector<int> outShape = {1, -1};
|
||||
reshapeLP.set("dim", DictValue::arrayInt(&outShape[0], outShape.size()));
|
||||
int id = dstNet.addLayer(reshape_name, "Reshape", reshapeLP);
|
||||
layer_id[reshape_name] = id;
|
||||
connect(layer_id, dstNet, parsePin(layer.input(i)), id, 0);
|
||||
}
|
||||
|
||||
layerParams.set("axis", dim);
|
||||
int id = dstNet.addLayer(name, "Concat", layerParams);
|
||||
layer_id[name] = id;
|
||||
|
||||
for (int li = 0; li < num; li++) {
|
||||
Pin inp = parsePin(reshape_names[li]);
|
||||
connect(layer_id, dstNet, inp, id, li);
|
||||
}
|
||||
|
||||
}
|
||||
else if (type == "ClipByValue")
|
||||
{
|
||||
// op: "ClipByValue"
|
||||
|
165
samples/dnn/human_parsing.py
Normal file
165
samples/dnn/human_parsing.py
Normal file
@ -0,0 +1,165 @@
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
import argparse
|
||||
|
||||
|
||||
# Supported computation backends. The order of this tuple must match the order of
# the "%d" placeholders in the --backend help text below, because the tuple is
# substituted into that text positionally.
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE,
            cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
# Supported target devices; same positional coupling with the --target help text.
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD)

parser = argparse.ArgumentParser(description='Use this script to run human parsing using JPPNet',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# The script only reads a single image from disk (no camera capture is implemented),
# so the input path is mandatory.
parser.add_argument('--input', '-i', required=True, help='Path to input image.')
parser.add_argument('--model', '-m', required=True, help='Path to pb model.')
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
                    help="Choose one of computation backends: "
                         "%d: automatically (by default), "
                         "%d: Halide language (http://halide-lang.org/), "
                         "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                         "%d: OpenCV implementation" % backends)
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                    help='Choose one of target computation devices: '
                         '%d: CPU target (by default), '
                         '%d: OpenCL, '
                         '%d: OpenCL fp16 (half-float precision), '
                         '%d: VPU' % targets)
|
||||
|
||||
# To get pre-trained model download https://drive.google.com/file/d/1BFVXgeln-bek8TCbRjN6utPAgRE0LJZg/view
|
||||
# To correctly convert the .meta checkpoint to a .pb model, download the original repository https://github.com/Engineering-Course/LIP_JPPNet
|
||||
# Modify the script evaluate_parsing_JPPNet-s2.py for human parsing as follows:
|
||||
# 1. Remove preprocessing to create image_batch_origin:
|
||||
# - with tf.name_scope("create_inputs"):
|
||||
# ...
|
||||
# Add
|
||||
# - image_batch_origin = tf.placeholder(tf.float32, shape=(2, None, None, 3), name='input')
|
||||
#
|
||||
# 2. Create input
|
||||
# image = cv2.imread(path/to/image)
|
||||
# image_rev = np.flip(image, axis=1)
|
||||
# image_h, image_w = image.shape[:2]
|
||||
# input = np.stack([image, image_rev], axis=0)
|
||||
#
|
||||
# 3. Hardcode image_h and image_w shapes to determine output shapes
|
||||
# - parsing_out1 = tf.reduce_mean(tf.stack([tf.image.resize_images(parsing_out1_100, [image_h, image_w]),
|
||||
# tf.image.resize_images(parsing_out1_075, [image_h, image_w]),
|
||||
# tf.image.resize_images(parsing_out1_125, [image_h, image_w])]), axis=0)
|
||||
# Do similarly with parsing_out2, parsing_out3
|
||||
# 4. Remove postprocessing
|
||||
# - parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
|
||||
#
|
||||
# 5. To save model after sess.run(...) add:
|
||||
# - input_graph_def = tf.get_default_graph().as_graph_def()
|
||||
# - output_node = "Mean_3"
|
||||
# - output_graph_def = tf.graph_util.convert_variables_to_constants(sess, input_graph_def, output_node)
|
||||
# -
|
||||
# - output_graph = "LIP_JPPNet.pb"
|
||||
# - with tf.gfile.GFile(output_graph, "wb") as f:
|
||||
# - f.write(output_graph_def.SerializeToString())
|
||||
|
||||
|
||||
|
||||
def preprocess(image_path):
    """
    Create 4-dimensional blob from image and flip image

    The blob holds two entries: the image as read from disk and a copy of it
    flipped along axis 1.

    :param image_path: path to input image
    :return: blob built by cv.dnn.blobFromImages from [image, flipped image]
    :raises IOError: if the image cannot be read from image_path
    """
    image = cv.imread(image_path)
    if image is None:
        # cv.imread reports an unreadable/missing file by returning None instead
        # of raising, so fail here with a message that names the path.
        raise IOError("Can't read input image: %s" % image_path)
    image_rev = np.flip(image, axis=1)
    input = cv.dnn.blobFromImages([image, image_rev], mean=(104.00698793, 116.66876762, 122.67891434))
    return input
|
||||
|
||||
|
||||
def run_net(input, model_path, backend, target):
    """
    Load the network from disk and run one forward pass on the given blob.

    :param input: blob handed to the network through setInput
    :param model_path: path to JPPNet model
    :param backend: value forwarded to setPreferableBackend
    :param target: value forwarded to setPreferableTarget
    :return: whatever net.forward() produces for the default output
    """
    network = cv.dnn.readNet(model_path)
    # Backend/target preferences are set before the input is bound and the net is run.
    network.setPreferableBackend(backend)
    network.setPreferableTarget(target)
    network.setInput(input)
    return network.forward()
|
||||
|
||||
|
||||
def postprocess(out):
    """
    Create a grayscale human segmentation

    The scores for the image and for its horizontally flipped copy are merged:
    the flipped scores are mirrored back, their left/right classes are swapped,
    both score volumes are averaged and the best class is picked per pixel.

    :param out: network output of shape (2, 20, height, width); entry 0 holds the
                scores for the image, entry 1 the scores for the flipped image
    :return: integer array of shape (height, width, 1) with a LIP class id per pixel
    :raises ValueError: if out does not have the (2, 20, height, width) layout
    """
    # LIP classes
    # 0 Background
    # 1 Hat
    # 2 Hair
    # 3 Glove
    # 4 Sunglasses
    # 5 UpperClothes
    # 6 Dress
    # 7 Coat
    # 8 Socks
    # 9 Pants
    # 10 Jumpsuits
    # 11 Scarf
    # 12 Skirt
    # 13 Face
    # 14 LeftArm
    # 15 RightArm
    # 16 LeftLeg
    # 17 RightLeg
    # 18 LeftShoe
    # 19 RightShoe
    out = np.asarray(out)
    if out.ndim != 4 or out.shape[0] != 2 or out.shape[1] != 20:
        raise ValueError("Expected network output of shape (2, 20, H, W), got %s" % (out.shape,))

    head_output = out[0]
    # In the mirrored image left and right body parts trade places, so classes
    # 14..19 are swapped pairwise; classes 0..13 have no left/right variant.
    mirrored_classes = list(range(14)) + [15, 14, 17, 16, 19, 18]
    tail_output_rev = out[1][mirrored_classes]
    # Undo the horizontal flip (axis 2 of the (class, height, width) volume).
    tail_output_rev = np.flip(tail_output_rev, axis=2)

    raw_output_all = np.mean(np.stack([head_output, tail_output_rev], axis=0), axis=0, keepdims=False)
    # Best class per pixel, returned with a trailing singleton axis: (height, width, 1).
    return np.argmax(raw_output_all, axis=0)[:, :, np.newaxis]
|
||||
|
||||
|
||||
def decode_labels(gray_image):
    """
    Colorize image according to labels

    :param gray_image: grayscale human segmentation result, an integer array of
                       shape (height, width, 1) holding a class id per pixel
    :return: uint8 array of shape (height, width, 3) with one color per pixel
    """
    height, width, _ = gray_image.shape
    colors = [(0, 0, 0), (128, 0, 0), (255, 0, 0), (0, 85, 0), (170, 0, 51), (255, 85, 0),
              (0, 0, 85), (0, 119, 221), (85, 85, 0), (0, 85, 85), (85, 51, 0), (52, 86, 128),
              (0, 128, 0), (0, 0, 255), (51, 170, 221), (0, 255, 255), (85, 255, 170),
              (170, 255, 85), (255, 255, 0), (255, 170, 0)]

    # The painted image has its first and third channels swapped relative to the
    # palette above (presumably so the RGB palette displays correctly in OpenCV's
    # BGR windows - confirm). Reversing the palette columns once yields the same
    # result as swapping the channels of every pixel afterwards.
    palette = np.array(colors, dtype=np.uint8)[:, ::-1]
    labels = gray_image.reshape(height, width)
    # Single vectorized table lookup instead of a Python-level loop over all pixels.
    segm = palette[labels]
    return segm
|
||||
|
||||
|
||||
def parse_human(image_path, model_path, backend, target):
    """
    Run the whole pipeline: build the input blob, infer, and colorize the result.

    :param image_path: path to input image
    :param model_path: path to JPPNet model
    :param backend: computation backend, forwarded unchanged to run_net
    :param target: computation target, forwarded unchanged to run_net
    :return: colorized segmentation produced by decode_labels
    """
    blob = preprocess(image_path)
    label_map = postprocess(run_net(blob, model_path, backend, target))
    return decode_labels(label_map)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # parse_known_args is used, so unrecognized command-line options are ignored.
    cli_args = parser.parse_known_args()[0]
    segmentation_image = parse_human(cli_args.input, cli_args.model, cli_args.backend, cli_args.target)

    # Show the colorized segmentation and block until a key is pressed.
    window_title = 'Deep learning human parsing in OpenCV'
    cv.namedWindow(window_title, cv.WINDOW_AUTOSIZE)
    cv.imshow(window_title, segmentation_image)
    cv.waitKey()
|
Loading…
Reference in New Issue
Block a user