// // this sample demonstrates parsing (segmenting) human body parts from an image using opencv's dnn, // based on https://github.com/Engineering-Course/LIP_JPPNet // // get the pretrained model from: https://www.dropbox.com/s/qag9vzambhhkvxr/lip_jppnet_384.pb?dl=0 // #include #include #include using namespace cv; static Mat parse_human(const Mat &image, const std::string &model, int backend=dnn::DNN_BACKEND_DEFAULT, int target=dnn::DNN_TARGET_CPU) { // this network expects an image and a flipped copy as input Mat flipped; flip(image, flipped, 1); std::vector batch; batch.push_back(image); batch.push_back(flipped); Mat blob = dnn::blobFromImages(batch, 1.0, Size(), Scalar(104.00698793, 116.66876762, 122.67891434)); dnn::Net net = dnn::readNet(model); net.setPreferableBackend(backend); net.setPreferableTarget(target); net.setInput(blob); Mat out = net.forward(); // expected output: [2, 20, 384, 384], (2 lists(orig, flipped) of 20 body part heatmaps 384x384) // LIP classes: // 0 Background, 1 Hat, 2 Hair, 3 Glove, 4 Sunglasses, 5 UpperClothes, 6 Dress, 7 Coat, 8 Socks, 9 Pants // 10 Jumpsuits, 11 Scarf, 12 Skirt, 13 Face, 14 LeftArm, 15 RightArm, 16 LeftLeg, 17 RightLeg, 18 LeftShoe. 19 RightShoe Vec3b colors[] = { Vec3b(0, 0, 0), Vec3b(128, 0, 0), Vec3b(255, 0, 0), Vec3b(0, 85, 0), Vec3b(170, 0, 51), Vec3b(255, 85, 0), Vec3b(0, 0, 85), Vec3b(0, 119, 221), Vec3b(85, 85, 0), Vec3b(0, 85, 85), Vec3b(85, 51, 0), Vec3b(52, 86, 128), Vec3b(0, 128, 0), Vec3b(0, 0, 255), Vec3b(51, 170, 221), Vec3b(0, 255, 255), Vec3b(85, 255, 170), Vec3b(170, 255, 85), Vec3b(255, 255, 0), Vec3b(255, 170, 0) }; Mat segm(image.size(), CV_8UC3, Scalar(0,0,0)); Mat maxval(image.size(), CV_32F, Scalar(0)); // iterate over body part heatmaps (LIP classes) for (int i=0; i(0,i)); resize(h, head, image.size()); // we have to swap the last 3 pairs in the "tail" list static int tail_order[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,14,17,16,19,18}; Mat tail, t(out.size[2], out.size[3], CV_32F, out.ptr(1,tail_order[i])); resize(t, tail, image.size()); flip(tail, tail, 1); // mix original and flipped result Mat avg = (head + tail) * 0.5; // write color if prob value > maxval Mat cmask; compare(avg, maxval, cmask, CMP_GT); segm.setTo(colors[i], cmask); // keep largest values for next iteration max(avg, maxval, maxval); } cvtColor(segm, segm, COLOR_RGB2BGR); return segm; } int main(int argc, char**argv) { std::string param_keys = "{help h | | show help screen / args}" "{image i | | person image to process }" "{model m |lip_jppnet_384.pb| network model}"; std::string backend_keys = cv::format( "{ backend | 0 | Choose one of computation backends: " "%d: automatically (by default), " "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), " "%d: OpenCV implementation, " "%d: VKCOM, " "%d: CUDA }", cv::dnn::DNN_BACKEND_DEFAULT, cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_BACKEND_VKCOM, cv::dnn::DNN_BACKEND_CUDA); std::string target_keys = cv::format( "{ target | 0 | Choose one of target computation devices: " "%d: CPU target (by default), " "%d: OpenCL, " "%d: OpenCL fp16 (half-float precision), " "%d: VPU, " "%d: Vulkan, " "%d: CUDA, " "%d: CUDA fp16 (half-float preprocess) }", cv::dnn::DNN_TARGET_CPU, cv::dnn::DNN_TARGET_OPENCL, cv::dnn::DNN_TARGET_OPENCL_FP16, cv::dnn::DNN_TARGET_MYRIAD, cv::dnn::DNN_TARGET_VULKAN, cv::dnn::DNN_TARGET_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16); std::string keys = param_keys + backend_keys + target_keys; CommandLineParser parser(argc, argv, keys); if (argc == 1 || parser.has("help")) { parser.printMessage(); return 0; } std::string model = parser.get("model"); std::string image = parser.get("image"); int backend = parser.get("backend"); int target = parser.get("target"); Mat input = imread(image); Mat segm = parse_human(input, model, backend, target); imshow("human parsing", segm); waitKey(); return 0; }