From ce1cc352d9256786de399357875f62b797cad235 Mon Sep 17 00:00:00 2001 From: Aleksandr Rybnikov Date: Fri, 14 Jul 2017 20:47:56 +0300 Subject: [PATCH] MobileNet SSD sample --- .../dnn/src/layers/detection_output_layer.cpp | 2 +- .../data/dnn/MobileNetSSD_300x300.prototxt | 3102 +++++++++++++++++ samples/dnn/mobilenet_ssd_python.py | 87 + .../dnn/ssd_mobilenet_object_detection.cpp | 161 + 4 files changed, 3351 insertions(+), 1 deletion(-) create mode 100644 samples/data/dnn/MobileNetSSD_300x300.prototxt create mode 100644 samples/dnn/mobilenet_ssd_python.py create mode 100644 samples/dnn/ssd_mobilenet_object_detection.cpp diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index 6da162423f..0b72326f7e 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -234,7 +234,7 @@ public: if (numKept == 0) { - CV_ErrorNoReturn(Error::StsError, "Couldn't find any detections"); + return; } int outputShape[] = {1, 1, (int)numKept, 7}; outputs[0].create(4, outputShape, CV_32F); diff --git a/samples/data/dnn/MobileNetSSD_300x300.prototxt b/samples/data/dnn/MobileNetSSD_300x300.prototxt new file mode 100644 index 0000000000..def19412b6 --- /dev/null +++ b/samples/data/dnn/MobileNetSSD_300x300.prototxt @@ -0,0 +1,3102 @@ +name: "MobileNet-SSD" +input: "data" +input_shape { + dim: 1 + dim: 3 + dim: 300 + dim: 300 +} +layer { + name: "conv0" + type: "Convolution" + bottom: "data" + top: "conv0" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv0/bn" + type: "BatchNorm" + bottom: "conv0" + top: "conv0" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv0/scale" + type: "Scale" + bottom: "conv0" + 
top: "conv0" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv0/relu" + type: "ReLU" + bottom: "conv0" + top: "conv0" +} +layer { + name: "conv1/dw" + type: "Convolution" + bottom: "conv0" + top: "conv1/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + group: 32 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv1/dw/bn" + type: "BatchNorm" + bottom: "conv1/dw" + top: "conv1/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv1/dw/scale" + type: "Scale" + bottom: "conv1/dw" + top: "conv1/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv1/dw/relu" + type: "ReLU" + bottom: "conv1/dw" + top: "conv1/dw" +} +layer { + name: "conv1" + type: "Convolution" + bottom: "conv1/dw" + top: "conv1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 64 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv1/bn" + type: "BatchNorm" + bottom: "conv1" + top: "conv1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv1/scale" + type: "Scale" + bottom: "conv1" + top: "conv1" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv1/relu" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: 
"conv2/dw" + type: "Convolution" + bottom: "conv1" + top: "conv2/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 64 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + group: 64 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv2/dw/bn" + type: "BatchNorm" + bottom: "conv2/dw" + top: "conv2/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv2/dw/scale" + type: "Scale" + bottom: "conv2/dw" + top: "conv2/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv2/dw/relu" + type: "ReLU" + bottom: "conv2/dw" + top: "conv2/dw" +} +layer { + name: "conv2" + type: "Convolution" + bottom: "conv2/dw" + top: "conv2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv2/bn" + type: "BatchNorm" + bottom: "conv2" + top: "conv2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv2/scale" + type: "Scale" + bottom: "conv2" + top: "conv2" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv2/relu" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "conv3/dw" + type: "Convolution" + bottom: "conv2" + top: "conv3/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + group: 128 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + 
name: "conv3/dw/bn" + type: "BatchNorm" + bottom: "conv3/dw" + top: "conv3/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv3/dw/scale" + type: "Scale" + bottom: "conv3/dw" + top: "conv3/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv3/dw/relu" + type: "ReLU" + bottom: "conv3/dw" + top: "conv3/dw" +} +layer { + name: "conv3" + type: "Convolution" + bottom: "conv3/dw" + top: "conv3" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv3/bn" + type: "BatchNorm" + bottom: "conv3" + top: "conv3" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv3/scale" + type: "Scale" + bottom: "conv3" + top: "conv3" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv3/relu" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4/dw" + type: "Convolution" + bottom: "conv3" + top: "conv4/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + group: 128 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv4/dw/bn" + type: "BatchNorm" + bottom: "conv4/dw" + top: "conv4/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv4/dw/scale" + type: "Scale" + bottom: "conv4/dw" + 
top: "conv4/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv4/dw/relu" + type: "ReLU" + bottom: "conv4/dw" + top: "conv4/dw" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv4/dw" + top: "conv4" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv4/bn" + type: "BatchNorm" + bottom: "conv4" + top: "conv4" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv4/scale" + type: "Scale" + bottom: "conv4" + top: "conv4" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv4/relu" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5/dw" + type: "Convolution" + bottom: "conv4" + top: "conv5/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + group: 256 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv5/dw/bn" + type: "BatchNorm" + bottom: "conv5/dw" + top: "conv5/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv5/dw/scale" + type: "Scale" + bottom: "conv5/dw" + top: "conv5/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv5/dw/relu" + type: "ReLU" + bottom: "conv5/dw" + top: "conv5/dw" +} 
+layer { + name: "conv5" + type: "Convolution" + bottom: "conv5/dw" + top: "conv5" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv5/bn" + type: "BatchNorm" + bottom: "conv5" + top: "conv5" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv5/scale" + type: "Scale" + bottom: "conv5" + top: "conv5" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv5/relu" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "conv6/dw" + type: "Convolution" + bottom: "conv5" + top: "conv6/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + group: 256 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv6/dw/bn" + type: "BatchNorm" + bottom: "conv6/dw" + top: "conv6/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv6/dw/scale" + type: "Scale" + bottom: "conv6/dw" + top: "conv6/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv6/dw/relu" + type: "ReLU" + bottom: "conv6/dw" + top: "conv6/dw" +} +layer { + name: "conv6" + type: "Convolution" + bottom: "conv6/dw" + top: "conv6" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv6/bn" + type: 
"BatchNorm" + bottom: "conv6" + top: "conv6" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv6/scale" + type: "Scale" + bottom: "conv6" + top: "conv6" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv6/relu" + type: "ReLU" + bottom: "conv6" + top: "conv6" +} +layer { + name: "conv7/dw" + type: "Convolution" + bottom: "conv6" + top: "conv7/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + group: 512 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv7/dw/bn" + type: "BatchNorm" + bottom: "conv7/dw" + top: "conv7/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv7/dw/scale" + type: "Scale" + bottom: "conv7/dw" + top: "conv7/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv7/dw/relu" + type: "ReLU" + bottom: "conv7/dw" + top: "conv7/dw" +} +layer { + name: "conv7" + type: "Convolution" + bottom: "conv7/dw" + top: "conv7" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv7/bn" + type: "BatchNorm" + bottom: "conv7" + top: "conv7" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv7/scale" + type: "Scale" + bottom: "conv7" + top: "conv7" + param { + lr_mult: 0.1 + decay_mult: 0.0 
+ } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv7/relu" + type: "ReLU" + bottom: "conv7" + top: "conv7" +} +layer { + name: "conv8/dw" + type: "Convolution" + bottom: "conv7" + top: "conv8/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + group: 512 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv8/dw/bn" + type: "BatchNorm" + bottom: "conv8/dw" + top: "conv8/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv8/dw/scale" + type: "Scale" + bottom: "conv8/dw" + top: "conv8/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv8/dw/relu" + type: "ReLU" + bottom: "conv8/dw" + top: "conv8/dw" +} +layer { + name: "conv8" + type: "Convolution" + bottom: "conv8/dw" + top: "conv8" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv8/bn" + type: "BatchNorm" + bottom: "conv8" + top: "conv8" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv8/scale" + type: "Scale" + bottom: "conv8" + top: "conv8" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv8/relu" + type: "ReLU" + bottom: "conv8" + top: "conv8" +} +layer { + name: "conv9/dw" + type: "Convolution" + bottom: "conv8" + 
top: "conv9/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + group: 512 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv9/dw/bn" + type: "BatchNorm" + bottom: "conv9/dw" + top: "conv9/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv9/dw/scale" + type: "Scale" + bottom: "conv9/dw" + top: "conv9/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv9/dw/relu" + type: "ReLU" + bottom: "conv9/dw" + top: "conv9/dw" +} +layer { + name: "conv9" + type: "Convolution" + bottom: "conv9/dw" + top: "conv9" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv9/bn" + type: "BatchNorm" + bottom: "conv9" + top: "conv9" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv9/scale" + type: "Scale" + bottom: "conv9" + top: "conv9" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv9/relu" + type: "ReLU" + bottom: "conv9" + top: "conv9" +} +layer { + name: "conv10/dw" + type: "Convolution" + bottom: "conv9" + top: "conv10/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + group: 512 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv10/dw/bn" + type: "BatchNorm" + bottom: "conv10/dw" + 
top: "conv10/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv10/dw/scale" + type: "Scale" + bottom: "conv10/dw" + top: "conv10/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv10/dw/relu" + type: "ReLU" + bottom: "conv10/dw" + top: "conv10/dw" +} +layer { + name: "conv10" + type: "Convolution" + bottom: "conv10/dw" + top: "conv10" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv10/bn" + type: "BatchNorm" + bottom: "conv10" + top: "conv10" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv10/scale" + type: "Scale" + bottom: "conv10" + top: "conv10" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv10/relu" + type: "ReLU" + bottom: "conv10" + top: "conv10" +} +layer { + name: "conv11/dw" + type: "Convolution" + bottom: "conv10" + top: "conv11/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + group: 512 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv11/dw/bn" + type: "BatchNorm" + bottom: "conv11/dw" + top: "conv11/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv11/dw/scale" + type: "Scale" + bottom: "conv11/dw" + top: "conv11/dw" + param { + lr_mult: 0.1 + 
decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv11/dw/relu" + type: "ReLU" + bottom: "conv11/dw" + top: "conv11/dw" +} +layer { + name: "conv11" + type: "Convolution" + bottom: "conv11/dw" + top: "conv11" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv11/bn" + type: "BatchNorm" + bottom: "conv11" + top: "conv11" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv11/scale" + type: "Scale" + bottom: "conv11" + top: "conv11" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv11/relu" + type: "ReLU" + bottom: "conv11" + top: "conv11" +} +layer { + name: "conv12/dw" + type: "Convolution" + bottom: "conv11" + top: "conv12/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + group: 512 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv12/dw/bn" + type: "BatchNorm" + bottom: "conv12/dw" + top: "conv12/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv12/dw/scale" + type: "Scale" + bottom: "conv12/dw" + top: "conv12/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv12/dw/relu" + type: "ReLU" + bottom: "conv12/dw" + top: "conv12/dw" +} 
+layer { + name: "conv12" + type: "Convolution" + bottom: "conv12/dw" + top: "conv12" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 1024 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv12/bn" + type: "BatchNorm" + bottom: "conv12" + top: "conv12" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv12/scale" + type: "Scale" + bottom: "conv12" + top: "conv12" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv12/relu" + type: "ReLU" + bottom: "conv12" + top: "conv12" +} +layer { + name: "conv13/dw" + type: "Convolution" + bottom: "conv12" + top: "conv13/dw" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 1024 + bias_term: false + pad: 1 + kernel_size: 3 + group: 1024 + engine: CAFFE + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv13/dw/bn" + type: "BatchNorm" + bottom: "conv13/dw" + top: "conv13/dw" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv13/dw/scale" + type: "Scale" + bottom: "conv13/dw" + top: "conv13/dw" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv13/dw/relu" + type: "ReLU" + bottom: "conv13/dw" + top: "conv13/dw" +} +layer { + name: "conv13" + type: "Convolution" + bottom: "conv13/dw" + top: "conv13" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 1024 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: 
"conv13/bn" + type: "BatchNorm" + bottom: "conv13" + top: "conv13" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv13/scale" + type: "Scale" + bottom: "conv13" + top: "conv13" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv13/relu" + type: "ReLU" + bottom: "conv13" + top: "conv13" +} +layer { + name: "conv14_1" + type: "Convolution" + bottom: "conv13" + top: "conv14_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv14_1/bn" + type: "BatchNorm" + bottom: "conv14_1" + top: "conv14_1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv14_1/scale" + type: "Scale" + bottom: "conv14_1" + top: "conv14_1" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv14_1/relu" + type: "ReLU" + bottom: "conv14_1" + top: "conv14_1" +} +layer { + name: "conv14_2" + type: "Convolution" + bottom: "conv14_1" + top: "conv14_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv14_2/bn" + type: "BatchNorm" + bottom: "conv14_2" + top: "conv14_2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv14_2/scale" + type: "Scale" + bottom: "conv14_2" + top: "conv14_2" + 
param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv14_2/relu" + type: "ReLU" + bottom: "conv14_2" + top: "conv14_2" +} +layer { + name: "conv15_1" + type: "Convolution" + bottom: "conv14_2" + top: "conv15_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv15_1/bn" + type: "BatchNorm" + bottom: "conv15_1" + top: "conv15_1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv15_1/scale" + type: "Scale" + bottom: "conv15_1" + top: "conv15_1" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv15_1/relu" + type: "ReLU" + bottom: "conv15_1" + top: "conv15_1" +} +layer { + name: "conv15_2" + type: "Convolution" + bottom: "conv15_1" + top: "conv15_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv15_2/bn" + type: "BatchNorm" + bottom: "conv15_2" + top: "conv15_2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv15_2/scale" + type: "Scale" + bottom: "conv15_2" + top: "conv15_2" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv15_2/relu" + type: "ReLU" + bottom: "conv15_2" + top: "conv15_2" +} 
+layer { + name: "conv16_1" + type: "Convolution" + bottom: "conv15_2" + top: "conv16_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv16_1/bn" + type: "BatchNorm" + bottom: "conv16_1" + top: "conv16_1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv16_1/scale" + type: "Scale" + bottom: "conv16_1" + top: "conv16_1" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv16_1/relu" + type: "ReLU" + bottom: "conv16_1" + top: "conv16_1" +} +layer { + name: "conv16_2" + type: "Convolution" + bottom: "conv16_1" + top: "conv16_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv16_2/bn" + type: "BatchNorm" + bottom: "conv16_2" + top: "conv16_2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv16_2/scale" + type: "Scale" + bottom: "conv16_2" + top: "conv16_2" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv16_2/relu" + type: "ReLU" + bottom: "conv16_2" + top: "conv16_2" +} +layer { + name: "conv17_1" + type: "Convolution" + bottom: "conv16_2" + top: "conv17_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 64 + bias_term: false + kernel_size: 1 + weight_filler { + type: "msra" + } + } +} +layer { + name: 
"conv17_1/bn" + type: "BatchNorm" + bottom: "conv17_1" + top: "conv17_1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv17_1/scale" + type: "Scale" + bottom: "conv17_1" + top: "conv17_1" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv17_1/relu" + type: "ReLU" + bottom: "conv17_1" + top: "conv17_1" +} +layer { + name: "conv17_2" + type: "Convolution" + bottom: "conv17_1" + top: "conv17_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + } +} +layer { + name: "conv17_2/bn" + type: "BatchNorm" + bottom: "conv17_2" + top: "conv17_2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } +} +layer { + name: "conv17_2/scale" + type: "Scale" + bottom: "conv17_2" + top: "conv17_2" + param { + lr_mult: 0.1 + decay_mult: 0.0 + } + param { + lr_mult: 0.2 + decay_mult: 0.0 + } + scale_param { + filler { + value: 1 + } + bias_term: true + bias_filler { + value: 0 + } + } +} +layer { + name: "conv17_2/relu" + type: "ReLU" + bottom: "conv17_2" + top: "conv17_2" +} +layer { + name: "conv11_mbox_loc" + type: "Convolution" + bottom: "conv11" + top: "conv11_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 12 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv11_mbox_loc_perm" + type: "Permute" + bottom: "conv11_mbox_loc" + top: "conv11_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: 
"conv11_mbox_loc_flat" + type: "Flatten" + bottom: "conv11_mbox_loc_perm" + top: "conv11_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv11_mbox_conf" + type: "Convolution" + bottom: "conv11" + top: "conv11_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 63 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv11_mbox_conf_perm" + type: "Permute" + bottom: "conv11_mbox_conf" + top: "conv11_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv11_mbox_conf_flat" + type: "Flatten" + bottom: "conv11_mbox_conf_perm" + top: "conv11_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv11_mbox_priorbox" + type: "PriorBox" + bottom: "conv11" + bottom: "data" + top: "conv11_mbox_priorbox" + prior_box_param { + min_size: 60.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + offset: 0.5 + } +} +layer { + name: "conv13_mbox_loc" + type: "Convolution" + bottom: "conv13" + top: "conv13_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv13_mbox_loc_perm" + type: "Permute" + bottom: "conv13_mbox_loc" + top: "conv13_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv13_mbox_loc_flat" + type: "Flatten" + bottom: "conv13_mbox_loc_perm" + top: "conv13_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv13_mbox_conf" + type: "Convolution" + bottom: "conv13" + top: "conv13_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + 
decay_mult: 0.0 + } + convolution_param { + num_output: 126 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv13_mbox_conf_perm" + type: "Permute" + bottom: "conv13_mbox_conf" + top: "conv13_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv13_mbox_conf_flat" + type: "Flatten" + bottom: "conv13_mbox_conf_perm" + top: "conv13_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv13_mbox_priorbox" + type: "PriorBox" + bottom: "conv13" + bottom: "data" + top: "conv13_mbox_priorbox" + prior_box_param { + min_size: 105.0 + max_size: 150.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + offset: 0.5 + } +} +layer { + name: "conv14_2_mbox_loc" + type: "Convolution" + bottom: "conv14_2" + top: "conv14_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv14_2_mbox_loc_perm" + type: "Permute" + bottom: "conv14_2_mbox_loc" + top: "conv14_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv14_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv14_2_mbox_loc_perm" + top: "conv14_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv14_2_mbox_conf" + type: "Convolution" + bottom: "conv14_2" + top: "conv14_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 126 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv14_2_mbox_conf_perm" + type: "Permute" + bottom: 
"conv14_2_mbox_conf" + top: "conv14_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv14_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv14_2_mbox_conf_perm" + top: "conv14_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv14_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv14_2" + bottom: "data" + top: "conv14_2_mbox_priorbox" + prior_box_param { + min_size: 150.0 + max_size: 195.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + offset: 0.5 + } +} +layer { + name: "conv15_2_mbox_loc" + type: "Convolution" + bottom: "conv15_2" + top: "conv15_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv15_2_mbox_loc_perm" + type: "Permute" + bottom: "conv15_2_mbox_loc" + top: "conv15_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv15_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv15_2_mbox_loc_perm" + top: "conv15_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv15_2_mbox_conf" + type: "Convolution" + bottom: "conv15_2" + top: "conv15_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 126 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv15_2_mbox_conf_perm" + type: "Permute" + bottom: "conv15_2_mbox_conf" + top: "conv15_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv15_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv15_2_mbox_conf_perm" + top: 
"conv15_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv15_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv15_2" + bottom: "data" + top: "conv15_2_mbox_priorbox" + prior_box_param { + min_size: 195.0 + max_size: 240.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + offset: 0.5 + } +} +layer { + name: "conv16_2_mbox_loc" + type: "Convolution" + bottom: "conv16_2" + top: "conv16_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv16_2_mbox_loc_perm" + type: "Permute" + bottom: "conv16_2_mbox_loc" + top: "conv16_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv16_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv16_2_mbox_loc_perm" + top: "conv16_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv16_2_mbox_conf" + type: "Convolution" + bottom: "conv16_2" + top: "conv16_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 126 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv16_2_mbox_conf_perm" + type: "Permute" + bottom: "conv16_2_mbox_conf" + top: "conv16_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv16_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv16_2_mbox_conf_perm" + top: "conv16_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv16_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv16_2" + bottom: "data" + top: "conv16_2_mbox_priorbox" + prior_box_param { + min_size: 240.0 + 
max_size: 285.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + offset: 0.5 + } +} +layer { + name: "conv17_2_mbox_loc" + type: "Convolution" + bottom: "conv17_2" + top: "conv17_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv17_2_mbox_loc_perm" + type: "Permute" + bottom: "conv17_2_mbox_loc" + top: "conv17_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv17_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv17_2_mbox_loc_perm" + top: "conv17_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv17_2_mbox_conf" + type: "Convolution" + bottom: "conv17_2" + top: "conv17_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 126 + kernel_size: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv17_2_mbox_conf_perm" + type: "Permute" + bottom: "conv17_2_mbox_conf" + top: "conv17_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv17_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv17_2_mbox_conf_perm" + top: "conv17_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv17_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv17_2" + bottom: "data" + top: "conv17_2_mbox_priorbox" + prior_box_param { + min_size: 285.0 + max_size: 300.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: 
"conv11_mbox_loc_flat" + bottom: "conv13_mbox_loc_flat" + bottom: "conv14_2_mbox_loc_flat" + bottom: "conv15_2_mbox_loc_flat" + bottom: "conv16_2_mbox_loc_flat" + bottom: "conv17_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv11_mbox_conf_flat" + bottom: "conv13_mbox_conf_flat" + bottom: "conv14_2_mbox_conf_flat" + bottom: "conv15_2_mbox_conf_flat" + bottom: "conv16_2_mbox_conf_flat" + bottom: "conv17_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv11_mbox_priorbox" + bottom: "conv13_mbox_priorbox" + bottom: "conv14_2_mbox_priorbox" + bottom: "conv15_2_mbox_priorbox" + bottom: "conv16_2_mbox_priorbox" + bottom: "conv17_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} +layer { + name: "mbox_conf_reshape" + type: "Reshape" + bottom: "mbox_conf" + top: "mbox_conf_reshape" + reshape_param { + shape { + dim: 0 + dim: -1 + dim: 21 + } + } +} +layer { + name: "mbox_conf_softmax" + type: "Softmax" + bottom: "mbox_conf_reshape" + top: "mbox_conf_softmax" + softmax_param { + axis: 2 + } +} +layer { + name: "mbox_conf_flatten" + type: "Flatten" + bottom: "mbox_conf_softmax" + top: "mbox_conf_flatten" + flatten_param { + axis: 1 + } +} +layer { + name: "detection_out" + type: "DetectionOutput" + bottom: "mbox_loc" + bottom: "mbox_conf_flatten" + bottom: "mbox_priorbox" + top: "detection_out" + include { + phase: TEST + } + detection_output_param { + num_classes: 21 + share_location: true + background_label_id: 0 + nms_param { + nms_threshold: 0.45 + top_k: 100 + } + code_type: CENTER_SIZE + keep_top_k: 100 + confidence_threshold: 0.25 + } +} \ No newline at end of file diff --git a/samples/dnn/mobilenet_ssd_python.py b/samples/dnn/mobilenet_ssd_python.py new file mode 100644 index 0000000000..44b5b6000a --- /dev/null +++ b/samples/dnn/mobilenet_ssd_python.py @@ -0,0 +1,87 @@ +import 
numpy as np
+import argparse
+
+try:
+    import cv2 as cv
+except ImportError:
+    raise ImportError('Can\'t find OpenCV Python module. If you\'ve built it from sources without installation, '
+                      'configure environemnt variable PYTHONPATH to "opencv_build_dir/lib" directory (with "python3" subdirectory if required)')
+
+# Sample: run the MobileNet-SSD Caffe model on a video file or camera stream and
+# draw every detection whose confidence exceeds --thr.
+
+# Network input size; matches the 300x300 input_shape declared in the prototxt.
+inWidth = 300
+inHeight = 300
+WHRatio = inWidth / float(inHeight)
+# Preprocessing constants handed to blobFromImage; 0.007843 is approximately 1 / 127.5.
+inScaleFactor = 0.007843
+meanVal = 127.5
+
+# Labels indexed by the class id found in each detection row (21 entries,
+# index 0 being the background label).
+classNames = ('background',
+              'aeroplane', 'bicycle', 'bird', 'boat',
+              'bottle', 'bus', 'car', 'cat', 'chair',
+              'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant',
+              'sheep', 'sofa', 'train', 'tvmonitor')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--video", help="path to video file. If empty, camera's stream will be used")
+    parser.add_argument("--prototxt", default="MobileNetSSD_300x300.prototxt",
+                        help="path to caffe prototxt")
+    parser.add_argument("-c", "--caffemodel", help="path to caffemodel file, download it here: "
+                        "https://github.com/chuanqi305/MobileNet-SSD/blob/master/MobileNetSSD_train.caffemodel")
+    # type=float so that a threshold given on the command line is compared as a
+    # number rather than as a string.
+    parser.add_argument("--thr", default=0.2, type=float, help="confidence threshold to filter out weak detections")
+    args = parser.parse_args()
+
+    # The module is imported as `cv`, so every OpenCV name goes through that alias.
+    net = cv.dnn.readNetFromCaffe(args.prototxt, args.caffemodel)
+
+    # --video defaults to None; test truthiness instead of calling len() on it.
+    if args.video:
+        cap = cv.VideoCapture(args.video)
+    else:
+        cap = cv.VideoCapture(0)
+
+    while True:
+        # Capture frame-by-frame
+        ret, frame = cap.read()
+        if not ret:
+            # No frame was delivered: stop instead of passing None to the network.
+            break
+        blob = cv.dnn.blobFromImage(frame, inScaleFactor, (inWidth, inHeight), meanVal)
+        net.setInput(blob)
+        detections = net.forward()
+
+        cols = frame.shape[1]
+        rows = frame.shape[0]
+
+        # Center-crop the displayed frame to the network's aspect ratio.
+        # NOTE(review): presumably mirrors the crop blobFromImage applies - confirm.
+        if cols / float(rows) > WHRatio:
+            cropSize = (int(rows * WHRatio), rows)
+        else:
+            cropSize = (cols, int(cols / WHRatio))
+
+        # Floor division keeps the slice bounds integral on Python 3 as well.
+        y1 = (rows - cropSize[1]) // 2
+        y2 = y1 + cropSize[1]
+        x1 = (cols - cropSize[0]) // 2
+        x2 = x1 + cropSize[0]
+        frame = frame[y1:y2, x1:x2]
+
+        cols = frame.shape[1]
+        rows = frame.shape[0]
+
+        # Each detection row is read as: [1] class id, [2] confidence,
+        # [3..6] box corners as fractions of the (cropped) frame size.
+        for i in range(detections.shape[2]):
+            confidence = detections[0, 0, i, 2]
+            if confidence > args.thr:
+                class_id = int(detections[0, 0, i, 1])
+
+                xLeftBottom = int(detections[0, 0, i, 3] * cols)
+                yLeftBottom = int(detections[0, 0, i, 4] * rows)
+                xRightTop = int(detections[0, 0, i, 5] * cols)
+                yRightTop = int(detections[0, 0, i, 6] * rows)
+
+                cv.rectangle(frame, (xLeftBottom, yLeftBottom), (xRightTop, yRightTop),
+                             (0, 255, 0))
+                label = classNames[class_id] + ": " + str(confidence)
+                labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+
+                # White filled box behind the label so the black text stays readable.
+                cv.rectangle(frame, (xLeftBottom, yLeftBottom - labelSize[1]),
+                             (xLeftBottom + labelSize[0], yLeftBottom + baseLine),
+                             (255, 255, 255), cv.FILLED)
+                cv.putText(frame, label, (xLeftBottom, yLeftBottom),
+                           cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
+
+        cv.imshow("detections", frame)
+        # Any key press ends the loop.
+        if cv.waitKey(1) >= 0:
+            break
diff --git a/samples/dnn/ssd_mobilenet_object_detection.cpp b/samples/dnn/ssd_mobilenet_object_detection.cpp
new file mode 100644
index 0000000000..423b34dbcf
--- /dev/null
+++ b/samples/dnn/ssd_mobilenet_object_detection.cpp
@@ -0,0 +1,161 @@
+#include <opencv2/dnn.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/videoio.hpp>
+
+using namespace cv;
+using namespace cv::dnn;
+
+#include <iostream>
+#include <sstream>
+#include <string>
+using namespace std;
+
+const size_t inWidth = 300;
+const size_t inHeight = 300;
+const float WHRatio = inWidth / (float)inHeight;
+const float inScaleFactor = 0.007843f;
+const float meanVal = 127.5;
+const char* classNames[] = {"background",
+                            "aeroplane", "bicycle", "bird", "boat",
+                            "bottle", "bus", "car", "cat", "chair",
+                            "cow", "diningtable", "dog", "horse",
+                            "motorbike", "person", "pottedplant",
+                            "sheep", "sofa", "train", "tvmonitor"};
+
+const char* about = "This sample uses Single-Shot Detector "
+                    "(https://arxiv.org/abs/1512.02325)"
+                    "to detect objects on image.\n"
+                    ".caffemodel model's file is avaliable here: "
+                    "https://github.com/chuanqi305/MobileNet-SSD/blob/master/MobileNetSSD_train.caffemodel\n";
+
+const
char* params
+    = "{ help | false | print usage }"
+      "{ proto | MobileNetSSD_300x300.prototxt | model configuration }"
+      "{ model | | model weights }"
+      "{ video | | video for detection }"
+      "{ out | | path to output video file}"
+      "{ min_confidence | 0.2 | min confidence }";
+
+// Entry point of the sample.
+//
+// Loads the Caffe network named by --proto / --model, reads frames from --video
+// (falling back to camera 0 when that cannot be opened), runs the network on
+// every frame and draws each detection whose confidence exceeds
+// --min_confidence. The annotated frame is shown in a window and, when the
+// writer for --out could be opened, appended to that file.
+//
+// Returns 0 on normal termination (help requested, key pressed, or no more
+// frames) and -1 when no capture source could be opened.
+int main(int argc, char** argv)
+{
+    cv::CommandLineParser parser(argc, argv, params);
+
+    if (parser.get<bool>("help"))
+    {
+        cout << about << endl;
+        parser.printMessage();
+        return 0;
+    }
+
+    String modelConfiguration = parser.get<String>("proto");
+    String modelBinary = parser.get<String>("model");
+
+    // Neither value changes while the sample runs, so compute them once
+    // instead of once per frame.
+    const float confidenceThreshold = parser.get<float>("min_confidence");
+    const int classCount = static_cast<int>(sizeof(classNames) / sizeof(classNames[0]));
+
+    //! [Initialize network]
+    dnn::Net net = readNetFromCaffe(modelConfiguration, modelBinary);
+    //! [Initialize network]
+
+    VideoCapture cap(parser.get<String>("video"));
+    if(!cap.isOpened()) // check if we succeeded
+    {
+        cap = VideoCapture(0);
+        if(!cap.isOpened())
+        {
+            cout << "Couldn't find camera" << endl;
+            return -1;
+        }
+    }
+
+    Size inVideoSize = Size(static_cast<int>(cap.get(CAP_PROP_FRAME_WIDTH)),   //Acquire input size
+                            static_cast<int>(cap.get(CAP_PROP_FRAME_HEIGHT)));
+
+    // Largest centered rectangle of the input that has the network's aspect ratio.
+    Size cropSize;
+    if (inVideoSize.width / (float)inVideoSize.height > WHRatio)
+    {
+        cropSize = Size(static_cast<int>(inVideoSize.height * WHRatio),
+                        inVideoSize.height);
+    }
+    else
+    {
+        cropSize = Size(inVideoSize.width,
+                        static_cast<int>(inVideoSize.width / WHRatio));
+    }
+
+    Rect crop(Point((inVideoSize.width - cropSize.width) / 2,
+                    (inVideoSize.height - cropSize.height) / 2),
+              cropSize);
+
+    // The writer reuses the codec and frame rate reported by the capture source.
+    VideoWriter outputVideo;
+    outputVideo.open(parser.get<String>("out") ,
+                     static_cast<int>(cap.get(CAP_PROP_FOURCC)),
+                     cap.get(CAP_PROP_FPS), cropSize, true);
+
+    for(;;)
+    {
+        Mat frame;
+        cap >> frame; // get a new frame from camera
+        if (frame.empty())
+            break; // no frame delivered: stop instead of feeding an empty Mat to the network
+        //! [Prepare blob]
+
+        Mat inputBlob = blobFromImage(frame, inScaleFactor,
+                                      Size(static_cast<int>(inWidth), static_cast<int>(inHeight)),
+                                      meanVal); //Convert Mat to batch of images
+        //! [Prepare blob]
+
+        //! [Set input blob]
+        net.setInput(inputBlob, "data"); //set the network input
+        //! [Set input blob]
+
+        TickMeter tm;
+        tm.start();
+        //! [Make forward pass]
+        Mat detection = net.forward("detection_out"); //compute output
+        tm.stop();
+        cout << "Inference time, ms: " << tm.getTimeMilli() << endl;
+        //! [Make forward pass]
+
+        // View the last two dimensions of the output as a 2-D float matrix, one
+        // row per detection. Columns as read below: [1] class id, [2] confidence,
+        // [3..6] box corners as fractions of the frame size.
+        Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());
+
+        frame = frame(crop);
+
+        for(int i = 0; i < detectionMat.rows; i++)
+        {
+            float confidence = detectionMat.at<float>(i, 2);
+
+            if(confidence > confidenceThreshold)
+            {
+                // Guard the table lookup: an id outside the label table must not
+                // index past classNames.
+                const int objectClass = static_cast<int>(detectionMat.at<float>(i, 1));
+                const char* className = (objectClass >= 0 && objectClass < classCount)
+                                            ? classNames[objectClass]
+                                            : "unknown";
+
+                int xLeftBottom = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
+                int yLeftBottom = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
+                int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
+                int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);
+
+                ostringstream ss;
+                ss << confidence;
+                String conf(ss.str());
+
+                Rect object(xLeftBottom, yLeftBottom,
+                            xRightTop - xLeftBottom,
+                            yRightTop - yLeftBottom);
+
+                rectangle(frame, object, Scalar(0, 255, 0));
+                String label = String(className) + ": " + conf;
+                int baseLine = 0;
+                Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+                // White filled box behind the label so the black text stays readable.
+                rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom - labelSize.height),
+                                      Size(labelSize.width, labelSize.height + baseLine)),
+                          Scalar(255, 255, 255), FILLED);
+                putText(frame, label, Point(xLeftBottom, yLeftBottom),
+                        FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0));
+            }
+        }
+
+        if (outputVideo.isOpened())
+            outputVideo << frame;
+
+        imshow("detections", frame);
+        if (waitKey(1) >= 0) break; // any key press ends the loop
+    }
+
+    return 0;
+} // main