Merge pull request #9516 from arrybn:ssd_face_detector

This commit is contained in:
Alexander Alekhin 2017-08-31 11:43:04 +00:00
commit f6265500fb
8 changed files with 5705 additions and 0 deletions


@@ -8,6 +8,27 @@ ocv_check_dependencies(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
  project(dnn_samples)
  # Model branch name: dnn_samples_face_detector_20170830
  set(DNN_FACE_DETECTOR_MODEL_COMMIT "b2bfc75f6aea5b1f834ff0f0b865a7c18ff1459f")
  set(DNN_FACE_DETECTOR_MODEL_HASH "afbb6037fd180e8d2acb3b58ca737b9e")
  set(DNN_FACE_DETECTOR_MODEL_NAME "res10_300x300_ssd_iter_140000.caffemodel")
  set(DNN_FACE_DETECTOR_MODEL_DOWNLOAD_DIR "${CMAKE_CURRENT_LIST_DIR}/face_detector")
  if(COMMAND ocv_download)
    ocv_download(FILENAME ${DNN_FACE_DETECTOR_MODEL_NAME}
                 HASH ${DNN_FACE_DETECTOR_MODEL_HASH}
                 URL
                   "$ENV{OPENCV_DNN_MODELS_URL}"
                   "${OPENCV_DNN_MODELS_URL}"
                   "https://raw.githubusercontent.com/opencv/opencv_3rdparty/${DNN_FACE_DETECTOR_MODEL_COMMIT}/"
                 DESTINATION_DIR ${DNN_FACE_DETECTOR_MODEL_DOWNLOAD_DIR}
                 ID DNN_FACE_DETECTOR
                 RELATIVE_URL
                 STATUS res)
  endif()
  ocv_include_directories("${OpenCV_SOURCE_DIR}/include")
  ocv_include_modules_recurse(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})

samples/dnn/face_detector/.gitignore vendored Normal file

@@ -0,0 +1 @@
res10_300x300_ssd_iter_140000.caffemodel

File diff suppressed because it is too large


@@ -0,0 +1,79 @@
This is a brief description of the training process that was used to obtain res10_300x300_ssd_iter_140000.caffemodel.
The model was created with the SSD framework, using a ResNet-10-like architecture as a backbone. The channel counts of the ResNet-10 convolution layers were reduced significantly (2x or 4x fewer channels).
The model was trained with the Caffe framework on a large dataset that is available online.
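As a quick way to inspect the resulting network, you can load it with OpenCV's dnn module and list its layers (a minimal sketch, assuming the deploy.prototxt and .caffemodel files from this directory are next to the script):

import cv2 as cv

# Load the face detector and print its layers to see the reduced ResNet-10 + SSD structure.
net = cv.dnn.readNetFromCaffe('deploy.prototxt', 'res10_300x300_ssd_iter_140000.caffemodel')
for layer_name in net.getLayerNames():
    print(layer_name)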
1. Prepare training tools
You need the "ssd" branch of this repository: https://github.com/weiliu89/caffe/tree/ssd . Check out this branch and build it (see the instructions in the repo's README).
2. Prepare training data.
The data preparation pipeline can be represented as:
(a) Download the original face detection dataset -> (b) Convert the annotation to the PASCAL VOC format -> (c) Create LMDB databases with images + annotations for training
a) Find some datasets with face bounding box annotations. For some reasons I can't provide links here, but you can easily find them on your own. Also study the data: it may contain small or low-quality faces which can spoil the training process. Often the annotation contains special flags about object quality; remove such faces from the annotation (smaller than 16 pixels along at least one side, blurred, highly occluded, and so on).
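For illustration, a minimal filtering sketch along these lines (the box format and the quality flag below are hypothetical; adapt them to your dataset's real annotation):

MIN_SIDE = 16  # drop faces smaller than 16 pixels along any side

def keep_face(box, bad_quality):
    # box is an illustrative (xmin, ymin, xmax, ymax) tuple in pixels
    xmin, ymin, xmax, ymax = box
    if bad_quality:  # blurred, highly occluded, etc., according to the dataset's own flags
        return False
    return (xmax - xmin) >= MIN_SIDE and (ymax - ymin) >= MIN_SIDE

annotated_faces = [((10, 10, 60, 60), False), ((5, 5, 15, 15), False), ((0, 0, 50, 50), True)]
kept = [box for box, bad in annotated_faces if keep_face(box, bad)]
print(kept)  # only the first box survives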
b) The downloaded dataset will come with its own annotation format. It may be one single file for all images, a separate file for each image, or something else. But to train SSD in Caffe you need to convert the annotation to the PASCAL VOC format.
A PASCAL VOC annotation consists of an .xml file per image. In this xml file all face bounding boxes should be listed as:
<annotation>
  <size>
    <width>300</width>
    <height>300</height>
  </size>
  <object>
    <name>face</name>
    <difficult>0</difficult>
    <bndbox>
      <xmin>100</xmin>
      <ymin>100</ymin>
      <xmax>200</xmax>
      <ymax>200</ymax>
    </bndbox>
  </object>
  <object>
    <name>face</name>
    <difficult>0</difficult>
    <bndbox>
      <xmin>0</xmin>
      <ymin>0</ymin>
      <xmax>100</xmax>
      <ymax>100</ymax>
    </bndbox>
  </object>
</annotation>
So, convert your dataset's annotation to the format above.
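A minimal sketch of writing one such .xml file with Python's standard library (write_voc_xml and its arguments are just an example, not part of any tool mentioned in this document; it assumes the annotations_val/ directory already exists):

import xml.etree.ElementTree as ET

def write_voc_xml(xml_path, img_w, img_h, boxes):
    # boxes: list of (xmin, ymin, xmax, ymax) face rectangles in pixels
    root = ET.Element('annotation')
    size = ET.SubElement(root, 'size')
    ET.SubElement(size, 'width').text = str(img_w)
    ET.SubElement(size, 'height').text = str(img_h)
    for xmin, ymin, xmax, ymax in boxes:
        obj = ET.SubElement(root, 'object')
        ET.SubElement(obj, 'name').text = 'face'
        ET.SubElement(obj, 'difficult').text = '0'
        bndbox = ET.SubElement(obj, 'bndbox')
        ET.SubElement(bndbox, 'xmin').text = str(xmin)
        ET.SubElement(bndbox, 'ymin').text = str(ymin)
        ET.SubElement(bndbox, 'xmax').text = str(xmax)
        ET.SubElement(bndbox, 'ymax').text = str(ymax)
    ET.ElementTree(root).write(xml_path)

# produces an annotation equivalent to the example above
write_voc_xml('annotations_val/0.jpg.xml', 300, 300, [(100, 100, 200, 200), (0, 0, 100, 100)])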
Also, you should create a labelmap.prototxt file with the following content:
item {
  name: "none_of_the_above"
  label: 0
  display_name: "background"
}
item {
  name: "face"
  label: 1
  display_name: "face"
}
You need this file to establish the correspondence between class names and numeric class labels.
For the next step we also need a file where all our image/annotation file name pairs are listed. Each line of this file should look like:
images_val/0.jpg annotations_val/0.jpg.xml
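One way to generate such a listing file (a sketch, assuming images are stored in images_val/ and the .xml files in annotations_val/ with matching names, as in the line above):

import os

with open('val.txt', 'w') as listing:
    for img_name in sorted(os.listdir('images_val')):
        xml_name = img_name + '.xml'
        if os.path.exists(os.path.join('annotations_val', xml_name)):
            # each line: <image path> <annotation path>
            listing.write('images_val/%s annotations_val/%s\n' % (img_name, xml_name))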
c) To create the LMDB databases you need to use the create_data.sh tool from the caffe/data/VOC0712 directory of Caffe's source code.
This script calls create_annoset.py internally, so check which arguments you need to pass to it.
You need to prepare 2 LMDB databases: one for the training images and one for the validation images.
3. Train your detector
For training you need 3 files: train.prototxt, test.prototxt and solver.prototxt. You can find these files in the same directory as this readme.
You also need to edit train.prototxt and test.prototxt so that their LMDB database paths point to the databases you created in step 2.
Now everything is ready to launch the training process.
Execute the following lines in a terminal:
mkdir -p snapshot
mkdir -p log
/path_for_caffe_build_dir/tools/caffe train -solver="solver.prototxt" -gpu 0 2>&1 | tee -a log/log.log
And wait. It will take about 8 hours to finish the process.
After that you can use the .caffemodel from the snapshot/ subdirectory in the resnet_face_ssd_python.py sample.
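For example, in that sample you would only need to change the model path; the exact paths below are illustrative and follow the snapshot settings from solver.prototxt:

from cv2 import dnn

# use your own snapshot instead of the downloaded model (deploy.prototxt stays the same)
prototxt = 'face_detector/deploy.prototxt'
caffemodel = 'snapshot/res10_300x300_ssd_iter_140000.caffemodel'
net = dnn.readNetFromCaffe(prototxt, caffemodel)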


@@ -0,0 +1,28 @@
train_net: "train.prototxt"
test_net: "test.prototxt"
test_iter: 2312
test_interval: 5000
test_initialization: true
base_lr: 0.01
display: 10
lr_policy: "multistep"
max_iter: 140000
stepvalue: 80000
stepvalue: 120000
gamma: 0.1
momentum: 0.9
weight_decay: 0.0005
average_loss: 500
iter_size: 1
type: "SGD"
solver_mode: GPU
random_seed: 0
debug_info: false
snapshot: 1000
snapshot_prefix: "snapshot/res10_300x300_ssd"
eval_type: "detection"
ap_version: "11point"
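Besides the caffe command-line tool shown in the readme above, the same solver file can also be driven from Python (a sketch, assuming pycaffe from the "ssd" branch is built and on PYTHONPATH):

import caffe

caffe.set_device(0)   # GPU id; matches solver_mode: GPU
caffe.set_mode_gpu()
solver = caffe.SGDSolver('solver.prototxt')  # type: "SGD" in the config above
solver.solve()        # trains up to max_iter, writing snapshots with the prefix above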

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,59 @@
import numpy as np

try:
    import cv2 as cv
except ImportError:
    raise ImportError('Can\'t find OpenCV Python module. If you\'ve built it from sources without installation, '
                      'configure environment variable PYTHONPATH to "opencv_build_dir/lib" directory (with "python3" subdirectory if required)')

from cv2 import dnn

# Network input size and detection confidence threshold
inWidth = 300
inHeight = 300
confThreshold = 0.5

prototxt = 'face_detector/deploy.prototxt'
caffemodel = 'face_detector/res10_300x300_ssd_iter_140000.caffemodel'

if __name__ == '__main__':
    net = dnn.readNetFromCaffe(prototxt, caffemodel)

    cap = cv.VideoCapture(0)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        cols = frame.shape[1]
        rows = frame.shape[0]

        # Resize to 300x300 and subtract the mean values (104, 177, 123) used during training
        net.setInput(dnn.blobFromImage(cv.resize(frame, (inWidth, inHeight)),
                                       1.0, (inWidth, inHeight), (104., 177., 123.)))
        # Output shape is [1, 1, N, 7]: image id, class id, confidence, xmin, ymin, xmax, ymax
        detections = net.forward()

        perf_stats = net.getPerfProfile()
        print('Inference time, ms: %.2f' % (perf_stats[0] / cv.getTickFrequency() * 1000))

        for i in range(detections.shape[2]):
            confidence = detections[0, 0, i, 2]
            if confidence > confThreshold:
                # Coordinates are relative; scale them back to the original frame size
                xLeftBottom = int(detections[0, 0, i, 3] * cols)
                yLeftBottom = int(detections[0, 0, i, 4] * rows)
                xRightTop = int(detections[0, 0, i, 5] * cols)
                yRightTop = int(detections[0, 0, i, 6] * rows)

                cv.rectangle(frame, (xLeftBottom, yLeftBottom), (xRightTop, yRightTop),
                             (0, 255, 0))
                label = "face: %.4f" % confidence
                labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)

                cv.rectangle(frame, (xLeftBottom, yLeftBottom - labelSize[1]),
                             (xLeftBottom + labelSize[0], yLeftBottom + baseLine),
                             (255, 255, 255), cv.FILLED)
                cv.putText(frame, label, (xLeftBottom, yLeftBottom),
                           cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

        cv.imshow("detections", frame)
        if cv.waitKey(1) != -1:
            break