Layers for fast-neural-style models: https://github.com/jcjohnson/fast-neural-style

2025-07-29 00:33:40 +08:00 · 2017-10-12 18:29:17 +03:00 · 2017-10-12 18:29:17 +03:00 · 4b52b8df34
commit 4b52b8df34
parent 60cbc46da1
7 changed files with 218 additions and 26 deletions
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@ -377,6 +377,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
     *                 starting from the first one. The rest of dimensions won't
     *                 be padded.
     * @param value Value to be padded. Defaults to zero.
+     * @param type Padding type: 'constant', 'reflect'
     * @param input_dims Torch's parameter. If @p input_dims is not equal to the
     *                   actual input dimensionality then the `[0]th` dimension
     *                   is considered as a batch dimension and @p paddings are shifted
--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@ -112,16 +112,12 @@ static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const
 static inline Mat getPlane(const Mat &m, int n, int cn)  // Returns the sub-array m[n][cn] with dimensions m.dims - 2.
 {
    CV_Assert(m.dims > 2);
-    Range range[CV_MAX_DIM];
    int sz[CV_MAX_DIM];
    for(int i = 2; i < m.dims; i++)
    {
        sz[i-2] = m.size.p[i];
-        range[i] = Range::all();
    }
-    range[0] = Range(n, n+1);
-    range[1] = Range(cn, cn+1);
-    return m(range).reshape(1, m.dims-2, sz);
+    return Mat(m.dims - 2, sz, m.type(), (void*)m.ptr<float>(n, cn));  // Header over m's data, no copy. NOTE(review): no steps are passed, so this presumably requires m to be continuous - confirm.
 }

 static inline MatShape shape(const int* dims, const int n = 4)
@ -191,6 +187,14 @@ inline int clamp(int ax, const MatShape& shape)
    return clamp(ax, (int)shape.size());
 }

+inline Range clamp(const Range& r, int axisSize)  // Resolves r into a concrete range for an axis of axisSize elements.
+{
+    Range clamped(std::max(r.start, 0),  // A negative start is raised to 0.
+                  r.end > 0 ? std::min(r.end, axisSize) : axisSize + r.end + 1);  // A non-positive end is an offset from the axis end: -1 maps to axisSize.
+    CV_Assert(clamped.start < clamped.end, clamped.end <= axisSize);  // Rejects empty ranges and ends past the axis.
+    return clamped;
+}
+
 CV__DNN_EXPERIMENTAL_NS_END
 }
 }
--- a/modules/dnn/src/layers/padding_layer.cpp
+++ b/modules/dnn/src/layers/padding_layer.cpp
@ -10,6 +10,7 @@ Implementation of padding layer, which adds paddings to input blob.
 */

 #include "../precomp.hpp"
+#include "layers_common.hpp"
 #include "op_halide.hpp"
 #include <vector>

@ -26,6 +27,7 @@ public:
        setParamsFrom(params);
        paddingValue = params.get<float>("value", 0);
        inputDims = params.get<int>("input_dims", -1);
+        paddingType = params.get<String>("type", "constant");

        CV_Assert(params.has("paddings"));
        const DictValue& paddingsParam = params.get("paddings");
@ -94,9 +96,46 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

+        if (paddingType == "constant")
+        {
            outputs[0].setTo(paddingValue);
            inputs[0]->copyTo(outputs[0](dstRanges));
        }
+        else if (paddingType == "reflect")
+        {
+            CV_Assert(inputs.size() == 1);
+            CV_Assert(outputs.size() == 1);
+            CV_Assert(inputs[0]->dims == 4);
+            CV_Assert(outputs[0].dims == 4);
+
+            if (inputs[0]->size[0] != outputs[0].size[0] || inputs[0]->size[1] != outputs[0].size[1])
+                CV_Error(Error::StsNotImplemented, "Only spatial reflection padding is supported.");
+
+            const int inpHeight = inputs[0]->size[2];
+            const int inpWidth = inputs[0]->size[3];
+            const int outHeight = outputs[0].size[2];
+            const int outWidth = outputs[0].size[3];
+            const int padTop = dstRanges[2].start;
+            const int padBottom = outHeight - dstRanges[2].end;
+            const int padLeft = dstRanges[3].start;
+            const int padRight = outWidth - dstRanges[3].end;
+            CV_Assert(padTop < inpHeight, padBottom < inpHeight,
+                      padLeft < inpWidth, padRight < inpWidth);
+
+            for (int n = 0; n < inputs[0]->size[0]; ++n)  // Batch items; int matches the signed size[] and getPlane's parameters.
+            {
+                for (int ch = 0; ch < inputs[0]->size[1]; ++ch)  // Channels of the current batch item.
+                {
+                    copyMakeBorder(getPlane(*inputs[0], n, ch),  // Pad each 2D plane independently.
+                                   getPlane(outputs[0], n, ch),
+                                   padTop, padBottom, padLeft, padRight,
+                                   BORDER_REFLECT_101);  // Mirror without repeating the edge pixel.
+                }
+            }
+        }
+        else
+            CV_Error(Error::StsNotImplemented, "Unknown padding type: " + paddingType);
+    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
@ -124,6 +163,7 @@ private:
    std::vector<Range> dstRanges;
    int inputDims;
    float paddingValue;
+    std::string paddingType;
 };

 Ptr<PaddingLayer> PaddingLayer::create(const LayerParams &params)
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -58,7 +58,7 @@ public:
        axis = params.get<int>("axis", 1);
        if (params.has("slice_point"))
        {
-            CV_Assert(!params.has("begin") && !params.has("size"));
+            CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end"));
            const DictValue &indicesValue = params.get("slice_point");
            sliceRanges.resize(indicesValue.size() + 1,
                               std::vector<Range>(axis + 1, Range::all()));
@ -71,24 +71,34 @@ public:
            }
            sliceRanges.back()[axis].start = prevSlice;
        }
-        else if (params.has("begin") && params.has("size"))
+        else if (params.has("begin"))
        {
+            CV_Assert(params.has("size") ^ params.has("end"));
            const DictValue &begins = params.get("begin");
-            const DictValue &sizes = params.get("size");
-            CV_Assert(begins.size() == sizes.size());
+            const DictValue &sizesOrEnds = params.has("size") ? params.get("size") : params.get("end");
+            CV_Assert(begins.size() == sizesOrEnds.size());

            sliceRanges.resize(1);
            sliceRanges[0].resize(begins.size(), Range::all());
            for (int i = 0; i < begins.size(); ++i)
            {
                int start = begins.get<int>(i);
-                int size = sizes.get<int>(i);
+                int sizeOrEnd = sizesOrEnds.get<int>(i);  // It may be negative to reverse indexation.
                CV_Assert(start >= 0);
-                CV_Assert(size == -1 || size > 0);  // -1 value means range [start, axis_size).

                sliceRanges[0][i].start = start;
-                if (size > 0)
-                    sliceRanges[0][i].end = start + size;
+                if (params.has("size"))
+                {
+                    int size = sizeOrEnd;
+                    CV_Assert(size == -1 || size > 0);  // -1 value means range [start, axis_size).
+                    sliceRanges[0][i].end = size > 0 ? start + size : -1;  // Branch on size, not start: size == -1 keeps end at -1 for any start. We'll finalize a negative value later.
+                }
+                else
+                {
+                    int end = sizeOrEnd;
+                    CV_Assert(end < 0 || end > start);  // End index is excluded.
+                    sliceRanges[0][i].end = end;  // We'll finalize a negative value later.
+                }
            }
        }
    }
@ -109,8 +119,7 @@ public:
                CV_Assert(sliceRanges[i].size() <= inpShape.size());
                for (int j = 0; j < sliceRanges[i].size(); ++j)
                {
-                    outputs[i][j] = std::min(sliceRanges[i][j].end, inpShape[j]) -
-                                    std::max(sliceRanges[i][j].start, 0);
+                    outputs[i][j] = clamp(sliceRanges[i][j], inpShape[j]).size();
                }
            }
        }
@ -152,8 +161,7 @@ public:
            // Clamp.
            for (int j = 0; j < sliceRanges[i].size(); ++j)
            {
-                sliceRanges[i][j].start = std::max(0, sliceRanges[i][j].start);
-                sliceRanges[i][j].end = std::min(sliceRanges[i][j].end, inpShape[j]);
+                sliceRanges[i][j] = clamp(sliceRanges[i][j], inpShape[j]);
            }
            // Fill the rest of ranges.
            for (int j = sliceRanges[i].size(); j < inpShape[-1]; ++j)
--- a/modules/dnn/src/torch/torch_importer.cpp
+++ b/modules/dnn/src/torch/torch_importer.cpp
@ -617,7 +617,7 @@ struct TorchImporter : public ::cv::dnn::Importer
                curModule->modules.push_back(cv::Ptr<Module>(new Module(nnName, "Sigmoid")));
                readObject();
            }
-            else if (nnName == "SpatialBatchNormalization")
+            else if (nnName == "SpatialBatchNormalization" || nnName == "InstanceNormalization")
            {
                newModule->apiType = "BatchNorm";
                readTorchTable(scalarParams, tensorParams);
@ -626,19 +626,31 @@ struct TorchImporter : public ::cv::dnn::Importer
                float eps = float(scalarParams.get<double>("eps"));
                layerParams.set("eps", eps);

-                CV_Assert((tensorParams.count("running_var") || tensorParams.count("running_std")) &&
-                          tensorParams.count("running_mean"));
+                if (tensorParams.count("running_mean"))
+                {
                    layerParams.blobs.push_back(tensorParams["running_mean"].second);
+                }
+                else
+                {
+                    CV_Assert(scalarParams.has("nOutput"));
+                    layerParams.blobs.push_back(Mat::zeros(1, scalarParams.get<int>("nOutput"), CV_32F));
+                }
+
                if (tensorParams.count("running_var"))
                {
                    layerParams.blobs.push_back(tensorParams["running_var"].second);
                }
-                else
+                else if (tensorParams.count("running_std"))
                {
                    layerParams.blobs.push_back(tensorParams["running_std"].second);
                    pow(layerParams.blobs.back(), -2, layerParams.blobs.back());
                    subtract(layerParams.blobs.back(), eps, layerParams.blobs.back());
                }
+                else
+                {
+                    CV_Assert(scalarParams.has("nOutput"));
+                    layerParams.blobs.push_back(Mat::ones(1, scalarParams.get<int>("nOutput"), CV_32F));
+                }

                if (tensorParams.count("weight"))
                {
@ -652,6 +664,16 @@ struct TorchImporter : public ::cv::dnn::Importer
                    layerParams.blobs.push_back(tensorParams["bias"].second);
                }

+                if (nnName == "InstanceNormalization")
+                {
+                    cv::Ptr<Module> mvnModule(new Module(nnName));
+                    mvnModule->apiType = "MVN";
+                    curModule->modules.push_back(mvnModule);
+
+                    layerParams.blobs[0].setTo(0);  // batch norm's mean
+                    layerParams.blobs[1].setTo(1);  // batch norm's std
+                }
+
                curModule->modules.push_back(newModule);
            }
            else if (nnName == "PReLU")
@ -691,7 +713,9 @@ struct TorchImporter : public ::cv::dnn::Importer
                layerParams.set("scale", scale);
                curModule->modules.push_back(newModule);
            }
-            else if (nnName == "Identity")
+            // TotalVariation layer is from fast-neural-style project: https://github.com/jcjohnson/fast-neural-style
+            // It's a loss function that has an Identity forward.
+            else if (nnName == "Identity" || nnName == "TotalVariation")
            {
                readTorchTable(scalarParams, tensorParams);
                newModule->apiType = "Identity";
@ -866,7 +890,7 @@ struct TorchImporter : public ::cv::dnn::Importer
                layerParams.set("scale", scalarParams.get<float>("constant_scalar"));
                curModule->modules.push_back(newModule);
            }
-            else if (nnName == "SpatialZeroPadding")
+            else if (nnName == "SpatialZeroPadding" || nnName == "SpatialReflectionPadding")
            {
                readTorchTable(scalarParams, tensorParams);
                CV_Assert(scalarParams.has("pad_l"), scalarParams.has("pad_r"),
@ -889,6 +913,26 @@ struct TorchImporter : public ::cv::dnn::Importer
                paddings[5] = padRight;
                layerParams.set("paddings", DictValue::arrayInt<int*>(&paddings[0], paddings.size()));
                layerParams.set("input_dims", 3);
+
+                if (nnName == "SpatialReflectionPadding")
+                    layerParams.set("type", "reflect");
+
+                curModule->modules.push_back(newModule);
+            }
+            else if (nnName == "ShaveImage")
+            {
+                // ShaveImage layer is from fast-neural-style project: https://github.com/jcjohnson/fast-neural-style
+                // It may be mapped to Slice layer.
+                readTorchTable(scalarParams, tensorParams);
+                CV_Assert(scalarParams.has("size"));
+                int size = scalarParams.get<int>("size");
+
+                int begins[] = {0, 0, size, size};
+                int ends[] = {-1, -1, -size - 1, -size - 1};
+
+                newModule->apiType = "Slice";
+                layerParams.set("begin", DictValue::arrayInt<int*>(&begins[0], 4));
+                layerParams.set("end", DictValue::arrayInt<int*>(&ends[0], 4));
                curModule->modules.push_back(newModule);
            }
            else
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@ -231,6 +231,7 @@ TEST(Torch_Importer, net_padding)
 {
    runTorchNet("net_padding", DNN_TARGET_CPU, "", false, true);
    runTorchNet("net_spatial_zero_padding", DNN_TARGET_CPU, "", false, true);
+    runTorchNet("net_spatial_reflection_padding", DNN_TARGET_CPU, "", false, true);
 }

 TEST(Torch_Importer, ENet_accuracy)
@ -338,6 +339,49 @@ OCL_TEST(Torch_Importer, ENet_accuracy)
    }
 }

+// Check accuracy of style transfer models from https://github.com/jcjohnson/fast-neural-style
+// th fast_neural_style.lua \
+//   -input_image ~/opencv_extra/testdata/dnn/googlenet_1.png \
+//   -output_image lena.png \
+//   -median_filter 0 \
+//   -image_size 0 \
+//   -model models/eccv16/starry_night.t7
+// th fast_neural_style.lua \
+//   -input_image ~/opencv_extra/testdata/dnn/googlenet_1.png \
+//   -output_image lena.png \
+//   -median_filter 0 \
+//   -image_size 0 \
+//   -model models/instance_norm/feathers.t7
+TEST(Torch_Importer, FastNeuralStyle_accuracy)
+{
+    std::string models[] = {"dnn/fast_neural_style_eccv16_starry_night.t7",
+                            "dnn/fast_neural_style_instance_norm_feathers.t7"};
+    std::string targets[] = {"dnn/lena_starry_night.png", "dnn/lena_feathers.png"};
+
+    for (int i = 0; i < 2; ++i)
+    {
+        const string model = findDataFile(models[i], false);
+        Net net = readNetFromTorch(model);
+
+        Mat img = imread(findDataFile("dnn/googlenet_1.png", false));
+        Mat inputBlob = blobFromImage(img, 1.0, Size(), Scalar(103.939, 116.779, 123.68), false);
+
+        net.setInput(inputBlob);
+        Mat out = net.forward();
+
+        // Deprocessing.
+        getPlane(out, 0, 0) += 103.939;
+        getPlane(out, 0, 1) += 116.779;
+        getPlane(out, 0, 2) += 123.68;
+        out = cv::min(cv::max(0, out), 255);
+
+        Mat ref = imread(findDataFile(targets[i]));
+        Mat refBlob = blobFromImage(ref, 1.0, Size(), Scalar(), false);
+
+        normAssert(out, refBlob, "", 0.5, 1.1);
+    }
+}
+
 }

 #endif
--- a/samples/dnn/fast_neural_style.py
+++ b/samples/dnn/fast_neural_style.py
@ -0,0 +1,51 @@
+import cv2 as cv
+import numpy as np
+import argparse
+
+parser = argparse.ArgumentParser(
+        description='This script is used to run style transfer models from '
+                    'https://github.com/jcjohnson/fast-neural-style using OpenCV')
+parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
+parser.add_argument('--model', help='Path to .t7 model')
+parser.add_argument('--width', default=-1, type=int, help='Resize input to specific width.')
+parser.add_argument('--height', default=-1, type=int, help='Resize input to specific height.')
+parser.add_argument('--median_filter', default=0, type=int, help='Kernel size of postprocessing blurring.')
+args = parser.parse_args()
+
+net = cv.dnn.readNetFromTorch(args.model)
+
+if args.input:
+    cap = cv.VideoCapture(args.input)
+else:
+    cap = cv.VideoCapture(0)
+
+cv.namedWindow('Styled image', cv.WINDOW_NORMAL)
+while cv.waitKey(1) < 0:
+    hasFrame, frame = cap.read()
+    if not hasFrame:
+        cv.waitKey()
+        break
+
+    inWidth = args.width if args.width != -1 else frame.shape[1]
+    inHeight = args.height if args.height != -1 else frame.shape[0]
+    inp = cv.dnn.blobFromImage(frame, 1.0, (inWidth, inHeight),
+                              (103.939, 116.779, 123.68), swapRB=False, crop=False)
+
+    net.setInput(inp)
+    out = net.forward()
+
+    out = out.reshape(3, out.shape[2], out.shape[3])
+    out[0] += 103.939
+    out[1] += 116.779
+    out[2] += 123.68
+    out /= 255
+    out = out.transpose(1, 2, 0)
+
+    t, _ = net.getPerfProfile()  # overall inference time, in ticks
+    freq = cv.getTickFrequency() / 1000  # ticks per millisecond
+    print('%s ms' % (t / freq))  # print() call form runs on both Python 2 and Python 3
+
+    if args.median_filter:
+        out = cv.medianBlur(out, args.median_filter)
+
+    cv.imshow('Styled image', out)