From 8108bd30febf17e81f8329ccb65f695dd335a471 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 27 Sep 2012 12:44:06 +0400
Subject: [PATCH] optimize memory usage

---
 modules/gpu/src/cuda/isf-sc.cu  | 306 +++++++++++++-------------------
 modules/gpu/src/icf.hpp         |  29 +--
 modules/gpu/src/softcascade.cpp |  86 +++++----
 3 files changed, 180 insertions(+), 241 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 714bdfa44a..c8dff34bdc 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -42,18 +42,17 @@
 
 #include <opencv2/gpu/device/common.hpp>
 #include <icf.hpp>
-// #include <opencv2/gpu/device/saturate_cast.hpp>
 #include <stdio.h>
-// #include <float.h>
+#include <float.h>
 
-// //#define LOG_CUDA_CASCADE
+// #define LOG_CUDA_CASCADE
 
-// #if defined LOG_CUDA_CASCADE
-// # define dprintf(format, ...) \
-//             do { printf(format, __VA_ARGS__); } while (0)
-// #else
-// # define dprintf(format, ...)
-// #endif
+#if defined LOG_CUDA_CASCADE
+# define dprintf(format, ...) \
+            do { printf(format, __VA_ARGS__); } while (0)
+#else
+# define dprintf(format, ...)
+#endif
 
 namespace cv { namespace gpu { namespace device {
 namespace icf {
@@ -94,32 +93,128 @@ namespace icf {
         cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    texture<float2,  cudaTextureType1D, cudaReadModeElementType> tnode;
+    texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
+    // ToDo: do it in load time
+    // __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
+    // {
+    //     scaledRect = node.rect;
+    //     return (float)(node.threshold & 0x0FFFFFFFU);
+    // }
+
+    __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
+    {
+        float relScale = level.relScale;
+        float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
+
+        dprintf("feature %d box %d %d %d %d\n", (node.threshold >> 28), scaledRect.x, scaledRect.y,
+            scaledRect.z, scaledRect.w);
+        dprintf("rescale: %f [%f %f] selected %f\n",level.relScale, level.scaling[0], level.scaling[1],
+            level.scaling[(node.threshold >> 28) > 6]);
+
+        // rescale
+        scaledRect.x = __float2int_rn(relScale * scaledRect.x);
+        scaledRect.y = __float2int_rn(relScale * scaledRect.y);
+        scaledRect.z = __float2int_rn(relScale * scaledRect.z);
+        scaledRect.w = __float2int_rn(relScale * scaledRect.w);
+
+        float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
+
+        float approx = 1.f;
+        // if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
+        {
+            const float expected_new_area = farea * relScale * relScale;
+            approx =  sarea / expected_new_area;
+        }
+
+        dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", (node.threshold >> 28),
+        scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
+
+
+        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
+        rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
+
+        dprintf("approximation %f %d -> %f %f\n", approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
+            level.scaling[(node.threshold >> 28) > 6]);
+
+        return rootThreshold;
+    }
+
+    __device__ __forceinline__ int get(const int x, int y, int channel, uchar4 area)
+    {
+
+        dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
+        dprintf("get for channel %d\n", channel);
+        dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
+            x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
+            x + area.x, y + area.w);
+        dprintf("at point %d %d with offset %d\n", x, y, 0);
+
+        int offset = channel * 121;
+        y += offset;
+
+        int a = tex2D(thogluv, x + area.x, y + area.y);
+        int b = tex2D(thogluv, x + area.z, y + area.y);
+        int c = tex2D(thogluv, x + area.z, y + area.w);
+        int d = tex2D(thogluv, x + area.x, y + area.w);
+
+        dprintf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
+
+        return (a - b + c - d);
+    }
+
     __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes,
-        PtrStepSz<uchar4> objects)
+        const Node* nodes, const float* leaves, PtrStepSz<uchar4> objects)
     {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         Level level = levels[blockIdx.z];
+
+        // if (x > 0 || y > 0 || blockIdx.z > 0) return;
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
         Octave octave = octaves[level.octave];
+
         int st = octave.index * octave.stages;
         const int stEnd = st + 1000;//octave.stages;
 
         float confidence = 0.f;
 
-#pragma unroll 8
+// #pragma unroll 8
         for(; st < stEnd; ++st)
         {
+            dprintf("\n\nstage: %d\n", st);
             const int nId = st * 3;
-            const Node node = nodes[nId];
+            Node node = nodes[nId];
 
-            const float stage = stages[st];
-            confidence += node.rect.x * stage;
+            dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
+                node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
+
+            float threshold = rescale(level, node.rect, node);
+            int sum = get(x, y, (node.threshold >> 28), node.rect);
+
+            dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
+                node.rect.w, threshold);
+
+            int next = 1 + (int)(sum >= threshold);
+            dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
+
+            node = nodes[nId + next];
+            threshold = rescale(level, node.rect, node);
+            sum = get(x, y, (node.threshold >> 28), node.rect);
+
+            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+            float impact = leaves[st * 4 + lShift];
+            confidence += impact;
+
+            if (confidence <= stages[st]) st = stEnd + 1;
+            dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
+            dprintf("extracted stage: %f\n", stages[st]);
+            dprintf("computed  score: %f\n\n", confidence);
         }
 
+        // if (st == stEnd)
+        //     printf("%d %d %d\n", x, y, st);
+
         uchar4 val;
         val.x = (int)confidence;
         if (x == y) objects(0, threadIdx.x) = val;
@@ -127,188 +222,27 @@ namespace icf {
     }
 
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzb& features,
-        PtrStepSz<uchar4> objects)
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects)
     {
         int fw = 160;
         int fh = 120;
+
         dim3 block(32, 8);
         dim3 grid(fw / 32, fh / 8, 47);
+
         const Level* l = (const Level*)levels.ptr();
         const Octave* oct = ((const Octave*)octaves.ptr());
         const float* st = (const float*)stages.ptr();
         const Node* nd = (const Node*)nodes.ptr();
-        // cudaSafeCall( cudaBindTexture(0, tnode, nodes.data, rgb.cols / size) );
+        const float* lf = (const float*)leaves.ptr();
 
-        test_kernel<<<grid, block>>>(l, oct, st, nd, objects);
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
+        cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
+
+        test_kernel<<<grid, block>>>(l, oct, st, nd, lf, objects);
 
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
     }
 }
-}}}
-
-// __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
-//     PtrStepSz<uchar4> objects)
-// {
-//     cascade.detectAt(hogluv, pitch, objects);
-// }
-
-// }
-
-// float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
-//                                      const int channel, const float threshold) const
-// {
-//     dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
-//     dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
-
-//     float relScale = level.relScale;
-//     float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
-
-//     // rescale
-//     scaledRect.x = __float2int_rn(relScale * scaledRect.x);
-//     scaledRect.y = __float2int_rn(relScale * scaledRect.y);
-//     scaledRect.z = __float2int_rn(relScale * scaledRect.z);
-//     scaledRect.w = __float2int_rn(relScale * scaledRect.w);
-
-//     float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
-
-
-//     float approx = 1.f;
-//     if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
-//     {
-//         const float expected_new_area = farea * relScale * relScale;
-//         approx = expected_new_area / sarea;
-//     }
-
-//     dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", channel,
-//         scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
-
-//     // compensation areas rounding
-//     float rootThreshold = threshold / approx;
-//     // printf("    approx %f\n", rootThreshold);
-//     rootThreshold *= level.scaling[(int)(channel > 6)];
-
-//     dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
-
-//     return rootThreshold;
-// }
-
-// typedef unsigned char uchar;
-// float __device get(const int* __restrict__ hogluv, const int pitch,
-//                    const int x, const int y, int channel, uchar4 area)
-// {
-//     dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
-//     dprintf("get for channel %d\n", channel);
-//     dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
-//         x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
-//         x + area.x, y + area.w);
-//     dprintf("at point %d %d with offset %d\n", x, y, 0);
-
-//     const int* curr = hogluv + ((channel * 121) + y) * pitch;
-
-//     int a = curr[area.y * pitch + x + area.x];
-//     int b = curr[area.y * pitch + x + area.z];
-//     int c = curr[area.w * pitch + x + area.z];
-//     int d = curr[area.w * pitch + x + area.x];
-
-//     dprintf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
-
-//     return (a - b + c - d);
-// }
-
-
-// void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch,
-//                                     PtrStepSz<uchar4>& objects) const
-// {
-//     const icf::Level* lls = (const icf::Level*)levels.ptr();
-
-//     const int y = blockIdx.y * blockDim.y + threadIdx.y;
-//     const int x = blockIdx.x * blockDim.x + threadIdx.x;
-//     // if (x > 0 || y > 0) return;
-
-//     Level level = lls[blockIdx.z];
-//     if (x >= level.workRect.x || y >= level.workRect.y) return;
-
-//     dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
-//         level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
-
-//     const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
-//     // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages,
-//     //     octave.shrinkage, octave.size.x, octave.size.y, octave.scale);
-
-//     const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
-
-//     float detectionScore = 0.f;
-
-//     int st = stBegin;
-//     for(; st < stEnd; ++st)
-//     {
-//         const float stage = stages(0, st);
-//         dprintf("Stage: %f\n", stage);
-//         {
-//             const int nId = st * 3;
-
-//             // work with root node
-//             const Node node = ((const Node*)nodes.ptr())[nId];
-
-//             dprintf("Node: %d %f\n", node.feature, node.threshold);
-
-//             const Feature feature = ((const Feature*)features.ptr())[node.feature];
-
-//             uchar4 scaledRect = feature.rect;
-//             float threshold = rescale(level, scaledRect, feature.channel, node.threshold);
-
-//             float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
-
-//             dprintf("root feature %d %f\n",feature.channel, sum);
-
-//             int next = 1 + (int)(sum >= threshold);
-
-//             dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
-
-//             // leaves
-//             const Node leaf = ((const Node*)nodes.ptr())[nId + next];
-//             const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
-
-//             scaledRect = fLeaf.rect;
-//             threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold);
-//             sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
-
-//             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-//             float impact = leaves(0, (st * 4) + lShift);
-
-//             detectionScore += impact;
-
-//             dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
-//             dprintf("extracted stage:\n");
-//             dprintf("ct %f\n", stage);
-//             dprintf("computed score %f\n\n", detectionScore);
-//             dprintf("\n\n");
-//         }
-
-//         if (detectionScore <= stage || st - stBegin == 100) break;
-//     }
-
-//     dprintf("x %d y %d: %d\n", x, y, st - stBegin);
-
-//     if (st == stEnd)
-//     {
-//         uchar4 a;
-//         a.x = level.workRect.x;
-//         a.y = level.workRect.y;
-//         objects(0, threadIdx.x) = a;
-//     }
-// }
-
-// void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz<uchar4> objects, cudaStream_t stream) const
-// {
-//     dim3 block(32, 8, 1);
-//     dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47);
-//     device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(int), objects);
-//     cudaSafeCall( cudaGetLastError() );
-//     if (!stream)
-//         cudaSafeCall( cudaDeviceSynchronize() );
-// }
-
-// }}
\ No newline at end of file
+}}}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 51ea2c068a..ecd1886d38 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -40,11 +40,13 @@
 //
 //M
 
-#include <opencv2/gpu/device/common.hpp>
 
 #ifndef __OPENCV_ICF_HPP__
 #define __OPENCV_ICF_HPP__
 
+#include <opencv2/gpu/device/common.hpp>
+#include <stdio.h>
+
 // #if defined __CUDACC__
 // # define __device __device__ __forceinline__
 // #else
@@ -92,20 +94,27 @@ struct __align__(8) Level //is actually 24 bytes
 
 struct __align__(8) Node
 {
-    // int feature;
     uchar4 rect;
-    float threshold;
+    // ushort channel;
+    uint threshold;
 
-    Node(const uchar4 c, const int t) : rect(c), threshold(t) {}
+    enum { THRESHOLD_MASK = 0x0FFFFFFF };
+
+    Node(const uchar4 r, const uint ch, const uint t) : rect(r), threshold(t + (ch << 28))
+    {
+        // printf("%d\n", t);
+        // printf("[%d %d %d %d] %d, %d\n",rect.x, rect.y, rect.z, rect.w, (int)(threshold >> 28),
+        //     (int)(0x0FFFFFFF & threshold));
+    }
 };
 
-struct __align__(8) Feature
-{
-    int channel;
-    uchar4 rect;
+// struct __align__(8) Feature
+// {
+//     int channel;
+//     uchar4 rect;
 
-    Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
-};
+//     Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
+// };
 }
 }}}
 // struct Cascade
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 8d75176ab3..ffbf380c69 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -60,19 +60,10 @@ namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzb& features,
-        PtrStepSz<uchar4> objects);
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects);
 }
 }}}
 
-// namespace {
-//     char *itoa(long i, char* s, int /*dummy_radix*/)
-//     {
-//         sprintf(s, "%ld", i);
-//         return s;
-//     }
-// }
-
 struct cv::gpu::SoftCascade::Filds
 {
 
@@ -97,7 +88,6 @@ struct cv::gpu::SoftCascade::Filds
     GpuMat stages;
     GpuMat nodes;
     GpuMat leaves;
-    GpuMat features;
     GpuMat levels;
 
     // preallocated buffer 640x480x10 for hogluv + 640x480 got gray
@@ -137,7 +127,7 @@ struct cv::gpu::SoftCascade::Filds
     bool fill(const FileNode &root, const float mins, const float maxs);
     void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
     {
-        device::icf::detect(levels, octaves, stages, nodes, features, objects);
+        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects);
     }
 
 private:
@@ -216,10 +206,9 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
         if (fn.empty()) return false;
 
     std::vector<Octave>  voctaves;
-    std::vector<float>        vstages;
+    std::vector<float>   vstages;
     std::vector<Node>    vnodes;
-    std::vector<float>        vleaves;
-    std::vector<Feature> vfeatures;
+    std::vector<float>   vleaves;
     scales.clear();
 
     FileNodeIterator it = fn.begin(), it_end = fn.end();
@@ -245,6 +234,8 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
         FileNode ffs = fns[SC_FEATURES];
         if (ffs.empty()) return false;
 
+        FileNodeIterator ftrs = ffs.begin();
+
         fns = fns[SC_STAGES];
         if (fn.empty()) return false;
 
@@ -263,10 +254,21 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
                 FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
                 for (; inIt != inIt_end;)
                 {
-                    int feature = (int)(*(inIt +=2)++) + feature_offset;
-                    float th = (float)(*(inIt++));
+                    // int feature = (int)(*(inIt +=2)) + feature_offset;
+                    inIt +=3;
+                    // extract feature, Todo:check it
+                    uint th = saturate_cast<uint>((float)(*(inIt++)));
+                    cv::FileNode ftn = (*ftrs)[SC_F_RECT];
+                    cv::FileNodeIterator r_it = ftn.begin();
                     uchar4 rect;
-                    vnodes.push_back(Node(rect, th));
+                    rect.x = saturate_cast<uchar>((int)*(r_it++));
+                    rect.y = saturate_cast<uchar>((int)*(r_it++));
+                    rect.z = saturate_cast<uchar>((int)*(r_it++));
+                    rect.w = saturate_cast<uchar>((int)*(r_it++));
+
+                    uint channel = saturate_cast<uint>((int)(*ftrs)[SC_F_CHANNEL]);
+                    vnodes.push_back(Node(rect, channel, th));
+                    ++ftrs;
                 }
 
                 fns = (*ftr)[SC_LEAF];
@@ -276,19 +278,6 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
             }
         }
 
-        st = ffs.begin(), st_end = ffs.end();
-        for (; st != st_end; ++st )
-        {
-            cv::FileNode rn = (*st)[SC_F_RECT];
-            cv::FileNodeIterator r_it = rn.begin();
-            uchar4 rect;
-            rect.x = saturate_cast<uchar>((int)*(r_it++));
-            rect.y = saturate_cast<uchar>((int)*(r_it++));
-            rect.z = saturate_cast<uchar>((int)*(r_it++));
-            rect.w = saturate_cast<uchar>((int)*(r_it++));
-            vfeatures.push_back(Feature((int)(*st)[SC_F_CHANNEL], rect));
-        }
-
         feature_offset += octave.stages * 3;
         ++octIndex;
     }
@@ -306,9 +295,6 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     leaves.upload(cv::Mat(vleaves).reshape(1,1));
     CV_Assert(!leaves.empty());
 
-    features.upload(cv::Mat(1, vfeatures.size() * sizeof(Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
-    CV_Assert(!features.empty());
-
     // compute levels
     calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
     CV_Assert(!levels.empty());
@@ -425,7 +411,14 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     return true;
 }
 
-// #define USE_REFERENCE_VALUES
+#define USE_REFERENCE_VALUES
+namespace {
+    char *itoa(long i, char* s, int /*dummy_radix*/)
+    {
+        sprintf(s, "%ld", i);
+        return s;
+    }
+}
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
                                 GpuMat& objects, const int /*rejectfactor*/, Stream s)
 {
@@ -438,17 +431,20 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     Filds& flds = *filds;
 
 #if defined USE_REFERENCE_VALUES
-//     cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
-//     cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
-//     char buff[33];
+    cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
 
-//     for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-//     {
-//         cv::Mat channel;
-//         imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
-//         GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
-//         gchannel.upload(channel);
-//     }
+    cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
+    char buff[33];
+
+    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+    {
+        cv::Mat channel;
+        imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
+
+        // std::cout << "channel " << i << std::endl << channel << std::endl;
+        GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
+        gchannel.upload(channel);
+    }
 #else
     GpuMat& plane = flds.plane;
     GpuMat& shrunk = flds.shrunk;