From d81cdb8e1cae1eefd94a58485f81419b9c3d93ce Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Tue, 23 May 2017 20:00:21 +0900
Subject: [PATCH] add OpenCL version of convertFp16 and test  * disable vector
 operation for now  * brush up the implementation based on comment

---
 modules/core/src/convert.cpp           | 37 +++++++++++--
 modules/core/src/opencl/halfconvert.cl | 73 ++++++++++++++++++++++++++
 modules/core/test/ocl/test_arithm.cpp  | 55 +++++++++++++++++++
 3 files changed, 162 insertions(+), 3 deletions(-)
 create mode 100644 modules/core/src/opencl/halfconvert.cl

diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index f431033c09..8baef8a8d9 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -5371,6 +5371,34 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
     return k.run(2, globalsize, NULL, false);
 }
 
+static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int ddepth )
+{
+    int type = _src.type(), cn = CV_MAT_CN(type);
+
+    _dst.createSameSize( _src, CV_MAKETYPE(ddepth, cn) );
+    int kercn = 1;
+    int rowsPerWI = 1;
+    String build_opt = format("-D HALF_SUPPORT -D dstT=%s -D srcT=%s -D rowsPerWI=%d%s",
+                           ddepth == CV_16S ? "half" : "float",
+                           ddepth == CV_16S ? "float" : "half",
+                           rowsPerWI,
+                           ddepth == CV_16S ? " -D FLOAT_TO_HALF " : "");
+    ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt);
+    if (k.empty())
+        return false;
+
+    UMat src = _src.getUMat();
+    UMat dst = _dst.getUMat();
+
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+            dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
+
+    k.args(srcarg, dstarg);
+
+    size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
+    return k.run(2, globalsize, NULL, false);
+}
+
 #endif
 
 }
@@ -5411,10 +5439,8 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
 {
     CV_INSTRUMENT_REGION()
 
-    Mat src = _src.getMat();
     int ddepth = 0;
-
-    switch( src.depth() )
+    switch( _src.depth() )
     {
     case CV_32F:
         ddepth = CV_16S;
@@ -5427,6 +5453,11 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
         return;
     }
 
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_convertFp16(_src, _dst, ddepth))
+
+    Mat src = _src.getMat();
+
     int type = CV_MAKETYPE(ddepth, src.channels());
     _dst.create( src.dims, src.size, type );
     Mat dst = _dst.getMat();
diff --git a/modules/core/src/opencl/halfconvert.cl b/modules/core/src/opencl/halfconvert.cl
new file mode 100644
index 0000000000..506df69faf
--- /dev/null
+++ b/modules/core/src/opencl/halfconvert.cl
@@ -0,0 +1,73 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef HALF_SUPPORT
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16:enable
+#endif
+#endif
+
+__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset,
+                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)
+{
+    int x = get_global_id(0);
+    int y0 = get_global_id(1) * rowsPerWI;
+
+    if (x < dst_cols)
+    {
+        int src_index = mad24(y0, src_step, mad24(x, (int)sizeof(srcT), src_offset));
+        int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));
+
+        for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index += dst_step)
+        {
+            __global const srcT * src = (__global const srcT *)(srcptr + src_index);
+            __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+
+#ifdef FLOAT_TO_HALF
+            vstore_half(src[0], 0, dst);
+#else
+            dst[0] = vload_half(0, src);
+#endif
+        }
+    }
+}
diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp
index b6b249865f..1164473e61 100644
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -1614,6 +1614,60 @@ OCL_TEST_P(ConvertScaleAbs, Mat)
     }
 }
 
+//////////////////////////////// ConvertFp16 ////////////////////////////////////////////////
+
+PARAM_TEST_CASE(ConvertFp16, Channels, bool)
+{
+    int cn;
+    bool fromHalf;
+    cv::Scalar val;
+
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
+
+    virtual void SetUp()
+    {
+        cn = GET_PARAM(0);
+        fromHalf = GET_PARAM(1);
+    }
+
+    void generateTestData()
+    {
+        const int stype = CV_MAKE_TYPE(fromHalf ? CV_32F : CV_16S, cn);
+        const int dtype = CV_MAKE_TYPE(fromHalf ? CV_16S : CV_32F, cn);
+
+        Size roiSize = randomSize(1, MAX_VALUE);
+        Border srcBorder = randomBorder(0, 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, stype, -11, 11); // FIXIT: Test with minV, maxV
+
+        Border dstBorder = randomBorder(0, 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
+    }
+
+    void Near(double threshold = 0.)
+    {
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
+    }
+
+};
+
+
+OCL_TEST_P(ConvertFp16, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::convertFp16(src_roi, dst_roi));
+        OCL_ON(cv::convertFp16(usrc_roi, udst_roi));
+
+        Near(1);
+    }
+}
+
 //////////////////////////////// ScaleAdd ////////////////////////////////////////////////
 
 typedef ArithmTestBase ScaleAdd;
@@ -1844,6 +1898,7 @@ OCL_INSTANTIATE_TEST_CASE_P(Arithm, Sqrt, Combine(::testing::Values(CV_32F, CV_6
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, Normalize, Combine(OCL_ALL_DEPTHS, Values(Channels(1)), Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, InRange, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool(), Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, ConvertScaleAbs, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, ConvertFp16, Combine(OCL_ALL_CHANNELS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, ScaleAdd, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, PatchNaNs, Combine(OCL_ALL_CHANNELS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, Psnr, Combine(::testing::Values((MatDepth)CV_8U), OCL_ALL_CHANNELS, Bool()));