opencv/3rdparty/carotene/src/dot_product.cpp

/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"

namespace CAROTENE_NS {

/*
 * Dot product of two 8-bit unsigned images: the sum over every pixel of
 * src0 * src1, returned as f64.
 *
 *   _size                   - width and height of both images, in elements
 *   src0Base, src1Base      - pointers to the first row of each image
 *   src0Stride, src1Stride  - row strides passed to internal::getRowPtr
 *                             (presumably in bytes - confirm against common.hpp)
 *
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);

    // Equal strides that match the row width mean there is no gap between rows,
    // so the whole image can be walked as one long row.
    const bool packedRows = src0Stride == src1Stride &&
                            src0Stride == (ptrdiff_t)(size.width);
    if (packedRows)
    {
        size.width *= size.height;
        size.height = 1;
    }

// A uint32 lane can take up to 66051 uchar products without overflowing.
// Every step consumes 16 elements and adds two products to each lane,
// so one block may span 66051/2*16 elements.
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 total = 0.0;
    for (size_t y = 0; y < size.height; ++y)
    {
        const u8 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const u8 * rowB = internal::getRowPtr(src1Base, src1Stride, y);

        size_t x = 0;
        uint64x2_t wideSum = vmovq_n_u64(0);

        // Full 16-element vectors, processed in overflow-safe blocks.
        while (x + 16 <= size.width)
        {
            const size_t lastStart = std::min(x + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t accLow = vmovq_n_u32(0);
            uint32x4_t accHigh = vmovq_n_u32(0);

            for (; x <= lastStart; x += 16)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);

                const uint8x16_t a = vld1q_u8(rowA + x);
                const uint8x16_t b = vld1q_u8(rowB + x);

                // Widening multiply per half, then pairwise add-accumulate into 32 bits.
                accLow = vpadalq_u16(accLow, vmull_u8(vget_low_u8(a), vget_low_u8(b)));
                accHigh = vpadalq_u16(accHigh, vmull_u8(vget_high_u8(a), vget_high_u8(b)));
            }

            // Drain the 32-bit block accumulators into the 64-bit row accumulator.
            wideSum = vpadalq_u32(vpadalq_u32(wideSum, accLow), accHigh);
        }

        // At most one remaining 8-element vector.
        if (size.width - x >= 8)
        {
            const uint8x8_t a = vld1_u8(rowA + x);
            const uint8x8_t b = vld1_u8(rowB + x);

            wideSum = vpadalq_u32(wideSum, vpaddlq_u16(vmull_u8(a, b)));
            x += 8;
        }

        // Fold the two 64-bit lanes into the running total.
        const uint64x1_t folded = vadd_u64(vget_low_u64(wideSum), vget_high_u64(wideSum));
        total += (double)vget_lane_u64(folded, 0);

        // Scalar tail (fewer than 8 elements).
        while (x < size.width)
        {
            total += s32(rowA[x]) * s32(rowB[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}

/*
 * Dot product of two 8-bit signed images: the sum over every pixel of
 * src0 * src1, returned as f64.
 *
 *   _size                   - width and height of both images, in elements
 *   src0Base, src1Base      - pointers to the first row of each image
 *   src0Stride, src1Stride  - row strides passed to internal::getRowPtr
 *                             (presumably in bytes - confirm against common.hpp)
 *
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Equal strides that match the row width mean there is no gap between rows,
    // so the whole image can be processed as a single long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// (the largest product is (-128)*(-128) = 16384).
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE (131070*8)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        int64x2_t ws = vmovq_n_s64(0);

        // Full 16-element vectors, processed in overflow-safe blocks.
        while(i + 16 <= size.width)
        {
            // Use the signed block limit defined above, so this overload does not
            // rely on a macro belonging to the unsigned overload.
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                // Widening 8x8 -> 16 bit multiplies of the low and high halves.
                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                // Pairwise add-accumulate: each 32-bit lane receives two products per step.
                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            // Drain the 32-bit block accumulators into the 64-bit row accumulator.
            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        // At most one remaining 8-element vector.
        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        // Fold the two 64-bit lanes into the running total.
        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);

        // Scalar tail (fewer than 8 elements).
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}

/*
 * Dot product of two 32-bit float images: the sum over every pixel of
 * src0 * src1, returned as f64.
 *
 *   _size                   - width and height of both images, in elements
 *   src0Base, src1Base      - pointers to the first row of each image
 *   src0Stride, src1Stride  - row strides in bytes (they are compared against
 *                             width * sizeof(f32) below)
 *
 * Partial sums are built in single precision per block and only then added
 * to the double-precision total.
 *
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);

    // Equal strides that match the byte width of a row mean there is no gap
    // between rows, so the whole image can be walked as one long row.
    const bool packedRows = src0Stride == src1Stride &&
                            src0Stride == (ptrdiff_t)(size.width * sizeof(f32));
    if (packedRows)
    {
        size.width *= size.height;
        size.height = 1;
    }

// Number of elements summed in single precision before spilling into the f64 total.
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 total = 0.0;
    for (size_t y = 0; y < size.height; ++y)
    {
        const f32 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const f32 * rowB = internal::getRowPtr(src1Base, src1Stride, y);

        size_t x = 0;

        // Full 4-element vectors, one block at a time.
        while (x + 4 <= size.width)
        {
            const size_t lastStart = std::min(x + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t blockAcc = vdupq_n_f32(0.0f);

            for ( ; x <= lastStart; x += 4)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);

                const float32x4_t a = vld1q_f32(rowA + x);
                const float32x4_t b = vld1q_f32(rowB + x);
                blockAcc = vmlaq_f32(blockAcc, a, b);
            }

            // Horizontal reduction of the four lanes, still in single precision.
            const float32x2_t halves = vpadd_f32(vget_low_f32(blockAcc), vget_high_f32(blockAcc));
            total += vget_lane_f32(halves, 0) + vget_lane_f32(halves, 1);
        }

        // At most one remaining 2-element vector.
        if (size.width - x >= 2)
        {
            const float32x2_t prod = vmul_f32(vld1_f32(rowA + x), vld1_f32(rowB + x));
            total += vget_lane_f32(prod, 0) + vget_lane_f32(prod, 1);
            x += 2;
        }

        // Scalar tail (at most one element).
        while (x < size.width)
        {
            total += rowA[x] * rowB[x];
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}

} // namespace CAROTENE_NS
Add Carotene - NVIDIA Hardware-Abstraction-Layer for ARM platforms 2016-07-05 14:56:15 +08:00			`/*`
			`* By downloading, copying, installing or using the software you agree to this license.`
			`* If you do not agree to this license, do not download, install,`
			`* copy or use the software.`
			`*`
			`*`
			`* License Agreement`
			`* For Open Source Computer Vision Library`
			`* (3-clause BSD License)`
			`*`
			`* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.`
			`* Third party copyrights are property of their respective owners.`
			`*`
			`* Redistribution and use in source and binary forms, with or without modification,`
			`* are permitted provided that the following conditions are met:`
			`*`
			`* * Redistributions of source code must retain the above copyright notice,`
			`* this list of conditions and the following disclaimer.`
			`*`
			`* * Redistributions in binary form must reproduce the above copyright notice,`
			`* this list of conditions and the following disclaimer in the documentation`
			`* and/or other materials provided with the distribution.`
			`*`
			`* * Neither the names of the copyright holders nor the names of the contributors`
			`* may be used to endorse or promote products derived from this software`
			`* without specific prior written permission.`
			`*`
			`* This software is provided by the copyright holders and contributors "as is" and`
			`* any express or implied warranties, including, but not limited to, the implied`
			`* warranties of merchantability and fitness for a particular purpose are disclaimed.`
			`* In no event shall copyright holders or contributors be liable for any direct,`
			`* indirect, incidental, special, exemplary, or consequential damages`
			`* (including, but not limited to, procurement of substitute goods or services;`
			`* loss of use, data, or profits; or business interruption) however caused`
			`* and on any theory of liability, whether in contract, strict liability,`
			`* or tort (including negligence or otherwise) arising in any way out of`
			`* the use of this software, even if advised of the possibility of such damage.`
			`*/`

			`#include "common.hpp"`

			`namespace CAROTENE_NS {`

			`f64 dotProduct(const Size2D &_size,`
			`const u8 * src0Base, ptrdiff_t src0Stride,`
			`const u8 * src1Base, ptrdiff_t src1Stride)`
			`{`
			`internal::assertSupportedConfiguration();`
			`#ifdef CAROTENE_NEON`
			`Size2D size(_size);`
			`if (src0Stride == src1Stride &&`
			`src0Stride == (ptrdiff_t)(size.width))`
			`{`
			`size.width *= size.height;`
			`size.height = 1;`
			`}`

			`// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow`
			`// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements`
			`#define DOT_UINT_BLOCKSIZE 66050*8`
			`f64 result = 0.0;`
			`for (size_t row = 0; row < size.height; ++row)`
			`{`
			`const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);`
			`const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);`

			`size_t i = 0;`
			`uint64x2_t ws = vmovq_n_u64(0);`

			`while(i + 16 <= size.width)`
			`{`
			`size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;`

			`uint32x4_t s1 = vmovq_n_u32(0);`
			`uint32x4_t s2 = vmovq_n_u32(0);`

			`for (; i <= lim; i += 16)`
			`{`
			`internal::prefetch(src0 + i);`
			`internal::prefetch(src1 + i);`

			`uint8x16_t vs1 = vld1q_u8(src0 + i);`
			`uint8x16_t vs2 = vld1q_u8(src1 + i);`

			`uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));`
			`uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));`

			`s1 = vpadalq_u16(s1, vdot1);`
			`s2 = vpadalq_u16(s2, vdot2);`
			`}`

			`ws = vpadalq_u32(ws, s1);`
			`ws = vpadalq_u32(ws, s2);`
			`}`

			`if(i + 8 <= size.width)`
			`{`
			`uint8x8_t vs1 = vld1_u8(src0 + i);`
			`uint8x8_t vs2 = vld1_u8(src1 + i);`

			`ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));`
			`i += 8;`
			`}`

			`result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);`

			`for (; i < size.width; ++i)`
			`result += s32(src0[i]) * s32(src1[i]);`
			`}`
			`return result;`
			`#else`
			`(void)_size;`
			`(void)src0Base;`
			`(void)src0Stride;`
			`(void)src1Base;`
			`(void)src1Stride;`

			`return 0;`
			`#endif`
			`}`

			`f64 dotProduct(const Size2D &_size,`
			`const s8 * src0Base, ptrdiff_t src0Stride,`
			`const s8 * src1Base, ptrdiff_t src1Stride)`
			`{`
			`internal::assertSupportedConfiguration();`
			`#ifdef CAROTENE_NEON`
			`Size2D size(_size);`
			`if (src0Stride == src1Stride &&`
			`src0Stride == (ptrdiff_t)(size.width))`
			`{`
			`size.width *= size.height;`
			`size.height = 1;`
			`}`

			`// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow`
			`// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements`
			`#define DOT_INT_BLOCKSIZE 131070*8`
			`f64 result = 0.0;`
			`for (size_t row = 0; row < size.height; ++row)`
			`{`
			`const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);`
			`const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);`

			`size_t i = 0;`
			`int64x2_t ws = vmovq_n_s64(0);`

			`while(i + 16 <= size.width)`
			`{`
			`size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;`

			`int32x4_t s1 = vmovq_n_s32(0);`
			`int32x4_t s2 = vmovq_n_s32(0);`

			`for (; i <= lim; i += 16)`
			`{`
			`internal::prefetch(src0 + i);`
			`internal::prefetch(src1 + i);`

			`int8x16_t vs1 = vld1q_s8(src0 + i);`
			`int8x16_t vs2 = vld1q_s8(src1 + i);`

			`int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));`
			`int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));`

			`s1 = vpadalq_s16(s1, vdot1);`
			`s2 = vpadalq_s16(s2, vdot2);`
			`}`

			`ws = vpadalq_s32(ws, s1);`
			`ws = vpadalq_s32(ws, s2);`
			`}`

			`if(i + 8 <= size.width)`
			`{`
			`int8x8_t vs1 = vld1_s8(src0 + i);`
			`int8x8_t vs2 = vld1_s8(src1 + i);`

			`ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));`
			`i += 8;`
			`}`

			`result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);`

			`for (; i < size.width; ++i)`
			`result += s32(src0[i]) * s32(src1[i]);`
			`}`
			`return result;`
			`#else`
			`(void)_size;`
			`(void)src0Base;`
			`(void)src0Stride;`
			`(void)src1Base;`
			`(void)src1Stride;`

			`return 0;`
			`#endif`
			`}`

			`f64 dotProduct(const Size2D &_size,`
			`const f32 * src0Base, ptrdiff_t src0Stride,`
			`const f32 * src1Base, ptrdiff_t src1Stride)`
			`{`
			`internal::assertSupportedConfiguration();`
			`#ifdef CAROTENE_NEON`
			`Size2D size(_size);`
			`if (src0Stride == src1Stride &&`
			`src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))`
			`{`
			`size.width *= size.height;`
			`size.height = 1;`
			`}`

			`#define DOT_FLOAT_BLOCKSIZE (1 << 13)`
			`f64 result = 0.0;`
			`for (size_t row = 0; row < size.height; ++row)`
			`{`
			`const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);`
			`const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);`

			`size_t i = 0;`
			`while(i + 4 <= size.width)`
			`{`
			`size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;`
			`float32x4_t v_sum = vdupq_n_f32(0.0f);`

			`for( ; i <= lim; i += 4 )`
			`{`
			`internal::prefetch(src0 + i);`
			`internal::prefetch(src1 + i);`
			`v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));`
			`}`

			`float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum));`
			`result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);`
			`}`

			`if(i + 2 <= size.width)`
			`{`
			`float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));`
			`result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);`
			`i += 2;`
			`}`

			`for (; i < size.width; ++i)`
			`result += src0[i] * src1[i];`
			`}`
			`return result;`
			`#else`
			`(void)_size;`
			`(void)src0Base;`
			`(void)src0Stride;`
			`(void)src1Base;`
			`(void)src1Stride;`

			`return 0;`
			`#endif`
			`}`

			`} // namespace CAROTENE_NS`