/* * By downloading, copying, installing or using the software you agree to this license. * If you do not agree to this license, do not download, install, * copy or use the software. * * * License Agreement * For Open Source Computer Vision Library * (3-clause BSD License) * * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. * Third party copyrights are property of their respective owners. * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * * Neither the names of the copyright holders nor the names of the contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided by the copyright holders and contributors "as is" and * any express or implied warranties, including, but not limited to, the implied * warranties of merchantability and fitness for a particular purpose are disclaimed. * In no event shall copyright holders or contributors be liable for any direct, * indirect, incidental, special, exemplary, or consequential damages * (including, but not limited to, procurement of substitute goods or services; * loss of use, data, or profits; or business interruption) however caused * and on any theory of liability, whether in contract, strict liability, * or tort (including negligence or otherwise) arising in any way out of * the use of this software, even if advised of the possibility of such damage. */ #ifndef CAROTENE_INTRINSICS_HPP #define CAROTENE_INTRINSICS_HPP #include #include namespace CAROTENE_NS { namespace internal { /////////////// Custom NEON intrinsics /////////////////// // calculate reciprocal value inline float32x4_t vrecpq_f32(float32x4_t val) { float32x4_t reciprocal = vrecpeq_f32(val); reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); return reciprocal; } inline float32x2_t vrecp_f32(float32x2_t val) { float32x2_t reciprocal = vrecpe_f32(val); reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); return reciprocal; } // caclulate sqrt value inline float32x4_t vrsqrtq_f32(float32x4_t val) { float32x4_t e = vrsqrteq_f32(val); e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); return e; } inline float32x2_t vrsqrt_f32(float32x2_t val) { float32x2_t e = vrsqrte_f32(val); e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); return e; } inline float32x4_t vsqrtq_f32(float32x4_t val) { return vrecpq_f32(vrsqrtq_f32(val)); } inline float32x2_t vsqrt_f32(float32x2_t val) { return vrecp_f32(vrsqrt_f32(val)); } // table lookup with the table in a 128-bit register inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b) { #ifdef __aarch64__ // AArch64 supports this natively return ::vqtbl1_u8(a, b); #else union { uint8x16_t v; uint8x8x2_t w; } u = { a }; return vtbl2_u8(u.w, b); #endif } } } #endif