/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "crc32_braid_p.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "cpu_features.h"

#if defined(_MSC_VER)
#  include <intrin.h>
#endif

/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
#  if defined(_M_ARM) || defined(_M_ARM64)
#    define FUNCTABLE_BARRIER() do { \
    _ReadWriteBarrier(); \
    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
    _ReadWriteBarrier(); \
} while (0)
#  else
#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
#  endif
#else
#  warning Unable to detect atomic intrinsic support.
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif

static void force_init_empty(void) {
    // empty
}

static void init_functable(void) {
    struct functable_s ft;
    struct cpu_features cf;

    cpu_check_features(&cf);

    // Generic code
    ft.force_init = &force_init_empty;
    ft.adler32 = &adler32_c;
    ft.adler32_fold_copy = &adler32_fold_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.chunksize = &chunksize_c;
    ft.crc32 = &PREFIX(crc32_braid);
    ft.crc32_fold = &crc32_fold_c;
    ft.crc32_fold_copy = &crc32_fold_copy_c;
    ft.crc32_fold_final = &crc32_fold_final_c;
    ft.crc32_fold_reset = &crc32_fold_reset_c;
    ft.inflate_fast = &inflate_fast_c;
    ft.insert_string = &insert_string_c;
    ft.quick_insert_string = &quick_insert_string_c;
    ft.slide_hash = &slide_hash_c;
    ft.update_hash = &update_hash_c;

#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    ft.longest_match = &longest_match_unaligned_64;
    ft.longest_match_slow = &longest_match_slow_unaligned_64;
    ft.compare256 = &compare256_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    ft.longest_match = &longest_match_unaligned_32;
    ft.longest_match_slow = &longest_match_slow_unaligned_32;
    ft.compare256 = &compare256_unaligned_32;
#  else
    ft.longest_match = &longest_match_unaligned_16;
    ft.longest_match_slow = &longest_match_slow_unaligned_16;
    ft.compare256 = &compare256_unaligned_16;
#  endif
#else
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.compare256 = &compare256_c;
#endif

    // Select arch-optimized functions

    // X86 - SSE2
#ifdef X86_SSE2
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (cf.x86.has_sse2)
#  endif
    {
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
        ft.chunksize = &chunksize_sse2;
        ft.inflate_fast = &inflate_fast_sse2;
        ft.slide_hash = &slide_hash_sse2;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_sse2;
        ft.longest_match = &longest_match_sse2;
        ft.longest_match_slow = &longest_match_slow_sse2;
#  endif
    }
#endif
    // X86 - SSSE3
#ifdef X86_SSSE3
    if (cf.x86.has_ssse3) {
        ft.adler32 = &adler32_ssse3;
#  ifdef X86_SSE2
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
#  endif
    }
#endif
    // X86 - SSE4.2
#ifdef X86_SSE42
    if (cf.x86.has_sse42) {
        ft.adler32_fold_copy = &adler32_fold_copy_sse42;
        ft.insert_string = &insert_string_sse42;
        ft.quick_insert_string = &quick_insert_string_sse42;
        ft.update_hash = &update_hash_sse42;
    }
#endif
    // X86 - PCLMUL
#ifdef X86_PCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq) {
        ft.crc32 = &crc32_pclmulqdq;
        ft.crc32_fold = &crc32_fold_pclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
    }
#endif
    // X86 - AVX
#ifdef X86_AVX2
    if (cf.x86.has_avx2) {
        ft.adler32 = &adler32_avx2;
        ft.adler32_fold_copy = &adler32_fold_copy_avx2;
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
        ft.chunksize = &chunksize_avx2;
        ft.inflate_fast = &inflate_fast_avx2;
        ft.slide_hash = &slide_hash_avx2;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_avx2;
        ft.longest_match = &longest_match_avx2;
        ft.longest_match_slow = &longest_match_slow_avx2;
#  endif
    }
#endif
#ifdef X86_AVX512
    if (cf.x86.has_avx512) {
        ft.adler32 = &adler32_avx512;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512;
    }
#endif
#ifdef X86_AVX512VNNI
    if (cf.x86.has_avx512vnni) {
        ft.adler32 = &adler32_avx512_vnni;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
    }
#endif
    // X86 - VPCLMULQDQ
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) {
        ft.crc32 = &crc32_vpclmulqdq;
        ft.crc32_fold = &crc32_fold_vpclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
    }
#endif

    // ARM - SIMD
#ifdef ARM_SIMD
#  ifndef ARM_NOCHECK_SIMD
    if (cf.arm.has_simd)
#  endif
    {
        ft.slide_hash = &slide_hash_armv6;
    }
#endif
    // ARM - NEON
#ifdef ARM_NEON
#  ifndef ARM_NOCHECK_NEON
    if (cf.arm.has_neon)
#  endif
    {
        ft.adler32 = &adler32_neon;
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
        ft.chunksize = &chunksize_neon;
        ft.inflate_fast = &inflate_fast_neon;
        ft.slide_hash = &slide_hash_neon;
#  ifdef HAVE_BUILTIN_CTZLL
        ft.compare256 = &compare256_neon;
        ft.longest_match = &longest_match_neon;
        ft.longest_match_slow = &longest_match_slow_neon;
#  endif
    }
#endif
    // ARM - ACLE
#ifdef ARM_ACLE
    if (cf.arm.has_crc32) {
        ft.crc32 = &crc32_acle;
        ft.insert_string = &insert_string_acle;
        ft.quick_insert_string = &quick_insert_string_acle;
        ft.update_hash = &update_hash_acle;
    }
#endif

    // Power - VMX
#ifdef PPC_VMX
    if (cf.power.has_altivec) {
        ft.adler32 = &adler32_vmx;
        ft.slide_hash = &slide_hash_vmx;
    }
#endif
    // Power8 - VSX
#ifdef POWER8_VSX
    if (cf.power.has_arch_2_07) {
        ft.adler32 = &adler32_power8;
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
        ft.chunksize = &chunksize_power8;
        ft.inflate_fast = &inflate_fast_power8;
        ft.slide_hash = &slide_hash_power8;
    }
#endif
#ifdef POWER8_VSX_CRC32
    if (cf.power.has_arch_2_07)
        ft.crc32 = &crc32_power8;
#endif
    // Power9
#ifdef POWER9
    if (cf.power.has_arch_3_00) {
        ft.compare256 = &compare256_power9;
        ft.longest_match = &longest_match_power9;
        ft.longest_match_slow = &longest_match_slow_power9;
    }
#endif

    // RISCV - RVV
#ifdef RISCV_RVV
    if (cf.riscv.has_rvv) {
        ft.adler32 = &adler32_rvv;
        ft.adler32_fold_copy = &adler32_fold_copy_rvv;
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
        ft.chunksize = &chunksize_rvv;
        ft.compare256 = &compare256_rvv;
        ft.inflate_fast = &inflate_fast_rvv;
        ft.longest_match = &longest_match_rvv;
        ft.longest_match_slow = &longest_match_slow_rvv;
        ft.slide_hash = &slide_hash_rvv;
    }
#endif

    // S390
#ifdef S390_CRC32_VX
    if (cf.s390.has_vx)
        ft.crc32 = crc32_s390_vx;
#endif

    // Assign function pointers individually for atomic operation
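    // Stores go through FUNCTABLE_ASSIGN (atomic pointer stores where the
    // platform provides them), so a concurrent caller either still sees a
    // stub or a complete optimized pointer, never a torn value.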
    FUNCTABLE_ASSIGN(ft, force_init);
    FUNCTABLE_ASSIGN(ft, adler32);
    FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
    FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
    FUNCTABLE_ASSIGN(ft, chunksize);
    FUNCTABLE_ASSIGN(ft, compare256);
    FUNCTABLE_ASSIGN(ft, crc32);
    FUNCTABLE_ASSIGN(ft, crc32_fold);
    FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
    FUNCTABLE_ASSIGN(ft, crc32_fold_final);
    FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
    FUNCTABLE_ASSIGN(ft, inflate_fast);
    FUNCTABLE_ASSIGN(ft, insert_string);
    FUNCTABLE_ASSIGN(ft, longest_match);
    FUNCTABLE_ASSIGN(ft, longest_match_slow);
    FUNCTABLE_ASSIGN(ft, quick_insert_string);
    FUNCTABLE_ASSIGN(ft, slide_hash);
    FUNCTABLE_ASSIGN(ft, update_hash);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();
}

/* stub functions */
static void force_init_stub(void) {
    init_functable();
}

static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.adler32(adler, buf, len);
}

static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    return functable.adler32_fold_copy(adler, dst, src, len);
}

static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) {
    init_functable();
    return functable.chunkmemset_safe(out, dist, len, left);
}

static uint32_t chunksize_stub(void) {
    init_functable();
    return functable.chunksize();
}

static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
    init_functable();
    return functable.compare256(src0, src1);
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.crc32(crc, buf, len);
}

static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
    init_functable();
    functable.crc32_fold(crc, src, len, init_crc);
}

static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    functable.crc32_fold_copy(crc, dst, src, len);
}

static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_final(crc);
}

static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_reset(crc);
}

static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    init_functable();
    functable.inflate_fast(strm, start);
}

static void insert_string_stub(deflate_state* const s, uint32_t str, uint32_t count) {
    init_functable();
    functable.insert_string(s, str, count);
}

static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match(s, cur_match);
}

static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match_slow(s, cur_match);
}

static Pos quick_insert_string_stub(deflate_state* const s, const uint32_t str) {
    init_functable();
    return functable.quick_insert_string(s, str);
}

static void slide_hash_stub(deflate_state* s) {
    init_functable();
    functable.slide_hash(s);
}

static uint32_t update_hash_stub(deflate_state* const s, uint32_t h, uint32_t val) {
    init_functable();
    return functable.update_hash(s, h, val);
}

/* functable init */
Z_INTERNAL struct functable_s functable = {
    force_init_stub,
    adler32_stub,
    adler32_fold_copy_stub,
    chunkmemset_safe_stub,
    chunksize_stub,
    compare256_stub,
    crc32_stub,
    crc32_fold_stub,
    crc32_fold_copy_stub,
    crc32_fold_final_stub,
    crc32_fold_reset_stub,
    inflate_fast_stub,
    insert_string_stub,
    longest_match_stub,
    longest_match_slow_stub,
    quick_insert_string_stub,
    slide_hash_stub,
    update_hash_stub
};