diff --git a/3rdparty/libjpeg-turbo/CMakeLists.txt b/3rdparty/libjpeg-turbo/CMakeLists.txt index 73e1eee141..4dd3095f94 100644 --- a/3rdparty/libjpeg-turbo/CMakeLists.txt +++ b/3rdparty/libjpeg-turbo/CMakeLists.txt @@ -4,9 +4,9 @@ ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter -Wsign-compare -Wshorten-6 set(VERSION_MAJOR 2) set(VERSION_MINOR 1) -set(VERSION_REVISION 2) +set(VERSION_REVISION 3) set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_REVISION}) -set(LIBJPEG_TURBO_VERSION_NUMBER 2001002) +set(LIBJPEG_TURBO_VERSION_NUMBER 2001003) string(TIMESTAMP BUILD "opencv-${OPENCV_VERSION}-libjpeg-turbo") if(CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -79,14 +79,13 @@ configure_file(jconfigint.h.in jconfigint.h) include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src) -set(JPEG_SOURCES - jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c jcicc.c - jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c - jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c - jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c jdmainct.c jdmarker.c - jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c - jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c - jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c) +set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c + jcicc.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c + jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c + jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c + jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c + jdtrans.c jerror.c jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c + jidctint.c jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c) if(WITH_ARITH_ENC OR WITH_ARITH_DEC) set(JPEG_SOURCES ${JPEG_SOURCES} jaricom.c) diff --git a/3rdparty/libjpeg-turbo/LICENSE.md 
b/3rdparty/libjpeg-turbo/LICENSE.md index a1cdad52fa..d753e1d76a 100644 --- a/3rdparty/libjpeg-turbo/LICENSE.md +++ b/3rdparty/libjpeg-turbo/LICENSE.md @@ -91,7 +91,7 @@ best of our understanding. The Modified (3-clause) BSD License =================================== -Copyright (C)2009-2021 D. R. Commander. All Rights Reserved.
+Copyright (C)2009-2022 D. R. Commander. All Rights Reserved.
Copyright (C)2015 Viktor Szathmáry. All Rights Reserved. Redistribution and use in source and binary forms, with or without diff --git a/3rdparty/libjpeg-turbo/src/jcapimin.c b/3rdparty/libjpeg-turbo/src/jcapimin.c index 178c55ba47..84e7ecc9a7 100644 --- a/3rdparty/libjpeg-turbo/src/jcapimin.c +++ b/3rdparty/libjpeg-turbo/src/jcapimin.c @@ -4,8 +4,8 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1994-1998, Thomas G. Lane. * Modified 2003-2010 by Guido Vollbeding. - * It was modified by The libjpeg-turbo Project to include only code relevant - * to libjpeg-turbo. + * libjpeg-turbo Modifications: + * Copyright (C) 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -52,7 +52,7 @@ jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize) { struct jpeg_error_mgr *err = cinfo->err; void *client_data = cinfo->client_data; /* ignore Purify complaint here */ - MEMZERO(cinfo, sizeof(struct jpeg_compress_struct)); + memset(cinfo, 0, sizeof(struct jpeg_compress_struct)); cinfo->err = err; cinfo->client_data = client_data; } diff --git a/3rdparty/libjpeg-turbo/src/jcarith.c b/3rdparty/libjpeg-turbo/src/jcarith.c index b6d093f70e..b1720521bf 100644 --- a/3rdparty/libjpeg-turbo/src/jcarith.c +++ b/3rdparty/libjpeg-turbo/src/jcarith.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Developed 1997-2009 by Guido Vollbeding. * libjpeg-turbo Modifications: - * Copyright (C) 2015, 2018, D. R. Commander. + * Copyright (C) 2015, 2018, 2021-2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. 
* @@ -338,14 +338,14 @@ emit_restart(j_compress_ptr cinfo, int restart_num) compptr = cinfo->cur_comp_info[ci]; /* DC needs no table for refinement scan */ if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) { - MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS); + memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS); /* Reset DC predictions to 0 */ entropy->last_dc_val[ci] = 0; entropy->dc_context[ci] = 0; } /* AC needs no table when not present */ if (cinfo->progressive_mode == 0 || cinfo->Se) { - MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS); + memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS); } } @@ -836,7 +836,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics) * We are fully adaptive here and need no extra * statistics gathering pass! */ - ERREXIT(cinfo, JERR_NOT_COMPILED); + ERREXIT(cinfo, JERR_NOTIMPL); /* We assume jcmaster.c already validated the progressive scan parameters. */ @@ -867,7 +867,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics) if (entropy->dc_stats[tbl] == NULL) entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS); - MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS); + memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS); /* Initialize DC predictions to 0 */ entropy->last_dc_val[ci] = 0; entropy->dc_context[ci] = 0; @@ -880,7 +880,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics) if (entropy->ac_stats[tbl] == NULL) entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS); - MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS); + memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS); #ifdef CALCULATE_SPECTRAL_CONDITIONING if (cinfo->progressive_mode) /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */ diff --git a/3rdparty/libjpeg-turbo/src/jchuff.c b/3rdparty/libjpeg-turbo/src/jchuff.c index 
8ff817b151..f4dfa1cb54 100644 --- a/3rdparty/libjpeg-turbo/src/jchuff.c +++ b/3rdparty/libjpeg-turbo/src/jchuff.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1991-1997, Thomas G. Lane. * libjpeg-turbo Modifications: - * Copyright (C) 2009-2011, 2014-2016, 2018-2021, D. R. Commander. + * Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander. * Copyright (C) 2015, Matthieu Darbois. * Copyright (C) 2018, Matthias Räncker. * Copyright (C) 2020, Arm Limited. @@ -200,12 +200,12 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics) entropy->dc_count_ptrs[dctbl] = (long *) (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 257 * sizeof(long)); - MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long)); + memset(entropy->dc_count_ptrs[dctbl], 0, 257 * sizeof(long)); if (entropy->ac_count_ptrs[actbl] == NULL) entropy->ac_count_ptrs[actbl] = (long *) (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 257 * sizeof(long)); - MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long)); + memset(entropy->ac_count_ptrs[actbl], 0, 257 * sizeof(long)); #endif } else { /* Compute derived values for Huffman tables */ @@ -315,8 +315,8 @@ jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno, * this lets us detect duplicate VAL entries here, and later * allows emit_bits to detect any attempt to emit such symbols. */ - MEMZERO(dtbl->ehufco, sizeof(dtbl->ehufco)); - MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi)); + memset(dtbl->ehufco, 0, sizeof(dtbl->ehufco)); + memset(dtbl->ehufsi, 0, sizeof(dtbl->ehufsi)); /* This is also a convenient place to check for out-of-range * and duplicated VAL entries. 
We allow 0..255 for AC symbols @@ -478,7 +478,7 @@ dump_buffer(working_state *state) buffer = _buffer; \ while (bytes > 0) { \ bytestocopy = MIN(bytes, state->free_in_buffer); \ - MEMCOPY(state->next_output_byte, buffer, bytestocopy); \ + memcpy(state->next_output_byte, buffer, bytestocopy); \ state->next_output_byte += bytestocopy; \ buffer += bytestocopy; \ state->free_in_buffer -= bytestocopy; \ @@ -941,8 +941,8 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]) /* This algorithm is explained in section K.2 of the JPEG standard */ - MEMZERO(bits, sizeof(bits)); - MEMZERO(codesize, sizeof(codesize)); + memset(bits, 0, sizeof(bits)); + memset(codesize, 0, sizeof(codesize)); for (i = 0; i < 257; i++) others[i] = -1; /* init links to empty */ @@ -1044,7 +1044,7 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]) bits[i]--; /* Return final symbol counts (only for lengths 0..16) */ - MEMCOPY(htbl->bits, bits, sizeof(htbl->bits)); + memcpy(htbl->bits, bits, sizeof(htbl->bits)); /* Return a list of the symbols sorted by code length */ /* It's not real clear to me why we don't need to consider the codelength @@ -1083,8 +1083,8 @@ finish_pass_gather(j_compress_ptr cinfo) /* It's important not to apply jpeg_gen_optimal_table more than once * per table, because it clobbers the input frequency counts! */ - MEMZERO(did_dc, sizeof(did_dc)); - MEMZERO(did_ac, sizeof(did_ac)); + memset(did_dc, 0, sizeof(did_dc)); + memset(did_ac, 0, sizeof(did_ac)); for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; diff --git a/3rdparty/libjpeg-turbo/src/jcphuff.c b/3rdparty/libjpeg-turbo/src/jcphuff.c index 1101987180..872e570bff 100644 --- a/3rdparty/libjpeg-turbo/src/jcphuff.c +++ b/3rdparty/libjpeg-turbo/src/jcphuff.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1995-1997, Thomas G. Lane. 
* libjpeg-turbo Modifications: - * Copyright (C) 2011, 2015, 2018, 2021, D. R. Commander. + * Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander. * Copyright (C) 2016, 2018, Matthieu Darbois. * Copyright (C) 2020, Arm Limited. * Copyright (C) 2021, Alex Richardson. @@ -275,7 +275,7 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics) entropy->count_ptrs[tbl] = (long *) (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 257 * sizeof(long)); - MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long)); + memset(entropy->count_ptrs[tbl], 0, 257 * sizeof(long)); } else { /* Compute derived values for Huffman table */ /* We may do this more than once for a table, but it's not expensive */ @@ -584,8 +584,8 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data) continue; \ /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \ temp2 ^= temp; \ - values[k] = temp; \ - values[k + DCTSIZE2] = temp2; \ + values[k] = (JCOEF)temp; \ + values[k + DCTSIZE2] = (JCOEF)temp2; \ zerobits |= ((size_t)1U) << k; \ } \ } @@ -1062,7 +1062,7 @@ finish_pass_gather_phuff(j_compress_ptr cinfo) /* It's important not to apply jpeg_gen_optimal_table more than once * per table, because it clobbers the input frequency counts! */ - MEMZERO(did, sizeof(did)); + memset(did, 0, sizeof(did)); for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; diff --git a/3rdparty/libjpeg-turbo/src/jcprepct.c b/3rdparty/libjpeg-turbo/src/jcprepct.c index d59713ae68..f27cc34507 100644 --- a/3rdparty/libjpeg-turbo/src/jcprepct.c +++ b/3rdparty/libjpeg-turbo/src/jcprepct.c @@ -3,8 +3,8 @@ * * This file is part of the Independent JPEG Group's software: * Copyright (C) 1994-1996, Thomas G. Lane. - * It was modified by The libjpeg-turbo Project to include only code relevant - * to libjpeg-turbo. + * libjpeg-turbo Modifications: + * Copyright (C) 2022, D. R. Commander. 
* For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -289,8 +289,8 @@ create_context_buffer(j_compress_ptr cinfo) cinfo->max_h_samp_factor) / compptr->h_samp_factor), (JDIMENSION)(3 * rgroup_height)); /* Copy true buffer row pointers into the middle of the fake row array */ - MEMCOPY(fake_buffer + rgroup_height, true_buffer, - 3 * rgroup_height * sizeof(JSAMPROW)); + memcpy(fake_buffer + rgroup_height, true_buffer, + 3 * rgroup_height * sizeof(JSAMPROW)); /* Fill in the above and below wraparound pointers */ for (i = 0; i < rgroup_height; i++) { fake_buffer[i] = true_buffer[2 * rgroup_height + i]; diff --git a/3rdparty/libjpeg-turbo/src/jctrans.c b/3rdparty/libjpeg-turbo/src/jctrans.c index ab6a2186db..e121028ec7 100644 --- a/3rdparty/libjpeg-turbo/src/jctrans.c +++ b/3rdparty/libjpeg-turbo/src/jctrans.c @@ -5,7 +5,7 @@ * Copyright (C) 1995-1998, Thomas G. Lane. * Modified 2000-2009 by Guido Vollbeding. * libjpeg-turbo Modifications: - * Copyright (C) 2020, D. R. Commander. + * Copyright (C) 2020, 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -100,8 +100,8 @@ jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo) qtblptr = &dstinfo->quant_tbl_ptrs[tblno]; if (*qtblptr == NULL) *qtblptr = jpeg_alloc_quant_table((j_common_ptr)dstinfo); - MEMCOPY((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval, - sizeof((*qtblptr)->quantval)); + memcpy((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval, + sizeof((*qtblptr)->quantval)); (*qtblptr)->sent_table = FALSE; } } diff --git a/3rdparty/libjpeg-turbo/src/jdapimin.c b/3rdparty/libjpeg-turbo/src/jdapimin.c index 4609b1322f..f50c27edc3 100644 --- a/3rdparty/libjpeg-turbo/src/jdapimin.c +++ b/3rdparty/libjpeg-turbo/src/jdapimin.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1994-1998, Thomas G. Lane. 
* libjpeg-turbo Modifications: - * Copyright (C) 2016, D. R. Commander. + * Copyright (C) 2016, 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -53,7 +53,7 @@ jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize) { struct jpeg_error_mgr *err = cinfo->err; void *client_data = cinfo->client_data; /* ignore Purify complaint here */ - MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct)); + memset(cinfo, 0, sizeof(struct jpeg_decompress_struct)); cinfo->err = err; cinfo->client_data = client_data; } @@ -92,7 +92,7 @@ jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize) cinfo->master = (struct jpeg_decomp_master *) (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT, sizeof(my_decomp_master)); - MEMZERO(cinfo->master, sizeof(my_decomp_master)); + memset(cinfo->master, 0, sizeof(my_decomp_master)); } diff --git a/3rdparty/libjpeg-turbo/src/jdapistd.c b/3rdparty/libjpeg-turbo/src/jdapistd.c index 695a620099..8827d8abf5 100644 --- a/3rdparty/libjpeg-turbo/src/jdapistd.c +++ b/3rdparty/libjpeg-turbo/src/jdapistd.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1994-1996, Thomas G. Lane. * libjpeg-turbo Modifications: - * Copyright (C) 2010, 2015-2020, D. R. Commander. + * Copyright (C) 2010, 2015-2020, 2022, D. R. Commander. * Copyright (C) 2015, Google, Inc. * For conditions of distribution and use, see the accompanying README.ijg * file. 
@@ -159,6 +159,7 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset, JDIMENSION input_xoffset; boolean reinit_upsampler = FALSE; jpeg_component_info *compptr; + my_master_ptr master = (my_master_ptr)cinfo->master; if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0) ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); @@ -208,6 +209,11 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset, */ *width = *width + input_xoffset - *xoffset; cinfo->output_width = *width; + if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) { + my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample; + upsample->out_row_width = + cinfo->output_width * cinfo->out_color_components; + } /* Set the first and last iMCU columns that we must decompress. These values * will be used in single-scan decompressions. diff --git a/3rdparty/libjpeg-turbo/src/jdarith.c b/3rdparty/libjpeg-turbo/src/jdarith.c index 7f0d3a785c..21575e80c7 100644 --- a/3rdparty/libjpeg-turbo/src/jdarith.c +++ b/3rdparty/libjpeg-turbo/src/jdarith.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Developed 1997-2015 by Guido Vollbeding. * libjpeg-turbo Modifications: - * Copyright (C) 2015-2020, D. R. Commander. + * Copyright (C) 2015-2020, 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. 
* @@ -210,13 +210,13 @@ process_restart(j_decompress_ptr cinfo) for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) { - MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS); + memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS); /* Reset DC predictions to 0 */ entropy->last_dc_val[ci] = 0; entropy->dc_context[ci] = 0; } if (!cinfo->progressive_mode || cinfo->Ss) { - MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS); + memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS); } } @@ -471,17 +471,17 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data) if (*thiscoef) { /* previously nonzero coef */ if (arith_decode(cinfo, st + 2)) { if (*thiscoef < 0) - *thiscoef += m1; + *thiscoef += (JCOEF)m1; else - *thiscoef += p1; + *thiscoef += (JCOEF)p1; } break; } if (arith_decode(cinfo, st + 1)) { /* newly nonzero coef */ if (arith_decode(cinfo, entropy->fixed_bin)) - *thiscoef = m1; + *thiscoef = (JCOEF)m1; else - *thiscoef = p1; + *thiscoef = (JCOEF)p1; break; } st += 3; k++; @@ -698,8 +698,8 @@ bad: /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG. * This ought to be an error condition, but we make it a warning. 
*/ - if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 || - (cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1)) + if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 || + cinfo->Ah != 0 || cinfo->Al != 0) WARNMS(cinfo, JWRN_NOT_SEQUENTIAL); /* Select MCU decoding routine */ entropy->pub.decode_mcu = decode_mcu; @@ -715,7 +715,7 @@ bad: if (entropy->dc_stats[tbl] == NULL) entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS); - MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS); + memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS); /* Initialize DC predictions to 0 */ entropy->last_dc_val[ci] = 0; entropy->dc_context[ci] = 0; @@ -727,7 +727,7 @@ bad: if (entropy->ac_stats[tbl] == NULL) entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS); - MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS); + memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS); } } diff --git a/3rdparty/libjpeg-turbo/src/jdatadst.c b/3rdparty/libjpeg-turbo/src/jdatadst.c index 246fffb58a..6b4fed2339 100644 --- a/3rdparty/libjpeg-turbo/src/jdatadst.c +++ b/3rdparty/libjpeg-turbo/src/jdatadst.c @@ -5,7 +5,7 @@ * Copyright (C) 1994-1996, Thomas G. Lane. * Modified 2009-2012 by Guido Vollbeding. * libjpeg-turbo Modifications: - * Copyright (C) 2013, 2016, D. R. Commander. + * Copyright (C) 2013, 2016, 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. 
* @@ -23,11 +23,6 @@ #include "jpeglib.h" #include "jerror.h" -#ifndef HAVE_STDLIB_H /* should declare malloc(),free() */ -extern void *malloc(size_t size); -extern void free(void *ptr); -#endif - /* Expanded data destination object for stdio output */ @@ -116,7 +111,7 @@ empty_output_buffer(j_compress_ptr cinfo) { my_dest_ptr dest = (my_dest_ptr)cinfo->dest; - if (JFWRITE(dest->outfile, dest->buffer, OUTPUT_BUF_SIZE) != + if (fwrite(dest->buffer, 1, OUTPUT_BUF_SIZE, dest->outfile) != (size_t)OUTPUT_BUF_SIZE) ERREXIT(cinfo, JERR_FILE_WRITE); @@ -141,7 +136,7 @@ empty_mem_output_buffer(j_compress_ptr cinfo) if (nextbuffer == NULL) ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10); - MEMCOPY(nextbuffer, dest->buffer, dest->bufsize); + memcpy(nextbuffer, dest->buffer, dest->bufsize); free(dest->newbuffer); @@ -175,7 +170,7 @@ term_destination(j_compress_ptr cinfo) /* Write any data remaining in the buffer */ if (datacount > 0) { - if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount) + if (fwrite(dest->buffer, 1, datacount, dest->outfile) != datacount) ERREXIT(cinfo, JERR_FILE_WRITE); } fflush(dest->outfile); diff --git a/3rdparty/libjpeg-turbo/src/jdatasrc.c b/3rdparty/libjpeg-turbo/src/jdatasrc.c index eadb4a2c90..e36a30d894 100644 --- a/3rdparty/libjpeg-turbo/src/jdatasrc.c +++ b/3rdparty/libjpeg-turbo/src/jdatasrc.c @@ -5,7 +5,7 @@ * Copyright (C) 1994-1996, Thomas G. Lane. * Modified 2009-2011 by Guido Vollbeding. * libjpeg-turbo Modifications: - * Copyright (C) 2013, 2016, D. R. Commander. + * Copyright (C) 2013, 2016, 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. 
* @@ -104,7 +104,7 @@ fill_input_buffer(j_decompress_ptr cinfo) my_src_ptr src = (my_src_ptr)cinfo->src; size_t nbytes; - nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE); + nbytes = fread(src->buffer, 1, INPUT_BUF_SIZE, src->infile); if (nbytes <= 0) { if (src->start_of_file) /* Treat empty input file as fatal error */ diff --git a/3rdparty/libjpeg-turbo/src/jddctmgr.c b/3rdparty/libjpeg-turbo/src/jddctmgr.c index 266f446623..e78d7bebe2 100644 --- a/3rdparty/libjpeg-turbo/src/jddctmgr.c +++ b/3rdparty/libjpeg-turbo/src/jddctmgr.c @@ -6,7 +6,7 @@ * Modified 2002-2010 by Guido Vollbeding. * libjpeg-turbo Modifications: * Copyright 2009 Pierre Ossman for Cendio AB - * Copyright (C) 2010, 2015, D. R. Commander. + * Copyright (C) 2010, 2015, 2022, D. R. Commander. * Copyright (C) 2013, MIPS Technologies, Inc., California. * For conditions of distribution and use, see the accompanying README.ijg * file. @@ -345,7 +345,7 @@ jinit_inverse_dct(j_decompress_ptr cinfo) compptr->dct_table = (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(multiplier_table)); - MEMZERO(compptr->dct_table, sizeof(multiplier_table)); + memset(compptr->dct_table, 0, sizeof(multiplier_table)); /* Mark multiplier table not yet set up for any method */ idct->cur_method[ci] = -1; } diff --git a/3rdparty/libjpeg-turbo/src/jdicc.c b/3rdparty/libjpeg-turbo/src/jdicc.c index a1a5b867ae..50aa9a9676 100644 --- a/3rdparty/libjpeg-turbo/src/jdicc.c +++ b/3rdparty/libjpeg-turbo/src/jdicc.c @@ -18,10 +18,6 @@ #include "jpeglib.h" #include "jerror.h" -#ifndef HAVE_STDLIB_H /* should declare malloc() */ -extern void *malloc(size_t size); -#endif - #define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */ #define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */ diff --git a/3rdparty/libjpeg-turbo/src/jdinput.c b/3rdparty/libjpeg-turbo/src/jdinput.c index deec618f26..1bc5aff1a7 100644 --- a/3rdparty/libjpeg-turbo/src/jdinput.c +++ 
b/3rdparty/libjpeg-turbo/src/jdinput.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1991-1997, Thomas G. Lane. * libjpeg-turbo Modifications: - * Copyright (C) 2010, 2016, 2018, D. R. Commander. + * Copyright (C) 2010, 2016, 2018, 2022, D. R. Commander. * Copyright (C) 2015, Google, Inc. * For conditions of distribution and use, see the accompanying README.ijg * file. @@ -264,7 +264,7 @@ latch_quant_tables(j_decompress_ptr cinfo) qtbl = (JQUANT_TBL *) (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(JQUANT_TBL)); - MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL)); + memcpy(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL)); compptr->quant_table = qtbl; } } diff --git a/3rdparty/libjpeg-turbo/src/jdmarker.c b/3rdparty/libjpeg-turbo/src/jdmarker.c index b964c3a1a6..f7eba615fd 100644 --- a/3rdparty/libjpeg-turbo/src/jdmarker.c +++ b/3rdparty/libjpeg-turbo/src/jdmarker.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1991-1998, Thomas G. Lane. * libjpeg-turbo Modifications: - * Copyright (C) 2012, 2015, D. R. Commander. + * Copyright (C) 2012, 2015, 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. 
* @@ -473,7 +473,7 @@ get_dht(j_decompress_ptr cinfo) for (i = 0; i < count; i++) INPUT_BYTE(cinfo, huffval[i], return FALSE); - MEMZERO(&huffval[count], (256 - count) * sizeof(UINT8)); + memset(&huffval[count], 0, (256 - count) * sizeof(UINT8)); length -= count; @@ -491,8 +491,8 @@ get_dht(j_decompress_ptr cinfo) if (*htblptr == NULL) *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo); - MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits)); - MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval)); + memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits)); + memcpy((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval)); } if (length != 0) diff --git a/3rdparty/libjpeg-turbo/src/jdmaster.c b/3rdparty/libjpeg-turbo/src/jdmaster.c index cbc8774b1f..a3690bf560 100644 --- a/3rdparty/libjpeg-turbo/src/jdmaster.c +++ b/3rdparty/libjpeg-turbo/src/jdmaster.c @@ -5,7 +5,7 @@ * Copyright (C) 1991-1997, Thomas G. Lane. * Modified 2002-2009 by Guido Vollbeding. * libjpeg-turbo Modifications: - * Copyright (C) 2009-2011, 2016, 2019, D. R. Commander. + * Copyright (C) 2009-2011, 2016, 2019, 2022, D. R. Commander. * Copyright (C) 2013, Linaro Limited. * Copyright (C) 2015, Google, Inc. 
* For conditions of distribution and use, see the accompanying README.ijg @@ -417,7 +417,7 @@ prepare_range_limit_table(j_decompress_ptr cinfo) table += (MAXJSAMPLE + 1); /* allow negative subscripts of simple table */ cinfo->sample_range_limit = table; /* First segment of "simple" table: limit[x] = 0 for x < 0 */ - MEMZERO(table - (MAXJSAMPLE + 1), (MAXJSAMPLE + 1) * sizeof(JSAMPLE)); + memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE)); /* Main part of "simple" table: limit[x] = x */ for (i = 0; i <= MAXJSAMPLE; i++) table[i] = (JSAMPLE)i; @@ -426,10 +426,10 @@ prepare_range_limit_table(j_decompress_ptr cinfo) for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++) table[i] = MAXJSAMPLE; /* Second half of post-IDCT table */ - MEMZERO(table + (2 * (MAXJSAMPLE + 1)), - (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE)); - MEMCOPY(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE), - cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE)); + memset(table + (2 * (MAXJSAMPLE + 1)), 0, + (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE)); + memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE), + cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE)); } diff --git a/3rdparty/libjpeg-turbo/src/jdphuff.c b/3rdparty/libjpeg-turbo/src/jdphuff.c index c6d82ca14b..9680ebcbd0 100644 --- a/3rdparty/libjpeg-turbo/src/jdphuff.c +++ b/3rdparty/libjpeg-turbo/src/jdphuff.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1995-1997, Thomas G. Lane. * libjpeg-turbo Modifications: - * Copyright (C) 2015-2016, 2018-2021, D. R. Commander. + * Copyright (C) 2015-2016, 2018-2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. 
* @@ -578,9 +578,9 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data) if (GET_BITS(1)) { if ((*thiscoef & p1) == 0) { /* do nothing if already set it */ if (*thiscoef >= 0) - *thiscoef += p1; + *thiscoef += (JCOEF)p1; else - *thiscoef += m1; + *thiscoef += (JCOEF)m1; } } } else { @@ -612,9 +612,9 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data) if (GET_BITS(1)) { if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */ if (*thiscoef >= 0) - *thiscoef += p1; + *thiscoef += (JCOEF)p1; else - *thiscoef += m1; + *thiscoef += (JCOEF)m1; } } } diff --git a/3rdparty/libjpeg-turbo/src/jerror.c b/3rdparty/libjpeg-turbo/src/jerror.c index 936c4f5d80..d544702937 100644 --- a/3rdparty/libjpeg-turbo/src/jerror.c +++ b/3rdparty/libjpeg-turbo/src/jerror.c @@ -3,8 +3,8 @@ * * This file was part of the Independent JPEG Group's software: * Copyright (C) 1991-1998, Thomas G. Lane. - * It was modified by The libjpeg-turbo Project to include only code relevant - * to libjpeg-turbo. + * libjpeg-turbo Modifications: + * Copyright (C) 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. 
* @@ -189,13 +189,13 @@ format_message(j_common_ptr cinfo, char *buffer) /* Format the message into the passed buffer */ if (isstring) - sprintf(buffer, msgtext, err->msg_parm.s); + snprintf(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s); else - sprintf(buffer, msgtext, - err->msg_parm.i[0], err->msg_parm.i[1], - err->msg_parm.i[2], err->msg_parm.i[3], - err->msg_parm.i[4], err->msg_parm.i[5], - err->msg_parm.i[6], err->msg_parm.i[7]); + snprintf(buffer, JMSG_LENGTH_MAX, msgtext, + err->msg_parm.i[0], err->msg_parm.i[1], + err->msg_parm.i[2], err->msg_parm.i[3], + err->msg_parm.i[4], err->msg_parm.i[5], + err->msg_parm.i[6], err->msg_parm.i[7]); } diff --git a/3rdparty/libjpeg-turbo/src/jerror.h b/3rdparty/libjpeg-turbo/src/jerror.h index 4476df2c93..eb44a1140a 100644 --- a/3rdparty/libjpeg-turbo/src/jerror.h +++ b/3rdparty/libjpeg-turbo/src/jerror.h @@ -5,7 +5,7 @@ * Copyright (C) 1994-1997, Thomas G. Lane. * Modified 1997-2009 by Guido Vollbeding. * libjpeg-turbo Modifications: - * Copyright (C) 2014, 2017, D. R. Commander. + * Copyright (C) 2014, 2017, 2021-2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. 
* @@ -103,7 +103,7 @@ JMESSAGE(JERR_MISMATCHED_QUANT_TABLE, "Cannot transcode due to multiple use of quantization table %d") JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data") JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change") -JMESSAGE(JERR_NOTIMPL, "Not implemented yet") +JMESSAGE(JERR_NOTIMPL, "Requested features are incompatible") JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time") #if JPEG_LIB_VERSION >= 70 JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined") @@ -268,6 +268,7 @@ JMESSAGE(JERR_BAD_DROP_SAMPLING, #define ERREXITS(cinfo, code, str) \ ((cinfo)->err->msg_code = (code), \ strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \ + (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo))) #define MAKESTMT(stuff) do { stuff } while (0) @@ -324,6 +325,7 @@ JMESSAGE(JERR_BAD_DROP_SAMPLING, #define TRACEMSS(cinfo, lvl, code, str) \ ((cinfo)->err->msg_code = (code), \ strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \ + (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl))) #endif /* JERROR_H */ diff --git a/3rdparty/libjpeg-turbo/src/jinclude.h b/3rdparty/libjpeg-turbo/src/jinclude.h index c1bcf7d9da..120614b25c 100644 --- a/3rdparty/libjpeg-turbo/src/jinclude.h +++ b/3rdparty/libjpeg-turbo/src/jinclude.h @@ -3,8 +3,8 @@ * * This file was part of the Independent JPEG Group's software: * Copyright (C) 1991-1994, Thomas G. Lane. - * It was modified by The libjpeg-turbo Project to include only code relevant - * to libjpeg-turbo. + * libjpeg-turbo Modifications: + * Copyright (C) 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -17,72 +17,117 @@ * JPEG library. Most applications need only include jpeglib.h. 
*/ +#ifndef __JINCLUDE_H__ +#define __JINCLUDE_H__ /* Include auto-config file to find out which system include files we need. */ #include "jconfig.h" /* auto configuration options */ +#include "jconfigint.h" #define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */ /* - * We need the NULL macro and size_t typedef. - * On an ANSI-conforming system it is sufficient to include <stddef.h>. - * Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to - * pull in <sys/types.h> as well. * Note that the core JPEG library does not require <stdio.h>; * only the default error handler and data source/destination modules do. * But we must pull it in because of the references to FILE in jpeglib.h. * You can remove those references if you want to compile without <stdio.h>. */ -#ifdef HAVE_STDDEF_H #include <stddef.h> -#endif - -#ifdef HAVE_STDLIB_H #include <stdlib.h> -#endif - -#ifdef NEED_SYS_TYPES_H -#include <sys/types.h> -#endif - #include <stdio.h> - -/* - * We need memory copying and zeroing functions, plus strncpy(). - * ANSI and System V implementations declare these in <string.h>. - * BSD doesn't have the mem() functions, but it does have bcopy()/bzero(). - * Some systems may declare memset and memcpy in <memory.h>. - * - * NOTE: we assume the size parameters to these functions are of type size_t. - * Change the casts in these macros if not! - */ - -#ifdef NEED_BSD_STRINGS - -#include <strings.h> -#define MEMZERO(target, size) \ - bzero((void *)(target), (size_t)(size)) -#define MEMCOPY(dest, src, size) \ - bcopy((const void *)(src), (void *)(dest), (size_t)(size)) - -#else /* not BSD, assume ANSI/SysV string lib */ - #include <string.h> -#define MEMZERO(target, size) \ - memset((void *)(target), 0, (size_t)(size)) -#define MEMCOPY(dest, src, size) \ - memcpy((void *)(dest), (const void *)(src), (size_t)(size)) - -#endif /* - * The modules that use fread() and fwrite() always invoke them through - * these macros. On some systems you may need to twiddle the argument casts. - * CAUTION: argument order is different from underlying functions!
+ * These macros/inline functions facilitate using Microsoft's "safe string" + * functions with Visual Studio builds without the need to scatter #ifdefs + * throughout the code base. */ -#define JFREAD(file, buf, sizeofbuf) \ - ((size_t)fread((void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file))) -#define JFWRITE(file, buf, sizeofbuf) \ - ((size_t)fwrite((const void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file))) + +#ifndef NO_GETENV + +#ifdef _MSC_VER + +static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name) +{ + size_t required_size; + + return (int)getenv_s(&required_size, buffer, buffer_size, name); +} + +#else /* _MSC_VER */ + +#include <errno.h> + +/* This provides a similar interface to the Microsoft/C11 getenv_s() function, + * but other than parameter validation, it has no advantages over getenv(). + */ + +static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name) +{ + char *env; + + if (!buffer) { + if (buffer_size == 0) + return 0; + else + return (errno = EINVAL); + } + if (buffer_size == 0) + return (errno = EINVAL); + if (!name) { + *buffer = 0; + return 0; + } + + env = getenv(name); + if (!env) + { + *buffer = 0; + return 0; + } + + if (strlen(env) + 1 > buffer_size) { + *buffer = 0; + return ERANGE; + } + + strncpy(buffer, env, buffer_size); + + return 0; +} + +#endif /* _MSC_VER */ + +#endif /* NO_GETENV */ + + +#ifndef NO_PUTENV + +#ifdef _WIN32 + +#define PUTENV_S(name, value) _putenv_s(name, value) + +#else + +/* This provides a similar interface to the Microsoft _putenv_s() function, but + * other than parameter validation, it has no advantages over setenv().
+ */ + +static INLINE int PUTENV_S(const char *name, const char *value) +{ + if (!name || !value) + return (errno = EINVAL); + + if (setenv(name, value, 1) != 0) + return errno; /* set by the failed setenv() */ + return 0; /* success: never report a stale errno */ +} + +#endif /* _WIN32 */ + +#endif /* NO_PUTENV */ + + +#endif /* __JINCLUDE_H__ */ diff --git a/3rdparty/libjpeg-turbo/src/jmemmgr.c b/3rdparty/libjpeg-turbo/src/jmemmgr.c index 70b8ec0c49..8f5a4ab1c7 100644 --- a/3rdparty/libjpeg-turbo/src/jmemmgr.c +++ b/3rdparty/libjpeg-turbo/src/jmemmgr.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1991-1997, Thomas G. Lane. * libjpeg-turbo Modifications: - * Copyright (C) 2016, 2021, D. R. Commander. + * Copyright (C) 2016, 2021-2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -37,12 +37,6 @@ #endif #include -#ifndef NO_GETENV -#ifndef HAVE_STDLIB_H /* should declare getenv() */ -extern char *getenv(const char *name); -#endif -#endif - LOCAL(size_t) round_up_pow2(size_t a, size_t b) @@ -1162,12 +1156,16 @@ jinit_memory_mgr(j_common_ptr cinfo) */ #ifndef NO_GETENV { - char *memenv; + char memenv[30] = { 0 }; - if ((memenv = getenv("JPEGMEM")) != NULL) { + if (!GETENV_S(memenv, 30, "JPEGMEM") && strlen(memenv) > 0) { char ch = 'x'; +#ifdef _MSC_VER + if (sscanf_s(memenv, "%ld%c", &max_to_use, &ch, 1) > 0) { +#else if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) { +#endif if (ch == 'm' || ch == 'M') max_to_use *= 1000L; mem->pub.max_memory_to_use = max_to_use * 1000L; diff --git a/3rdparty/libjpeg-turbo/src/jmemnobs.c b/3rdparty/libjpeg-turbo/src/jmemnobs.c index 089be8f500..cd6571ba1c 100644 --- a/3rdparty/libjpeg-turbo/src/jmemnobs.c +++ b/3rdparty/libjpeg-turbo/src/jmemnobs.c @@ -22,11 +22,6 @@ #include "jpeglib.h" #include "jmemsys.h" /* import the system-dependent declarations */ -#ifndef HAVE_STDLIB_H /* should declare malloc(),free() */ -extern void *malloc(size_t size); -extern void free(void *ptr); -#endif - /* * Memory allocation and freeing
are controlled by the regular library diff --git a/3rdparty/libjpeg-turbo/src/jmorecfg.h b/3rdparty/libjpeg-turbo/src/jmorecfg.h index fb3a9cf411..b33a991914 100644 --- a/3rdparty/libjpeg-turbo/src/jmorecfg.h +++ b/3rdparty/libjpeg-turbo/src/jmorecfg.h @@ -100,11 +100,7 @@ typedef unsigned char UINT8; /* UINT16 must hold at least the values 0..65535. */ -#ifdef HAVE_UNSIGNED_SHORT typedef unsigned short UINT16; -#else /* not HAVE_UNSIGNED_SHORT */ -typedef unsigned int UINT16; -#endif /* HAVE_UNSIGNED_SHORT */ /* INT16 must hold at least the values -32768..32767. */ diff --git a/3rdparty/libjpeg-turbo/src/jpegint.h b/3rdparty/libjpeg-turbo/src/jpegint.h index 8c8534793a..6af9e2a179 100644 --- a/3rdparty/libjpeg-turbo/src/jpegint.h +++ b/3rdparty/libjpeg-turbo/src/jpegint.h @@ -373,12 +373,3 @@ extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */ /* Arithmetic coding probability estimation tables in jaricom.c */ extern const JLONG jpeg_aritab[]; - -/* Suppress undefined-structure complaints if necessary. */ - -#ifdef INCOMPLETE_TYPES_BROKEN -#ifndef AM_MEMORY_MANAGER /* only jmemmgr.c defines these */ -struct jvirt_sarray_control { long dummy; }; -struct jvirt_barray_control { long dummy; }; -#endif -#endif /* INCOMPLETE_TYPES_BROKEN */ diff --git a/3rdparty/libjpeg-turbo/src/jstdhuff.c b/3rdparty/libjpeg-turbo/src/jstdhuff.c index 036d6495a5..345b513d4d 100644 --- a/3rdparty/libjpeg-turbo/src/jstdhuff.c +++ b/3rdparty/libjpeg-turbo/src/jstdhuff.c @@ -4,7 +4,7 @@ * This file was part of the Independent JPEG Group's software: * Copyright (C) 1991-1998, Thomas G. Lane. * libjpeg-turbo Modifications: - * Copyright (C) 2013, D. R. Commander. + * Copyright (C) 2013, 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. 
* @@ -29,7 +29,7 @@ add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits, return; /* Copy the number-of-symbols-of-each-code-length counts */ - MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits)); + memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits)); /* Validate the counts. We do this here mainly so we can copy the right * number of symbols from the val[] array, without risking marching off @@ -41,8 +41,9 @@ add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits, if (nsymbols < 1 || nsymbols > 256) ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); - MEMCOPY((*htblptr)->huffval, val, nsymbols * sizeof(UINT8)); - MEMZERO(&((*htblptr)->huffval[nsymbols]), (256 - nsymbols) * sizeof(UINT8)); + memcpy((*htblptr)->huffval, val, nsymbols * sizeof(UINT8)); + memset(&((*htblptr)->huffval[nsymbols]), 0, + (256 - nsymbols) * sizeof(UINT8)); /* Initialize sent_table FALSE so table will be written to JPEG file. */ (*htblptr)->sent_table = FALSE; diff --git a/3rdparty/libjpeg-turbo/src/jutils.c b/3rdparty/libjpeg-turbo/src/jutils.c index 5c5bb17dc5..d86271624a 100644 --- a/3rdparty/libjpeg-turbo/src/jutils.c +++ b/3rdparty/libjpeg-turbo/src/jutils.c @@ -3,8 +3,8 @@ * * This file was part of the Independent JPEG Group's software: * Copyright (C) 1991-1996, Thomas G. Lane. - * It was modified by The libjpeg-turbo Project to include only code - * relevant to libjpeg-turbo. + * libjpeg-turbo Modifications: + * Copyright (C) 2022, D. R. Commander. * For conditions of distribution and use, see the accompanying README.ijg * file. * @@ -110,7 +110,7 @@ jcopy_sample_rows(JSAMPARRAY input_array, int source_row, for (row = num_rows; row > 0; row--) { inptr = *input_array++; outptr = *output_array++; - MEMCOPY(outptr, inptr, count); + memcpy(outptr, inptr, count); } } @@ -120,7 +120,7 @@ jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row, JDIMENSION num_blocks) /* Copy a row of coefficient blocks from one place to another. 
*/ { - MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF))); + memcpy(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF))); } @@ -129,5 +129,5 @@ jzero_far(void *target, size_t bytestozero) /* Zero out a chunk of memory. */ /* This might be sample-array data, block-array data, or alloc_large data. */ { - MEMZERO(target, bytestozero); + memset(target, 0, bytestozero); } diff --git a/3rdparty/libjpeg-turbo/src/jversion.h.in b/3rdparty/libjpeg-turbo/src/jversion.h.in new file mode 100644 index 0000000000..dca4f08fdb --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/jversion.h.in @@ -0,0 +1,54 @@ +/* + * jversion.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2010, 2012-2022, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains software version identification. + */ + + +#if JPEG_LIB_VERSION >= 80 + +#define JVERSION "8d 15-Jan-2012" + +#elif JPEG_LIB_VERSION >= 70 + +#define JVERSION "7 27-Jun-2009" + +#else + +#define JVERSION "6b 27-Mar-1998" + +#endif + +/* + * NOTE: It is our convention to place the authors in the following order: + * - libjpeg-turbo authors (2009-) in descending order of the date of their + * most recent contribution to the project, then in ascending order of the + * date of their first contribution to the project, then in alphabetical + * order + * - Upstream authors in descending order of the date of the first inclusion of + * their code + */ + +#define JCOPYRIGHT \ + "Copyright (C) 2009-2022 D. R. 
Commander\n" \ + "Copyright (C) 2015, 2020 Google, Inc.\n" \ + "Copyright (C) 2019-2020 Arm Limited\n" \ + "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \ + "Copyright (C) 2011-2016 Siarhei Siamashka\n" \ + "Copyright (C) 2015 Intel Corporation\n" \ + "Copyright (C) 2013-2014 Linaro Limited\n" \ + "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \ + "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \ + "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \ + "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \ + "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding" + +#define JCOPYRIGHT_SHORT \ + "Copyright (C) @COPYRIGHT_YEAR@ The libjpeg-turbo Project and many others" diff --git a/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt b/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt new file mode 100644 index 0000000000..8521e42b44 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt @@ -0,0 +1,540 @@ +macro(simd_fail message) + if(REQUIRE_SIMD) + message(FATAL_ERROR "${message}.") + else() + message(WARNING "${message}. Performance will suffer.") + set(WITH_SIMD 0 PARENT_SCOPE) + endif() +endmacro() + + +############################################################################### +# x86[-64] (NASM) +############################################################################### + +if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386") + +set(CMAKE_ASM_NASM_FLAGS_DEBUG_INIT "-g") +set(CMAKE_ASM_NASM_FLAGS_RELWITHDEBINFO_INIT "-g") + +# Allow the location of the NASM executable to be specified using the ASM_NASM +# environment variable. This should happen automatically, but unfortunately +# enable_language(ASM_NASM) doesn't parse the ASM_NASM environment variable +# until after CMAKE_ASM_NASM_COMPILER has been populated with the results of +# searching for NASM or Yasm in the PATH. 
+if(NOT DEFINED CMAKE_ASM_NASM_COMPILER AND DEFINED ENV{ASM_NASM}) + set(CMAKE_ASM_NASM_COMPILER $ENV{ASM_NASM}) +endif() + +if(CPU_TYPE STREQUAL "x86_64") + if(CYGWIN) + set(CMAKE_ASM_NASM_OBJECT_FORMAT win64) + endif() + if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32") + set(CMAKE_ASM_NASM_OBJECT_FORMAT elfx32) + endif() +elseif(CPU_TYPE STREQUAL "i386") + if(BORLAND) + set(CMAKE_ASM_NASM_OBJECT_FORMAT obj) + elseif(CYGWIN) + set(CMAKE_ASM_NASM_OBJECT_FORMAT win32) + endif() +endif() + +if(NOT REQUIRE_SIMD) + include(CheckLanguage) + check_language(ASM_NASM) + if(NOT CMAKE_ASM_NASM_COMPILER) + simd_fail("SIMD extensions disabled: could not find NASM compiler") + return() + endif() +endif() +enable_language(ASM_NASM) +message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}") + +if(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^macho") + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DMACHO") +elseif(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^elf") + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DELF") + set(CMAKE_ASM_NASM_DEBUG_FORMAT "dwarf2") +endif() +if(CPU_TYPE STREQUAL "x86_64") + if(WIN32 OR CYGWIN) + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DWIN64") + endif() + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -D__x86_64__") +elseif(CPU_TYPE STREQUAL "i386") + if(BORLAND) + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DOBJ32") + elseif(WIN32 OR CYGWIN) + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DWIN32") + endif() +endif() + +message(STATUS "CMAKE_ASM_NASM_OBJECT_FORMAT = ${CMAKE_ASM_NASM_OBJECT_FORMAT}") + +if(NOT CMAKE_ASM_NASM_OBJECT_FORMAT) + simd_fail("SIMD extensions disabled: could not determine NASM object format") + return() +endif() + +get_filename_component(CMAKE_ASM_NASM_COMPILER_TYPE + "${CMAKE_ASM_NASM_COMPILER}" NAME_WE) +if(CMAKE_ASM_NASM_COMPILER_TYPE MATCHES "yasm") + foreach(var CMAKE_ASM_NASM_FLAGS_DEBUG CMAKE_ASM_NASM_FLAGS_RELWITHDEBINFO) + if(${var} STREQUAL "-g") + if(CMAKE_ASM_NASM_DEBUG_FORMAT) + 
set_property(CACHE ${var} PROPERTY VALUE "-g ${CMAKE_ASM_NASM_DEBUG_FORMAT}") + else() + set_property(CACHE ${var} PROPERTY VALUE "") + endif() + endif() + endforeach() +endif() + +if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)) + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DPIC") +endif() + +string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UC) +set(EFFECTIVE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} ${CMAKE_ASM_NASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}") +message(STATUS "CMAKE_ASM_NASM_FLAGS = ${EFFECTIVE_ASM_NASM_FLAGS}") + +set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -I\"${CMAKE_CURRENT_SOURCE_DIR}/nasm/\" -I\"${CMAKE_CURRENT_SOURCE_DIR}/${CPU_TYPE}/\"") + +set(GREP grep) +if(CMAKE_SYSTEM_NAME STREQUAL "SunOS") + set(GREP ggrep) +endif() +add_custom_target(jsimdcfg COMMAND + ${CMAKE_C_COMPILER} -E -I${CMAKE_BINARY_DIR} -I${CMAKE_CURRENT_BINARY_DIR} + -I${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/nasm/jsimdcfg.inc.h | + ${GREP} -E '^[\;%]|^\ %' | sed 's%_cpp_protection_%%' | + sed 's@% define@%define@g' >${CMAKE_CURRENT_SOURCE_DIR}/nasm/jsimdcfg.inc) + +if(CPU_TYPE STREQUAL "x86_64") + set(SIMD_SOURCES x86_64/jsimdcpu.asm x86_64/jfdctflt-sse.asm + x86_64/jccolor-sse2.asm x86_64/jcgray-sse2.asm x86_64/jchuff-sse2.asm + x86_64/jcphuff-sse2.asm x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm + x86_64/jdmerge-sse2.asm x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm + x86_64/jfdctint-sse2.asm x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm + x86_64/jidctint-sse2.asm x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm + x86_64/jquanti-sse2.asm + x86_64/jccolor-avx2.asm x86_64/jcgray-avx2.asm x86_64/jcsample-avx2.asm + x86_64/jdcolor-avx2.asm x86_64/jdmerge-avx2.asm x86_64/jdsample-avx2.asm + x86_64/jfdctint-avx2.asm x86_64/jidctint-avx2.asm x86_64/jquanti-avx2.asm) +else() + set(SIMD_SOURCES i386/jsimdcpu.asm i386/jfdctflt-3dn.asm + i386/jidctflt-3dn.asm i386/jquant-3dn.asm + i386/jccolor-mmx.asm i386/jcgray-mmx.asm
i386/jcsample-mmx.asm + i386/jdcolor-mmx.asm i386/jdmerge-mmx.asm i386/jdsample-mmx.asm + i386/jfdctfst-mmx.asm i386/jfdctint-mmx.asm i386/jidctfst-mmx.asm + i386/jidctint-mmx.asm i386/jidctred-mmx.asm i386/jquant-mmx.asm + i386/jfdctflt-sse.asm i386/jidctflt-sse.asm i386/jquant-sse.asm + i386/jccolor-sse2.asm i386/jcgray-sse2.asm i386/jchuff-sse2.asm + i386/jcphuff-sse2.asm i386/jcsample-sse2.asm i386/jdcolor-sse2.asm + i386/jdmerge-sse2.asm i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm + i386/jfdctint-sse2.asm i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm + i386/jidctint-sse2.asm i386/jidctred-sse2.asm i386/jquantf-sse2.asm + i386/jquanti-sse2.asm + i386/jccolor-avx2.asm i386/jcgray-avx2.asm i386/jcsample-avx2.asm + i386/jdcolor-avx2.asm i386/jdmerge-avx2.asm i386/jdsample-avx2.asm + i386/jfdctint-avx2.asm i386/jidctint-avx2.asm i386/jquanti-avx2.asm) +endif() + +if(MSVC_IDE) + set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}") + string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}") +elseif(XCODE) + set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}") + string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}") +endif() + +file(GLOB INC_FILES nasm/*.inc) + +foreach(file ${SIMD_SOURCES}) + set(OBJECT_DEPENDS "") + if(${file} MATCHES jccolor) + string(REGEX REPLACE "jccolor" "jccolext" DEPFILE ${file}) + set(OBJECT_DEPENDS ${OBJECT_DEPENDS} + ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}) + endif() + if(${file} MATCHES jcgray) + string(REGEX REPLACE "jcgray" "jcgryext" DEPFILE ${file}) + set(OBJECT_DEPENDS ${OBJECT_DEPENDS} + ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}) + endif() + if(${file} MATCHES jdcolor) + string(REGEX REPLACE "jdcolor" "jdcolext" DEPFILE ${file}) + set(OBJECT_DEPENDS ${OBJECT_DEPENDS} + ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}) + endif() + if(${file} MATCHES jdmerge) + string(REGEX REPLACE "jdmerge" "jdmrgext" DEPFILE ${file}) + set(OBJECT_DEPENDS ${OBJECT_DEPENDS} + ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}) + endif() 
+ set(OBJECT_DEPENDS ${OBJECT_DEPENDS} ${INC_FILES}) + if(MSVC_IDE OR XCODE) + # The CMake Visual Studio generators do not work properly with the ASM_NASM + # language, so we have to go rogue here and use a custom command like we + # did in prior versions of libjpeg-turbo. (This is why we can't have nice + # things.) + string(REGEX REPLACE "${CPU_TYPE}/" "" filename ${file}) + set(SIMD_OBJ ${OBJDIR}/${filename}${CMAKE_C_OUTPUT_EXTENSION}) + add_custom_command(OUTPUT ${SIMD_OBJ} DEPENDS ${file} ${OBJECT_DEPENDS} + COMMAND ${CMAKE_ASM_NASM_COMPILER} -f${CMAKE_ASM_NASM_OBJECT_FORMAT} + ${CMAKE_ASM_NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${file} + -o${SIMD_OBJ}) + set(SIMD_OBJS ${SIMD_OBJS} ${SIMD_OBJ}) + else() + set_source_files_properties(${file} PROPERTIES OBJECT_DEPENDS + "${OBJECT_DEPENDS}") + endif() +endforeach() + +if(MSVC_IDE OR XCODE) + set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE) + add_library(simd OBJECT ${CPU_TYPE}/jsimd.c) + add_custom_target(simd-objs DEPENDS ${SIMD_OBJS}) + add_dependencies(simd simd-objs) +else() + add_library(simd OBJECT ${SIMD_SOURCES} ${CPU_TYPE}/jsimd.c) +endif() +if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)) + set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) +endif() + + +############################################################################### +# Arm (Intrinsics or GAS) +############################################################################### + +elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm") + +# If Neon instructions are not explicitly enabled at compile time (e.g. using +# -mfpu=neon) with an AArch32 Linux or Android build, then the AArch32 SIMD +# dispatcher will parse /proc/cpuinfo to determine whether the Neon SIMD +# extensions can be enabled at run time. In order to support all AArch32 CPUs +# using the same code base, i.e. 
to support run-time FPU and Neon +# auto-detection, it is necessary to compile the scalar C source code using +# -mfloat-abi=soft (which is usually the default) but compile the intrinsics +# implementation of the Neon SIMD extensions using -mfloat-abi=softfp. The +# following test determines whether -mfloat-abi=softfp should be explicitly +# added to the compile flags for the intrinsics implementation of the Neon SIMD +# extensions. +if(BITS EQUAL 32) + check_c_source_compiles(" + #if defined(__ARM_NEON__) || (!defined(__linux__) && !defined(ANDROID) && !defined(__ANDROID__)) + #error \"Neon run-time auto-detection will not be used\" + #endif + #if __ARM_PCS_VFP == 1 + #error \"float ABI = hard\" + #endif + #if __SOFTFP__ != 1 + #error \"float ABI = softfp\" + #endif + int main(void) { return 0; }" NEED_SOFTFP_FOR_INTRINSICS) + if(NEED_SOFTFP_FOR_INTRINSICS) + set(SOFTFP_FLAG -mfloat-abi=softfp) + endif() +endif() + +if(BITS EQUAL 32) + set(CMAKE_REQUIRED_FLAGS "-mfpu=neon ${SOFTFP_FLAG}") + check_c_source_compiles(" + #include <arm_neon.h> + int main(int argc, char **argv) { + uint16x8_t input = vdupq_n_u16((uint16_t)argc); + uint8x8_t output = vmovn_u16(input); + return (int)output[0]; + }" HAVE_NEON) + if(NOT HAVE_NEON) + simd_fail("SIMD extensions not available for this architecture") + return() + endif() +endif() +check_c_source_compiles(" + #include <arm_neon.h> + int main(int argc, char **argv) { + int16_t input[] = { + (int16_t)argc, (int16_t)argc, (int16_t)argc, (int16_t)argc, + (int16_t)argc, (int16_t)argc, (int16_t)argc, (int16_t)argc, + (int16_t)argc, (int16_t)argc, (int16_t)argc, (int16_t)argc + }; + int16x4x3_t output = vld1_s16_x3(input); + vst3_s16(input, output); + return (int)input[0]; + }" HAVE_VLD1_S16_X3) +check_c_source_compiles(" + #include <arm_neon.h> + int main(int argc, char **argv) { + uint16_t input[] = { + (uint16_t)argc, (uint16_t)argc, (uint16_t)argc, (uint16_t)argc, + (uint16_t)argc, (uint16_t)argc, (uint16_t)argc, (uint16_t)argc + }; + uint16x4x2_t output =
vld1_u16_x2(input); + vst2_u16(input, output); + return (int)input[0]; + }" HAVE_VLD1_U16_X2) +check_c_source_compiles(" + #include <arm_neon.h> + int main(int argc, char **argv) { + uint8_t input[] = { + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, + (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc + }; + uint8x16x4_t output = vld1q_u8_x4(input); + vst4q_u8(input, output); + return (int)input[0]; + }" HAVE_VLD1Q_U8_X4) +if(BITS EQUAL 32) + unset(CMAKE_REQUIRED_FLAGS) +endif() +configure_file(arm/neon-compat.h.in arm/neon-compat.h @ONLY) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/arm) + +# GCC 11 and earlier and some older versions of Clang do not have a full or +# optimal set of Neon intrinsics, so for performance reasons, when using those +# compilers, we default to using the older GAS implementation of the Neon SIMD +# extensions for certain algorithms. The presence or absence of the three +# intrinsics we tested above is a reasonable proxy for this, except with GCC 10 +# and 11.
+if((HAVE_VLD1_S16_X3 AND HAVE_VLD1_U16_X2 AND HAVE_VLD1Q_U8_X4 AND + (NOT CMAKE_COMPILER_IS_GNUCC OR + CMAKE_C_COMPILER_VERSION VERSION_EQUAL 12.0.0 OR + CMAKE_C_COMPILER_VERSION VERSION_GREATER 12.0.0))) + set(DEFAULT_NEON_INTRINSICS 1) +else() + set(DEFAULT_NEON_INTRINSICS 0) +endif() +option(NEON_INTRINSICS + "Because GCC (as of this writing) and some older versions of Clang do not have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with those compilers is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms. Setting this option forces the full Neon intrinsics implementation to be used with all compilers. Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers." + ${DEFAULT_NEON_INTRINSICS}) +if(NOT NEON_INTRINSICS) + enable_language(ASM) + + set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}") + + # Test whether gas-preprocessor.pl would be needed to build the GAS + # implementation of the Neon SIMD extensions. If so, then automatically + # enable the full Neon intrinsics implementation. + if(CPU_TYPE STREQUAL "arm") + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S " + .text + .fpu neon + .arch armv7a + .object_arch armv4 + .arm + pld [r0] + vmovn.u16 d0, q0") + else() + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S " + .text + MYVAR .req x0 + movi v0.16b, #100 + mov MYVAR, #100 + .unreq MYVAR") + endif() + separate_arguments(CMAKE_ASM_FLAGS_SEP UNIX_COMMAND "${CMAKE_ASM_FLAGS}") + execute_process(COMMAND ${CMAKE_ASM_COMPILER} ${CMAKE_ASM_FLAGS_SEP} + -x assembler-with-cpp -c ${CMAKE_CURRENT_BINARY_DIR}/gastest.S + RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR) + if(NOT RESULT EQUAL 0) + message(WARNING "GAS appears to be broken. 
Using the full Neon SIMD intrinsics implementation.") + set(NEON_INTRINSICS 1 CACHE INTERNAL "" FORCE) + endif() +endif() +boolean_number(NEON_INTRINSICS PARENT_SCOPE) +if(NEON_INTRINSICS) + add_definitions(-DNEON_INTRINSICS) + message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})") +else() + message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})") +endif() + +set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c + arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c + arm/jidctred-neon.c arm/jquanti-neon.c) +if(NEON_INTRINSICS) + set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c) +endif() +if(NEON_INTRINSICS OR BITS EQUAL 64) + set(SIMD_SOURCES ${SIMD_SOURCES} arm/jidctfst-neon.c) +endif() +if(NEON_INTRINSICS OR BITS EQUAL 32) + set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c + arm/jdcolor-neon.c arm/jfdctint-neon.c) +endif() +if(BITS EQUAL 32) + set_source_files_properties(${SIMD_SOURCES} PROPERTIES COMPILE_FLAGS "-mfpu=neon ${SOFTFP_FLAG}") +endif() +if(NOT NEON_INTRINSICS) + string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UC) + set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}") + message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}") + + set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S) +endif() + +add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c) + +if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) + set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) +endif() + + +############################################################################### +# MIPS (GAS) +############################################################################### + +elseif(CPU_TYPE STREQUAL "mips" OR CPU_TYPE STREQUAL "mipsel") + +enable_language(ASM) + +string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UC) +set(EFFECTIVE_ASM_FLAGS
"${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}") +message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}") + +set(CMAKE_REQUIRED_FLAGS -mdspr2) + +check_c_source_compiles(" + #if !(defined(__mips__) && __mips_isa_rev >= 2) + #error MIPS DSPr2 is currently only available on MIPS32r2 platforms. + #endif + int main(void) { + int c = 0, a = 0, b = 0; + __asm__ __volatile__ ( + \"precr.qb.ph %[c], %[a], %[b]\" + : [c] \"=r\" (c) + : [a] \"r\" (a), [b] \"r\" (b) + ); + return c; + }" HAVE_DSPR2) + +unset(CMAKE_REQUIRED_FLAGS) + +if(NOT HAVE_DSPR2) + simd_fail("SIMD extensions not available for this CPU") + return() +endif() + +add_library(simd OBJECT mips/jsimd_dspr2.S mips/jsimd.c) + +if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) + set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) +endif() + +############################################################################### +# MIPS64 (Intrinsics) +############################################################################### + +elseif(CPU_TYPE STREQUAL "loongson" OR CPU_TYPE MATCHES "^mips64") + +set(CMAKE_REQUIRED_FLAGS -Wa,-mloongson-mmi,-mloongson-ext) + +check_c_source_compiles(" + int main(void) { + int c = 0, a = 0, b = 0; + asm ( + \"paddb %0, %1, %2\" + : \"=f\" (c) + : \"f\" (a), \"f\" (b) + ); + return c; + }" HAVE_MMI) + +unset(CMAKE_REQUIRED_FLAGS) + +if(NOT HAVE_MMI) + simd_fail("SIMD extensions not available for this CPU") + return() +endif() + +set(SIMD_SOURCES mips64/jccolor-mmi.c mips64/jcgray-mmi.c mips64/jcsample-mmi.c + mips64/jdcolor-mmi.c mips64/jdmerge-mmi.c mips64/jdsample-mmi.c + mips64/jfdctfst-mmi.c mips64/jfdctint-mmi.c mips64/jidctfst-mmi.c + mips64/jidctint-mmi.c mips64/jquanti-mmi.c) + +if(CMAKE_COMPILER_IS_GNUCC) + foreach(file ${SIMD_SOURCES}) + set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS + " -fno-strict-aliasing") + endforeach() +endif() +foreach(file ${SIMD_SOURCES}) + set_property(SOURCE ${file} APPEND_STRING PROPERTY 
COMPILE_FLAGS + " -Wa,-mloongson-mmi,-mloongson-ext") +endforeach() + +add_library(simd OBJECT ${SIMD_SOURCES} mips64/jsimd.c) + +if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) + set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) +endif() + +############################################################################### +# PowerPC (Intrinsics) +############################################################################### + +elseif(CPU_TYPE STREQUAL "powerpc") + +set(CMAKE_REQUIRED_FLAGS -maltivec) + +check_c_source_compiles(" + #include <altivec.h> + int main(void) { + __vector int vi = { 0, 0, 0, 0 }; + int i[4]; + vec_st(vi, 0, i); + return i[0]; + }" HAVE_ALTIVEC) + +unset(CMAKE_REQUIRED_FLAGS) + +if(NOT HAVE_ALTIVEC) + simd_fail("SIMD extensions not available for this CPU (PowerPC SPE)") + return() +endif() + +set(SIMD_SOURCES powerpc/jccolor-altivec.c powerpc/jcgray-altivec.c + powerpc/jcsample-altivec.c powerpc/jdcolor-altivec.c + powerpc/jdmerge-altivec.c powerpc/jdsample-altivec.c + powerpc/jfdctfst-altivec.c powerpc/jfdctint-altivec.c + powerpc/jidctfst-altivec.c powerpc/jidctint-altivec.c + powerpc/jquanti-altivec.c) + +set_source_files_properties(${SIMD_SOURCES} PROPERTIES + COMPILE_FLAGS -maltivec) + +add_library(simd OBJECT ${SIMD_SOURCES} powerpc/jsimd.c) + +if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) + set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) +endif() + + +############################################################################### +# None +############################################################################### + +else() + +simd_fail("SIMD extensions not available for this CPU (${CMAKE_SYSTEM_PROCESSOR})") + +endif() # CPU_TYPE diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jccolext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jccolext-neon.c new file mode 100644 index 0000000000..362102d2b2 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jccolext-neon.c @@
-0,0 +1,148 @@ +/* + * jccolext-neon.c - colorspace conversion (32-bit Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jccolor-neon.c */ + + +/* RGB -> YCbCr conversion is defined by the following equations: + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 + * + * Avoid floating point arithmetic by using shifted integer constants: + * 0.29899597 = 19595 * 2^-16 + * 0.58700561 = 38470 * 2^-16 + * 0.11399841 = 7471 * 2^-16 + * 0.16874695 = 11059 * 2^-16 + * 0.33125305 = 21709 * 2^-16 + * 0.50000000 = 32768 * 2^-16 + * 0.41868592 = 27439 * 2^-16 + * 0.08131409 = 5329 * 2^-16 + * These constants are defined in jccolor-neon.c + * + * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively + * rounds up or down the result via integer truncation. 
+ */ + +void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + /* Pointer to RGB(X/A) input data */ + JSAMPROW inptr; + /* Pointers to Y, Cb, and Cr output data */ + JSAMPROW outptr0, outptr1, outptr2; + /* Allocate temporary buffer for final (image_width % 8) pixels in row. */ + ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE]; + + /* Set up conversion constants. */ +#ifdef HAVE_VLD1_U16_X2 + const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts); +#else + /* GCC does not currently support the intrinsic vld1_<type>_x2(). */ + const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts); + const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4); + const uint16x4x2_t consts = { { consts1, consts2 } }; +#endif + const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767); + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + int cols_remaining = image_width; + for (; cols_remaining > 0; cols_remaining -= 8) { + + /* To prevent buffer overread by the vector load instructions, the last + * (image_width % 8) columns of data are first memcopied to a temporary + * buffer large enough to accommodate the vector load.
+ */ + if (cols_remaining < 8) { + memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); + inptr = tmp_buf; + } + +#if RGB_PIXELSIZE == 4 + uint8x8x4_t input_pixels = vld4_u8(inptr); +#else + uint8x8x3_t input_pixels = vld3_u8(inptr); +#endif + uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]); + uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]); + uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0); + y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1); + y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2); + uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0); + y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1); + y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2); + + /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */ + uint32x4_t cb_low = scaled_128_5; + cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3); + cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0); + cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1); + uint32x4_t cb_high = scaled_128_5; + cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3); + cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0); + cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1); + + /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */ + uint32x4_t cr_low = scaled_128_5; + cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1); + cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2); + cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3); + uint32x4_t cr_high = scaled_128_5; + cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1); + cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2); + cr_high = 
vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. */ + uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16), + vrshrn_n_u32(y_high, 16)); + /* Descale Cb values (right shift) and narrow to 16-bit. */ + uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16), + vshrn_n_u32(cb_high, 16)); + /* Descale Cr values (right shift) and narrow to 16-bit. */ + uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16), + vshrn_n_u32(cr_high, 16)); + /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer + * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. + */ + vst1_u8(outptr0, vmovn_u16(y_u16)); + vst1_u8(outptr1, vmovn_u16(cb_u16)); + vst1_u8(outptr2, vmovn_u16(cr_u16)); + + /* Increment pointers. */ + inptr += (8 * RGB_PIXELSIZE); + outptr0 += 8; + outptr1 += 8; + outptr2 += 8; + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jchuff-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jchuff-neon.c new file mode 100644 index 0000000000..19d94f720d --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jchuff-neon.c @@ -0,0 +1,334 @@ +/* + * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. 
Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * NOTE: All referenced figures are from + * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" +#include "../jchuff.h" +#include "neon-compat.h" + +#include <limits.h> + +#include <arm_neon.h> + + +JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, + JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + uint8_t block_nbits[DCTSIZE2]; + uint16_t block_diff[DCTSIZE2]; + + /* Load rows of coefficients from DCT block in zig-zag order. */ + + /* Compute DC coefficient difference value. (F.1.1.5.1) */ + int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val); + row0 = vld1q_lane_s16(block + 1, row0, 1); + row0 = vld1q_lane_s16(block + 8, row0, 2); + row0 = vld1q_lane_s16(block + 16, row0, 3); + row0 = vld1q_lane_s16(block + 9, row0, 4); + row0 = vld1q_lane_s16(block + 2, row0, 5); + row0 = vld1q_lane_s16(block + 3, row0, 6); + row0 = vld1q_lane_s16(block + 10, row0, 7); + + int16x8_t row1 = vld1q_dup_s16(block + 17); + row1 = vld1q_lane_s16(block + 24, row1, 1); + row1 = vld1q_lane_s16(block + 32, row1, 2); + row1 = vld1q_lane_s16(block + 25, row1, 3); + row1 = vld1q_lane_s16(block + 18, row1, 4); + row1 = vld1q_lane_s16(block + 11, row1, 5); + row1 = vld1q_lane_s16(block + 4, row1, 6); + row1 = vld1q_lane_s16(block + 5, row1, 7); + + int16x8_t row2 = vld1q_dup_s16(block + 12); + row2 = vld1q_lane_s16(block + 19, row2, 1); + row2 = vld1q_lane_s16(block + 26, row2, 2); + row2 = vld1q_lane_s16(block + 33, row2, 3); + row2 = vld1q_lane_s16(block + 40, row2, 4); + row2 = vld1q_lane_s16(block + 48, row2, 5); + row2 = vld1q_lane_s16(block
+ 41, row2, 6); + row2 = vld1q_lane_s16(block + 34, row2, 7); + + int16x8_t row3 = vld1q_dup_s16(block + 27); + row3 = vld1q_lane_s16(block + 20, row3, 1); + row3 = vld1q_lane_s16(block + 13, row3, 2); + row3 = vld1q_lane_s16(block + 6, row3, 3); + row3 = vld1q_lane_s16(block + 7, row3, 4); + row3 = vld1q_lane_s16(block + 14, row3, 5); + row3 = vld1q_lane_s16(block + 21, row3, 6); + row3 = vld1q_lane_s16(block + 28, row3, 7); + + int16x8_t abs_row0 = vabsq_s16(row0); + int16x8_t abs_row1 = vabsq_s16(row1); + int16x8_t abs_row2 = vabsq_s16(row2); + int16x8_t abs_row3 = vabsq_s16(row3); + + int16x8_t row0_lz = vclzq_s16(abs_row0); + int16x8_t row1_lz = vclzq_s16(abs_row1); + int16x8_t row2_lz = vclzq_s16(abs_row2); + int16x8_t row3_lz = vclzq_s16(abs_row3); + + /* Compute number of bits required to represent each coefficient. */ + uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16), + vmovn_u16(vreinterpretq_u16_s16(row0_lz))); + uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16), + vmovn_u16(vreinterpretq_u16_s16(row1_lz))); + uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16), + vmovn_u16(vreinterpretq_u16_s16(row2_lz))); + uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16), + vmovn_u16(vreinterpretq_u16_s16(row3_lz))); + + vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits); + vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits); + vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits); + vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits); + + uint16x8_t row0_mask = + vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)), + vnegq_s16(row0_lz)); + uint16x8_t row1_mask = + vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)), + vnegq_s16(row1_lz)); + uint16x8_t row2_mask = + vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)), + vnegq_s16(row2_lz)); + uint16x8_t row3_mask = + vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)), + vnegq_s16(row3_lz)); + + uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask); + uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), 
row1_mask); + uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask); + uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask); + + /* Store diff values for rows 0, 1, 2, and 3. */ + vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff); + vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff); + vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff); + vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff); + + /* Load last four rows of coefficients from DCT block in zig-zag order. */ + int16x8_t row4 = vld1q_dup_s16(block + 35); + row4 = vld1q_lane_s16(block + 42, row4, 1); + row4 = vld1q_lane_s16(block + 49, row4, 2); + row4 = vld1q_lane_s16(block + 56, row4, 3); + row4 = vld1q_lane_s16(block + 57, row4, 4); + row4 = vld1q_lane_s16(block + 50, row4, 5); + row4 = vld1q_lane_s16(block + 43, row4, 6); + row4 = vld1q_lane_s16(block + 36, row4, 7); + + int16x8_t row5 = vld1q_dup_s16(block + 29); + row5 = vld1q_lane_s16(block + 22, row5, 1); + row5 = vld1q_lane_s16(block + 15, row5, 2); + row5 = vld1q_lane_s16(block + 23, row5, 3); + row5 = vld1q_lane_s16(block + 30, row5, 4); + row5 = vld1q_lane_s16(block + 37, row5, 5); + row5 = vld1q_lane_s16(block + 44, row5, 6); + row5 = vld1q_lane_s16(block + 51, row5, 7); + + int16x8_t row6 = vld1q_dup_s16(block + 58); + row6 = vld1q_lane_s16(block + 59, row6, 1); + row6 = vld1q_lane_s16(block + 52, row6, 2); + row6 = vld1q_lane_s16(block + 45, row6, 3); + row6 = vld1q_lane_s16(block + 38, row6, 4); + row6 = vld1q_lane_s16(block + 31, row6, 5); + row6 = vld1q_lane_s16(block + 39, row6, 6); + row6 = vld1q_lane_s16(block + 46, row6, 7); + + int16x8_t row7 = vld1q_dup_s16(block + 53); + row7 = vld1q_lane_s16(block + 60, row7, 1); + row7 = vld1q_lane_s16(block + 61, row7, 2); + row7 = vld1q_lane_s16(block + 54, row7, 3); + row7 = vld1q_lane_s16(block + 47, row7, 4); + row7 = vld1q_lane_s16(block + 55, row7, 5); + row7 = vld1q_lane_s16(block + 62, row7, 6); + row7 = vld1q_lane_s16(block + 63, row7, 7); + + int16x8_t 
abs_row4 = vabsq_s16(row4); + int16x8_t abs_row5 = vabsq_s16(row5); + int16x8_t abs_row6 = vabsq_s16(row6); + int16x8_t abs_row7 = vabsq_s16(row7); + + int16x8_t row4_lz = vclzq_s16(abs_row4); + int16x8_t row5_lz = vclzq_s16(abs_row5); + int16x8_t row6_lz = vclzq_s16(abs_row6); + int16x8_t row7_lz = vclzq_s16(abs_row7); + + /* Compute number of bits required to represent each coefficient. */ + uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16), + vmovn_u16(vreinterpretq_u16_s16(row4_lz))); + uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16), + vmovn_u16(vreinterpretq_u16_s16(row5_lz))); + uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16), + vmovn_u16(vreinterpretq_u16_s16(row6_lz))); + uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16), + vmovn_u16(vreinterpretq_u16_s16(row7_lz))); + + vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits); + vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits); + vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits); + vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits); + + uint16x8_t row4_mask = + vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)), + vnegq_s16(row4_lz)); + uint16x8_t row5_mask = + vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)), + vnegq_s16(row5_lz)); + uint16x8_t row6_mask = + vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)), + vnegq_s16(row6_lz)); + uint16x8_t row7_mask = + vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)), + vnegq_s16(row7_lz)); + + uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask); + uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask); + uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask); + uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask); + + /* Store diff values for rows 4, 5, 6, and 7. 
*/ + vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff); + vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff); + vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff); + vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff); + + /* Construct bitmap to accelerate encoding of AC coefficients. A set bit + * means that the corresponding coefficient != 0. + */ + uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0)); + uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0)); + uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0)); + uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0)); + uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0)); + uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0)); + uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0)); + uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0)); + + /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ + const uint8x8_t bitmap_mask = + vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080)); + + row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask); + row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask); + row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask); + row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask); + row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask); + row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask); + row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask); + row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask); + + uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0); + uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0); + uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0); + uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0); + uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10); + uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54); + uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210); + + /* Shift left to remove DC bit. 
*/ + bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1)); + /* Move bitmap to 32-bit scalar registers. */ + uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1); + uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0); + + /* Set up state and bit buffer for output bitstream. */ + working_state *state_ptr = (working_state *)state; + int free_bits = state_ptr->cur.free_bits; + size_t put_buffer = state_ptr->cur.put_buffer; + + /* Encode DC coefficient. */ + + unsigned int nbits = block_nbits[0]; + /* Emit Huffman-coded symbol and additional diff bits. */ + unsigned int diff = block_diff[0]; + PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff) + + /* Encode AC coefficients. */ + + unsigned int r = 0; /* r = run length of zeros */ + unsigned int i = 1; /* i = number of coefficients encoded */ + /* Code and size information for a run length of 16 zero coefficients */ + const unsigned int code_0xf0 = actbl->ehufco[0xf0]; + const unsigned int size_0xf0 = actbl->ehufsi[0xf0]; + + while (bitmap_1_32 != 0) { + r = BUILTIN_CLZ(bitmap_1_32); + i += r; + bitmap_1_32 <<= r; + nbits = block_nbits[i]; + diff = block_diff[i]; + while (r > 15) { + /* If run length > 15, emit special run-length-16 codes. */ + PUT_BITS(code_0xf0, size_0xf0) + r -= 16; + } + /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */ + unsigned int rs = (r << 4) + nbits; + PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff) + i++; + bitmap_1_32 <<= 1; + } + + r = 33 - i; + i = 33; + + while (bitmap_33_63 != 0) { + unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63); + r += leading_zeros; + i += leading_zeros; + bitmap_33_63 <<= leading_zeros; + nbits = block_nbits[i]; + diff = block_diff[i]; + while (r > 15) { + /* If run length > 15, emit special run-length-16 codes. */ + PUT_BITS(code_0xf0, size_0xf0) + r -= 16; + } + /* Emit Huffman symbol for run length / number of bits. 
(F.1.2.2.1) */ + unsigned int rs = (r << 4) + nbits; + PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff) + r = 0; + i++; + bitmap_33_63 <<= 1; + } + + /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code. + * The value of RS for the EOB code is 0. + */ + if (i != 64) { + PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]) + } + + state_ptr->cur.put_buffer = put_buffer; + state_ptr->cur.free_bits = free_bits; + + return buffer; +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c new file mode 100644 index 0000000000..e3adf23d50 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c @@ -0,0 +1,980 @@ +/* + * jsimd_arm.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2019, Google LLC. + * Copyright (C) 2020, Arm Limited. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 32-bit Arm architecture. 
+ */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; + +#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_feature(char *buffer, char *feature) +{ + char *p; + + if (*feature == 0) + return 0; + if (strncmp(buffer, "Features", 8) != 0) + return 0; + buffer += 8; + while (isspace(*buffer)) + buffer++; + + /* Check if 'feature' is present in the buffer as a separate word */ + while ((p = strstr(buffer, feature))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(feature); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo(int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + + simd_support = 0; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "neon")) + simd_support |= JSIMD_NEON; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment.
+ */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char env[2] = { 0 }; +#endif +#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__ARM_NEON__) + simd_support |= JSIMD_NEON; +#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + /* We still have a chance to use Neon regardless of globally used + * -mcpu/-mfpu options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux/android */ + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#endif + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1")) + simd_support = JSIMD_NEON; + if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1")) + simd_support = 0; + if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1")) + simd_huffman = 0; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if 
((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_extrgb_ycc_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_extrgbx_ycc_convert_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_extbgr_ycc_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_extbgrx_ycc_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_extxbgr_ycc_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_extxrgb_ycc_convert_neon; + break; + default: + neonfct = jsimd_extrgb_ycc_convert_neon; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_extrgb_gray_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_extrgbx_gray_convert_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_extbgr_gray_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_extbgrx_gray_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = 
jsimd_extxbgr_gray_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_extxrgb_gray_convert_neon; + break; + default: + neonfct = jsimd_extrgb_gray_convert_neon; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_ycc_extrgb_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_ycc_extrgbx_convert_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_ycc_extbgr_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_ycc_extbgrx_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_ycc_extxbgr_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_ycc_extxrgb_convert_neon; + break; + default: + neonfct = jsimd_ycc_extrgb_convert_neon; + break; + } + + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) 
+ return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + 
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{ /* 1 if the config checks below pass and simd_support has JSIMD_NEON */
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* h1v2 fancy upsampling: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Forwards to the Neon h2v2 fancy upsampler; performs no checks itself. */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+/* Forwards to the Neon h2v1 fancy upsampler; performs no checks itself. */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+/* Forwards to the Neon h1v2 fancy upsampler; performs no checks itself. */
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+/* h2v2 merged upsampling: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* h2v1 merged upsampling: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Calls the Neon h2v2 merged upsampler chosen by cinfo->out_color_space. */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+    break;
+  default: /* every other colorspace value uses the extrgb routine */
+    neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+/* Calls the Neon h2v1 merged upsampler chosen by cinfo->out_color_space. */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+    break;
+  default: /* every other colorspace value uses the extrgb routine */
+    neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+/* Integer sample conversion: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Always 0: jsimd_convsamp_float() below is an empty stub. */
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+/* Forwards to jsimd_convsamp_neon() with the arguments unchanged. */
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+/* Empty stub; jsimd_can_convsamp_float() above returns 0. */
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+/* Slow-integer forward DCT: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Fast-integer forward DCT: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Always 0: jsimd_fdct_float() below is an empty stub. */
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+/* Forwards to jsimd_fdct_islow_neon(). */
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+/* Forwards to jsimd_fdct_ifast_neon(). */
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+/* Empty stub; jsimd_can_fdct_float() above returns 0. */
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+/* Integer quantization: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Always 0: jsimd_quantize_float() below is an empty stub. */
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+/* Forwards to jsimd_quantize_neon() with the arguments unchanged. */
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+/* Empty stub; jsimd_can_quantize_float() above returns 0. */
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+/* 2x2 reduced IDCT: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* 4x4 reduced IDCT: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Forwards to jsimd_idct_2x2_neon(); cinfo is not used here. */
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+/* Forwards to jsimd_idct_4x4_neon(); cinfo is not used here. */
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+/* Slow-integer IDCT: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Fast-integer IDCT: as above, and also requires IFAST_SCALE_BITS == 2. */
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Always 0: jsimd_idct_float() below is an empty stub. */
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+/* Forwards to jsimd_idct_islow_neon(); cinfo is not used here. */
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+/* Forwards to jsimd_idct_ifast_neon(); cinfo is not used here. */
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+/* Empty stub; jsimd_can_idct_float() above returns 0. */
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+/* Huffman block encoder: needs JSIMD_NEON and a non-zero simd_huffman. */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON && simd_huffman)
+    return 1;
+
+  return 0;
+}
+/* Forwards to the Neon encoder and returns its JOCTET * result. */
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+/* AC-first prepare: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Forwards to jsimd_encode_mcu_AC_first_prepare_neon(), arguments unchanged. */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+/* AC-refine prepare: 1 if config checks pass and JSIMD_NEON is set. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+/* Forwards to the Neon routine and returns its int result unchanged. */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start, Sl,
+                                                 Al, absvalues, bits);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd_neon.S b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 0000000000..7e1e2b1451
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka
+ * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.syntax unified
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+    .private_extern _\fname  /* Apple: the symbol name gets a leading underscore */
+    .globl _\fname
+_\fname:
+#else
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname  /* ELF only: hidden visibility and %function symbol type */
+    .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE  128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+/* Each FIX_x below is x * 2^13 rounded to an integer (0.298631336 * 8192 = 2446.39). */
+#define FIX_0_298631336  (2446)
+#define FIX_0_390180644  (3196)
+#define FIX_0_541196100  (4433)
+#define FIX_0_765366865  (6270)
+#define FIX_0_899976223  (7373)
+#define FIX_1_175875602  (9633)
+#define FIX_1_501321110  (12299)
+#define FIX_1_847759065  (15137)
+#define FIX_1_961570560  (16069)
+#define FIX_2_053119869  (16819)
+#define FIX_2_562915447  (20995)
+#define FIX_3_072711026  (25172)
+/* Pre-combined sums and differences of the constants above. */
+#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+  JLONG   q1, q2, q3, q4, q5, q6, q7; \
+  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
+  \
+  /* 1-D iDCT input data */ \
+  row0 = xrow0; \
+  row1 = xrow1; \
+  row2 = xrow2; \
+  row3 = xrow3; \
+  row4 = xrow4; \
+  row5 = xrow5; \
+  row6 = xrow6; \
+  row7 = xrow7; \
+  \
+  q5 = row7 + row3; \
+  q4 = row5 + row1; \
+  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+       MULTIPLY(q4, FIX_1_175875602); \
+  q7 = MULTIPLY(q5, FIX_1_175875602) + \
+       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+  q2 = MULTIPLY(row2, FIX_0_541196100) + \
+       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+  q4 = q6; \
+  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+  /* now we can use q1 (reloadable constants have been used up) */ \
+  q1 = q3 + q2; \
+  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+        MULTIPLY(row1, -FIX_0_899976223); \
+  q5 = q7; \
+  q1 = q1 + q6; \
+  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+  \
+  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+  tmp11_plus_tmp2 = q1; \
+  row1 = 0; \
+  \
+  q1 = q1 - q6; \
+  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+        MULTIPLY(row3, -FIX_2_562915447); \
+  q1 = q1 - q6; \
+  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+       MULTIPLY(row6, FIX_0_541196100); \
+  q3 = q3 - q2; \
+  \
+  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+  tmp11_minus_tmp2 = q1; \
+  \
+  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+  q2 = q1 + q6; \
+  q1 = q1 - q6; \
+  \
+  /* pick up the results */ \
+  tmp0  = q4; \
+  tmp1  = q5; \
+  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+  tmp3  = q7; \
+  tmp10 = q2; \
+  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+  tmp12 = q3; \
+  tmp13 = q1; \
+}
+/* Lane aliases: where each constant of the table below sits once loaded into d0-d2. */
+#define XFIX_0_899976223                    d0[0]
+#define XFIX_0_541196100                    d0[1]
+#define XFIX_2_562915447                    d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
+#define XFIX_1_175875602                    d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+  .short FIX_0_899976223                    /* d0[0] */
+  .short FIX_0_541196100                    /* d0[1] */
+  .short FIX_2_562915447                    /* d0[2] */
+  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+  .short FIX_1_175875602                    /* d1[3] */
+  /* reloadable constants */
+  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0  /* TMP1-TMP3 reuse r0-r2, the registers of the first three arguments */
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    ROW0L           .req d16
+    ROW0R           .req d17
+    ROW1L           .req d18
+    ROW1R           .req d19
+    ROW2L           .req d20
+    ROW2R           .req d21
+    ROW3L           .req d22
+    ROW3R           .req d23
+    ROW4L           .req d24
+    ROW4R           .req d25
+    ROW5L           .req d26
+    ROW5R           .req d27
+    ROW6L           .req d28
+    ROW6R           .req d29
+    ROW7L           .req d30
+    ROW7R           .req d31
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_islow_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]  /* no writeback: COEF_BLOCK stays at block start + 96 bytes */
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
+    add             ip, ip, #16  /* ip now points at the four "reloadable" constants */
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d15}  /* save Neon registers */
+    /* 1-D IDCT, pass 1, left 4x8 half */
+    vadd.s16        d4, ROW7L, ROW3L
+    vadd.s16        d5, ROW5L, ROW1L
+    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d5, XFIX_1_175875602
+    vmull.s16       q7, d4, XFIX_1_175875602
+      /* Check for the zero coefficients in the right 4x8 half */
+      push            {r4, r5}
+    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW4L
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]  /* r4:r5 = row 1, columns 4-7 (16-bit coefficients) */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+      orr             r0, r4, r5
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+      orr             r0, r0, r4
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q2
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      orr             r0, r0, r4
+    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1L, q1, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+      orr             r0, r0, r4
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+      orr             r0, r0, r5
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    vmlal.s16       q6, ROW6L, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+      orr             r0, r0, r4
+    vrshrn.s32      ROW6L, q1, #11
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q5
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW4L
+      orr             r0, r0, r4
+    vrshrn.s32      ROW2L, q1, #11
+      orr             r0, r0, r5
+    vrshrn.s32      ROW5L, q3, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+      orr             r0, r0, r4
+    vadd.s32        q2, q5, q6
+      orrs            r0, r0, r5  /* Z set iff rows 1-7, columns 4-7 are all zero; tested by the beq 3f below */
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+      orr             r0, r4, r5  /* r0 = OR of row 0, columns 4-7; flags untouched, r0 is tested at label 3 */
+    vsub.s32        q3, q1, q4
+      pop             {r4, r5}
+    vrshrn.s32      ROW7L, q2, #11
+    vrshrn.s32      ROW3L, q5, #11
+    vrshrn.s32      ROW0L, q6, #11
+    vrshrn.s32      ROW4L, q3, #11
+
+      beq             3f  /* Go to do some special handling for the sparse
+                             right 4x8 half */
+
+    /* 1-D IDCT, pass 1, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vadd.s16        d10, ROW7R, ROW3R
+    vadd.s16        d8, ROW5R, ROW1R
+      /* Transpose left 4x8 half */
+      vtrn.16         ROW6L, ROW7L
+    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d8, XFIX_1_175875602
+      vtrn.16         ROW2L, ROW3L
+    vmull.s16       q7, d10, XFIX_1_175875602
+    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
+      vtrn.16         ROW0L, ROW1L
+    vsubl.s16       q3, ROW0R, ROW4R
+    vmull.s16       q2, ROW2R, XFIX_0_541196100
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+      vtrn.16         ROW4L, ROW5L
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+      vtrn.32         ROW1L, ROW3L
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
+      vtrn.32         ROW4L, ROW6L
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      vtrn.32         ROW0L, ROW2L
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1R, q1, #11
+      vtrn.32         ROW5L, ROW7L
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vrshrn.s32      ROW6R, q1, #11
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0R, ROW4R
+    vrshrn.s32      ROW2R, q1, #11
+    vrshrn.s32      ROW5R, q3, #11
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vrshrn.s32      ROW7R, q2, #11
+    vrshrn.s32      ROW3R, q5, #11
+    vrshrn.s32      ROW0R, q6, #11
+    vrshrn.s32      ROW4R, q3, #11
+    /* Transpose right 4x8 half */
+    vtrn.16         ROW6R, ROW7R
+    vtrn.16         ROW2R, ROW3R
+    vtrn.16         ROW0R, ROW1R
+    vtrn.16         ROW4R, ROW5R
+    vtrn.32         ROW1R, ROW3R
+    vtrn.32         ROW4R, ROW6R
+    vtrn.32         ROW0R, ROW2R
+    vtrn.32         ROW5R, ROW7R
+
+1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vmull.s16       q6, ROW1R, XFIX_1_175875602  /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3R, XFIX_1_175875602  /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW1R, XFIX_2_562915447  /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW3R, XFIX_0_899976223  /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW2R, XFIX_0_541196100  /* ROW6L <-> ROW2R */
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16  /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16  /* ROW5L <-> ROW1R */
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16  /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16  /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vmull.s16       q6, ROW5R, XFIX_1_175875602
+    vmlal.s16       q6, ROW5L, XFIX_1_175875602  /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmull.s16       q7, ROW7R, XFIX_1_175875602
+    vmlal.s16       q7, ROW7L, XFIX_1_175875602  /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vsubl.s16       q3, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW6L, XFIX_0_541196100  /* ROW6L <-> ROW2R */
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223  /* ROW5L <-> ROW1R */
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
+    vshrn.s32       ROW5L, q1, #16  /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447  /* ROW7L <-> ROW3R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW6L, q1, #16  /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16  /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16  /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+
+2:  /* Descale to 8-bit and range limit */
+    vqrshrn.s16     d16, q8, #2
+    vqrshrn.s16     d17, q9, #2
+    vqrshrn.s16     d18, q10, #2
+    vqrshrn.s16     d19, q11, #2
+    vpop            {d8 - d15}  /* restore Neon registers */
+    vqrshrn.s16     d20, q12, #2
+    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+    vtrn.16         q8, q9
+    vqrshrn.s16     d21, q13, #2
+    vqrshrn.s16     d22, q14, #2
+    vmov.u8         q0, #(CENTERJSAMPLE)
+    vqrshrn.s16     d23, q15, #2
+    vtrn.8          d16, d17
+    vtrn.8          d18, d19
+    vadd.u8         q8, q8, q0
+    vadd.u8         q9, q9, q0
+    vtrn.16         q10, q11
+    /* Store results to the output buffer */
+    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+    vst1.8          {d16}, [TMP1]
+    vtrn.8          d20, d21
+    vst1.8          {d17}, [TMP2]
+    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+    vst1.8          {d18}, [TMP1]
+    vadd.u8         q10, q10, q0
+    vst1.8          {d19}, [TMP2]
+    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+    add             TMP3, TMP3, OUTPUT_COL
+    add             TMP4, TMP4, OUTPUT_COL
+    vtrn.8          d22, d23
+    vst1.8          {d20}, [TMP1]
+    vadd.u8         q11, q11, q0
+    vst1.8          {d21}, [TMP2]
+    vst1.8          {d22}, [TMP3]
+    vst1.8          {d23}, [TMP4]
+    bx              lr
+
+3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+    /* Transpose left 4x8 half */
+    vtrn.16         ROW6L, ROW7L
+    vtrn.16         ROW2L, ROW3L
+    vtrn.16         ROW0L, ROW1L
+    vtrn.16         ROW4L, ROW5L
+    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
+    vtrn.32         ROW1L, ROW3L
+    vtrn.32         ROW4L, ROW6L
+    vtrn.32         ROW0L, ROW2L
+    vtrn.32         ROW5L, ROW7L
+
+    cmp             r0, #0
+    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
+                           pass */
+
+    /* Only row 0 is non-zero for the right 4x8 half */
+    vdup.s16        ROW1R, ROW0R[1]
+    vdup.s16        ROW2R, ROW0R[2]
+    vdup.s16        ROW3R, ROW0R[3]
+    vdup.s16        ROW4R, ROW0R[0]
+    vdup.s16        ROW5R, ROW0R[1]
+    vdup.s16        ROW6R, ROW0R[2]
+    vdup.s16        ROW7R, ROW0R[3]
+    vdup.s16        ROW0R, ROW0R[0]
+    b               1b  /* Go to 'normal' second pass */
+
+4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vmull.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vshll.s16       q3, ROW0L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16  /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW0L, #13
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16  /* ROW5L <-> ROW1R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16  /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16  /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vmull.s16       q6, ROW5L, XFIX_1_175875602
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW7L, XFIX_1_175875602
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW6L, XFIX_0_541196100
+    vshll.s16       q3, ROW4L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
+    vshrn.s32       ROW5L, q1, #16  /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW4L, #13
+    vshrn.s32       ROW6L, q1, #16  /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16  /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16  /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+    b               2b  /* Go to epilogue */
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+    .unreq          ROW0L
+    .unreq          ROW0R
+    .unreq          ROW1L
+    .unreq          ROW1R
+    .unreq          ROW2L
+    .unreq          ROW2R
+    .unreq          ROW3L
+    .unreq          ROW3R
+    .unreq          ROW4L
+    .unreq          ROW4R
+    .unreq          ROW5L
+    .unreq          ROW5R
+    .unreq          ROW6L
+    .unreq          ROW6R
+    .unreq          ROW7L
+    .unreq          ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c
+ *
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * But in Arm Neon case some extra additions are required because VQDMULH
+ * instruction can't handle the constants larger than 1. So the expressions
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition. Overall, there are 6 extra additions
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+ */ + +#define XFIX_1_082392200 d0[0] +#define XFIX_1_414213562 d0[1] +#define XFIX_1_847759065 d0[2] +#define XFIX_2_613125930 d0[3] + /* Each table entry below holds only the fractional part of its factor (1.0 or 2.0 is subtracted, i.e. 256 * 128 or 512 * 128); the code adds the integer multiple of the operand back with vadd after each vqdmulh. */ +.balign 16 +jsimd_idct_ifast_neon_consts: + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ + /* Register arguments, per the .req aliases below: r0 = DCT_TABLE, r1 = COEF_BLOCK, r2 = OUTPUT_BUF, r3 = OUTPUT_COL. TMP1-TMP3 reuse r0-r2 once those arguments are no longer needed. */ +asm_function jsimd_idct_ifast_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + /* Load and dequantize coefficients into Neon registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( q8 ) + * 1 | d18 | d19 ( q9 ) + * 2 | d20 | d21 ( q10 ) + * 3 | d22 | d23 ( q11 ) + * 4 | d24 | d25 ( q12 ) + * 5 | d26 | d27 ( q13 ) + * 6 | d28 | d29 ( q14 ) + * 7 | d30 | d31 ( q15 ) + */ + adr ip, jsimd_idct_ifast_neon_consts + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! + vmul.s16 q8, q8, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q9, q9, q1 + vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! + vmul.s16 q10, q10, q2 + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vmul.s16 q11, q11, q3 + vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] /* final coefficient load: no writeback */ + vmul.s16 q12, q12, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q14, q14, q2 + vmul.s16 q13, q13, q1 + vld1.16 {d0}, [ip, :64] /* load constants */ + vmul.s16 q15, q15, q3 + vpush {d8 - d13} /* save Neon registers */ + /* 1-D IDCT, pass 1 */ + vsub.s16 q2, q10, q14 + vadd.s16 q14, q10, q14 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 /* fractional part only; q2 itself is added back below (q10 = q2 + q4) */ + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 /* 2 * q1: integer part of the 2.613125930 multiply, q6 is added below */ + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 + vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vadd.s16 q10, q10, q2 + /* Transpose */ + vtrn.16 q8, q9 + vsub.s16 q11, q12, q1 + vtrn.16 q14, q15 + vadd.s16 q12, q12, q1 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q8, q10 + vtrn.32 q13, q15 + vswp d28, d21 + vswp d26, d19 + /* 1-D IDCT, pass 2 */ + vsub.s16 q2, q10, q14 + vswp d30, d23 + vadd.s16 q14, q10, q14 + vswp d24, d17 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 +
vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vpop {d8 - d13} /* restore Neon registers */ + vadd.s16 q10, q10, q2 + vsub.s16 q11, q12, q1 + vadd.s16 q12, q12, q1 + /* Descale to 8-bit and range limit */ + vmov.u8 q0, #0x80 /* 0x80 in every byte; added after the saturating narrow to map signed 8-bit results to unsigned */ + vqshrn.s16 d16, q8, #5 + vqshrn.s16 d17, q9, #5 + vqshrn.s16 d18, q10, #5 + vqshrn.s16 d19, q11, #5 + vqshrn.s16 d20, q12, #5 + vqshrn.s16 d21, q13, #5 + vqshrn.s16 d22, q14, #5 + vqshrn.s16 d23, q15, #5 + vadd.u8 q8, q8, q0 + vadd.u8 q9, q9, q0 + vadd.u8 q10, q10, q0 + vadd.u8 q11, q11, q0 + /* Transpose the final 8-bit samples */ + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.8 d16, d17 + vtrn.8 d18, d19 + /* Store results to the output buffer */ + ldmia OUTPUT_BUF!, {TMP1, TMP2} /* two row pointers; overwrites r0/r1, whose argument values are no longer used */ + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d16}, [TMP1] + vst1.8 {d17}, [TMP2] + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d18}, [TMP1] + vtrn.8 d20, d21 + vst1.8 {d19}, [TMP2] + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} /* last four row pointers; no writeback because TMP3 is the same register as OUTPUT_BUF (r2) */ + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + vst1.8 {d20}, [TMP1] + vtrn.8 d22, d23 + vst1.8 {d21}, [TMP2] + vst1.8 {d22}, [TMP3] + vst1.8 {d23}, [TMP4] + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + +/*****************************************************************************/ + +/* + * jsimd_extrgb_ycc_convert_neon + * jsimd_extbgr_ycc_convert_neon + * jsimd_extrgbx_ycc_convert_neon + * jsimd_extbgrx_ycc_convert_neon + *
jsimd_extxbgr_ycc_convert_neon + * jsimd_extxrgb_ycc_convert_neon + * + * Colorspace conversion RGB -> YCbCr + */ + /* do_store: write size converted pixels from d20 (Y), d21 (Cb) and d22 (Cr) through the Y, U and V pointers, post-incrementing each. Partial sizes use lanes 0-3 (size 4), lanes 4-5 (size 2) and lane 6 (size 1), so a 4 + 2 + 1 tail never reuses a lane. */ +.macro do_store size + .if \size == 8 + vst1.8 {d20}, [Y]! + vst1.8 {d21}, [U]! + vst1.8 {d22}, [V]! + .elseif \size == 4 + vst1.8 {d20[0]}, [Y]! + vst1.8 {d20[1]}, [Y]! + vst1.8 {d20[2]}, [Y]! + vst1.8 {d20[3]}, [Y]! + vst1.8 {d21[0]}, [U]! + vst1.8 {d21[1]}, [U]! + vst1.8 {d21[2]}, [U]! + vst1.8 {d21[3]}, [U]! + vst1.8 {d22[0]}, [V]! + vst1.8 {d22[1]}, [V]! + vst1.8 {d22[2]}, [V]! + vst1.8 {d22[3]}, [V]! + .elseif \size == 2 + vst1.8 {d20[4]}, [Y]! + vst1.8 {d20[5]}, [Y]! + vst1.8 {d21[4]}, [U]! + vst1.8 {d21[5]}, [U]! + vst1.8 {d22[4]}, [V]! + vst1.8 {d22[5]}, [V]! + .elseif \size == 1 + vst1.8 {d20[6]}, [Y]! + vst1.8 {d21[6]}, [U]! + vst1.8 {d22[6]}, [V]! + .else + .error unsupported macroblock size + .endif +.endm + /* do_load: read size pixels through RGB (post-incremented) and de-interleave them into d10-d12 (bpp 24) or d10-d13 (bpp 32). Partial sizes fill the same lanes that do_store later reads: 0-3, 4-5 and 6. */ +.macro do_load bpp, size + .if \bpp == 24 + .if \size == 8 + vld3.8 {d10, d11, d12}, [RGB]! + pld [RGB, #128] + .elseif \size == 4 + vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! + vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! + vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! + vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! + .elseif \size == 2 + vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! + vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! + .elseif \size == 1 + vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + vld4.8 {d10, d11, d12, d13}, [RGB]! + pld [RGB, #128] + .elseif \size == 4 + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! + .elseif \size == 2 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! + .elseif \size == 1 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+ .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + /* Generates one jsimd_COLORID_ycc_convert_neon function plus its constant table. bpp is forwarded to do_load; r_offs, g_offs and b_offs are single digits appended to d1 to pick the source register (d10-d13) of each colour component. */ +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2-stage pipelined RGB->YCbCr conversion + */ + +.macro do_rgb_to_yuv_stage1 + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vrev64.32 q9, q1 /* seed the Cb accumulators (q9, q13) from the bias constants in q1 */ + vrev64.32 q13, q1 + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 /* seed the Cr accumulators (q14, q15) the same way */ + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.macro do_rgb_to_yuv_stage2 + vrshrn.u32 d20, q7, #16 /* Y: rounding narrow */ + vrshrn.u32 d21, q8, #16 + vshrn.u32 d22, q9, #16 /* Cb/Cr: truncating narrow; the bias was preloaded from q1 in stage 1 */ + vshrn.u32 d23, q13, #16 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + vmovn.u16 d20, q10 /* d20 = y */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovn.u16 d22, q12 /* d22 = v */ +.endm + +.macro do_rgb_to_yuv + do_rgb_to_yuv_stage1 + do_rgb_to_yuv_stage2 +.endm + +.macro do_rgb_to_yuv_stage2_store_load_stage1 + vrshrn.u32 d20, q7, #16 + vrshrn.u32 d21, q8, #16 + vshrn.u32 d22, q9, #16 + vrev64.32 q9, q1 + vshrn.u32 d23, q13, #16 + vrev64.32 q13, q1 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + do_load \bpp, 8 + vmovn.u16 d20, q10 /* d20 = y */ + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovn.u16 d22, q12 /* d22 = v */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vst1.8 {d20}, [Y]!
+ vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vst1.8 {d21}, [U]! + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vst1.8 {d22}, [V]! + vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.balign 16 +jsimd_\colorid\()_ycc_neon_consts: + .short 19595, 38470, 7471, 11059 /* d0: 0.29900, 0.58700, 0.11400, 0.16874 scaled by 2^16 */ + .short 21709, 32768, 27439, 5329 /* d1: 0.33126, 0.50000, 0.41869, 0.08131 scaled by 2^16 */ + .short 32767, 128, 32767, 128 /* d2: each halfword pair reads as the 32-bit value (128 << 16) + 32767 on a little-endian target */ + .short 32767, 128, 32767, 128 /* d3: same bias, so q1 holds it in all four 32-bit lanes */ + +asm_function jsimd_\colorid\()_ycc_convert_neon + OUTPUT_WIDTH .req r0 + INPUT_BUF .req r1 + OUTPUT_BUF .req r2 + OUTPUT_ROW .req r3 + NUM_ROWS .req r4 + + OUTPUT_BUF0 .req r5 + OUTPUT_BUF1 .req r6 + OUTPUT_BUF2 .req OUTPUT_BUF + + RGB .req r7 + Y .req r8 + U .req r9 + V .req r10 + N .req ip + + /* Load constants to d0, d1, d2, d3 */ + adr ip, jsimd_\colorid\()_ycc_neon_consts + vld1.16 {d0, d1, d2, d3}, [ip, :128] + + /* Save Arm registers and handle input arguments */ + push {r4, r5, r6, r7, r8, r9, r10, lr} + ldr NUM_ROWS, [sp, #(4 * 8)] /* stack argument, now located above the eight registers just pushed */ + ldr OUTPUT_BUF0, [OUTPUT_BUF] + ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] + ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] /* loaded last: OUTPUT_BUF2 is the same register as OUTPUT_BUF */ + .unreq OUTPUT_BUF + + /* Save Neon registers */ + vpush {d8 - d15} + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + blt 9f +0: + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] + mov N, OUTPUT_WIDTH + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] + add OUTPUT_ROW, OUTPUT_ROW, #1 + ldr RGB, [INPUT_BUF], #4 + + /* Inner loop over pixels */ + subs N, N, #8 + blt 3f + do_load \bpp, 8 + do_rgb_to_yuv_stage1 + subs N, N, #8 + blt 2f +1: + do_rgb_to_yuv_stage2_store_load_stage1 + subs N, N, #8 + bge 1b +2: + do_rgb_to_yuv_stage2 + do_store 8 + tst N, #7 /* N is negative here, but its low three bits still equal the number of leftover pixels */ + beq 8f +3: + tst N, #4 + beq 3f /* 3f is the next 3: label below, not the one above */ + do_load \bpp, 4 +3: + tst N, #2 + beq 4f +
do_load \bpp, 2 +4: + tst N, #1 + beq 5f + do_load \bpp, 1 +5: + do_rgb_to_yuv + tst N, #4 + beq 6f + do_store 4 +6: + tst N, #2 + beq 7f + do_store 2 +7: + tst N, #1 + beq 8f + do_store 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + bgt 0b +9: + /* Restore all registers and return */ + vpop {d8 - d15} + pop {r4, r5, r6, r7, r8, r9, r10, pc} /* the saved lr is popped straight into pc, which performs the return */ + + .unreq OUTPUT_WIDTH + .unreq OUTPUT_ROW + .unreq INPUT_BUF + .unreq NUM_ROWS + .unreq OUTPUT_BUF0 + .unreq OUTPUT_BUF1 + .unreq OUTPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_rgb_to_yuv +.purgem do_rgb_to_yuv_stage1 +.purgem do_rgb_to_yuv_stage2 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 + +.purgem do_load +.purgem do_store diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jccolext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jccolext-neon.c new file mode 100644 index 0000000000..37130c225e --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jccolext-neon.c @@ -0,0 +1,316 @@ +/* + * jccolext-neon.c - colorspace conversion (64-bit Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1.
The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jccolor-neon.c */ + + +/* RGB -> YCbCr conversion is defined by the following equations: + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 + * + * Avoid floating point arithmetic by using shifted integer constants: + * 0.29899597 = 19595 * 2^-16 + * 0.58700561 = 38470 * 2^-16 + * 0.11399841 = 7471 * 2^-16 + * 0.16874695 = 11059 * 2^-16 + * 0.33125305 = 21709 * 2^-16 + * 0.50000000 = 32768 * 2^-16 + * 0.41868592 = 27439 * 2^-16 + * 0.08131409 = 5329 * 2^-16 + * These constants are defined in jccolor-neon.c + * + * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively + * rounds up or down the result via integer truncation. + */ + +void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + /* Pointer to RGB(X/A) input data */ + JSAMPROW inptr; + /* Pointers to Y, Cb, and Cr output data */ + JSAMPROW outptr0, outptr1, outptr2; + /* Allocate temporary buffer for final (image_width % 16) pixels in row. */ + ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE]; + + /* Set up conversion constants. 
*/ + const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts); + const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767); + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + int cols_remaining = image_width; + for (; cols_remaining >= 16; cols_remaining -= 16) { + +#if RGB_PIXELSIZE == 4 + uint8x16x4_t input_pixels = vld4q_u8(inptr); +#else + uint8x16x3_t input_pixels = vld3q_u8(inptr); +#endif + uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE])); + uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE])); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0); + y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1); + y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2); + uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0); + y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1); + y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2); + uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0); + y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1); + y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2); + uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0); + y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1); + y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2); + + /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */ + uint32x4_t cb_ll = scaled_128_5; + cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3); + cb_ll = 
vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4); + cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5); + uint32x4_t cb_lh = scaled_128_5; + cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3); + cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4); + cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5); + uint32x4_t cb_hl = scaled_128_5; + cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3); + cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4); + cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5); + uint32x4_t cb_hh = scaled_128_5; + cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3); + cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4); + cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5); + + /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */ + uint32x4_t cr_ll = scaled_128_5; + cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5); + cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6); + cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7); + uint32x4_t cr_lh = scaled_128_5; + cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5); + cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6); + cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7); + uint32x4_t cr_hl = scaled_128_5; + cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5); + cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6); + cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7); + uint32x4_t cr_hh = scaled_128_5; + cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5); + cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6); + cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. 
*/ + uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16), + vrshrn_n_u32(y_lh, 16)); + uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16), + vrshrn_n_u32(y_hh, 16)); + /* Descale Cb values (right shift) and narrow to 16-bit. */ + uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16), + vshrn_n_u32(cb_lh, 16)); + uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16), + vshrn_n_u32(cb_hh, 16)); + /* Descale Cr values (right shift) and narrow to 16-bit. */ + uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16), + vshrn_n_u32(cr_lh, 16)); + uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16), + vshrn_n_u32(cr_hh, 16)); + /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer + * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. + */ + vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h))); + vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h))); + vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h))); + + /* Increment pointers. */ + inptr += (16 * RGB_PIXELSIZE); + outptr0 += 16; + outptr1 += 16; + outptr2 += 16; + } + + if (cols_remaining > 8) { + /* To prevent buffer overread by the vector load instructions, the last + * (image_width % 16) columns of data are first memcopied to a temporary + * buffer large enough to accommodate the vector load. 
+ */ + memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); + inptr = tmp_buf; + +#if RGB_PIXELSIZE == 4 + uint8x16x4_t input_pixels = vld4q_u8(inptr); +#else + uint8x16x3_t input_pixels = vld3q_u8(inptr); +#endif + uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE])); + uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE])); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0); + y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1); + y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2); + uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0); + y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1); + y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2); + uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0); + y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1); + y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2); + uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0); + y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1); + y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2); + + /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */ + uint32x4_t cb_ll = scaled_128_5; + cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3); + cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4); + cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5); + uint32x4_t cb_lh = scaled_128_5; + cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3); + cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4); + cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5); 
+ uint32x4_t cb_hl = scaled_128_5; + cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3); + cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4); + cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5); + uint32x4_t cb_hh = scaled_128_5; + cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3); + cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4); + cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5); + + /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */ + uint32x4_t cr_ll = scaled_128_5; + cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5); + cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6); + cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7); + uint32x4_t cr_lh = scaled_128_5; + cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5); + cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6); + cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7); + uint32x4_t cr_hl = scaled_128_5; + cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5); + cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6); + cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7); + uint32x4_t cr_hh = scaled_128_5; + cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5); + cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6); + cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. */ + uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16), + vrshrn_n_u32(y_lh, 16)); + uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16), + vrshrn_n_u32(y_hh, 16)); + /* Descale Cb values (right shift) and narrow to 16-bit. */ + uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16), + vshrn_n_u32(cb_lh, 16)); + uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16), + vshrn_n_u32(cb_hh, 16)); + /* Descale Cr values (right shift) and narrow to 16-bit. 
*/ + uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16), + vshrn_n_u32(cr_lh, 16)); + uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16), + vshrn_n_u32(cr_hh, 16)); + /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer + * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. + */ + vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h))); + vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h))); + vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h))); + + } else if (cols_remaining > 0) { + /* To prevent buffer overread by the vector load instructions, the last + * (image_width % 8) columns of data are first memcopied to a temporary + * buffer large enough to accommodate the vector load. + */ + memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); + inptr = tmp_buf; + +#if RGB_PIXELSIZE == 4 + uint8x8x4_t input_pixels = vld4_u8(inptr); +#else + uint8x8x3_t input_pixels = vld3_u8(inptr); +#endif + uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]); + uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]); + uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0); + y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1); + y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2); + uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0); + y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1); + y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2); + + /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */ + uint32x4_t cb_l = scaled_128_5; + cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3); + cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4); + cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5); + uint32x4_t cb_h = scaled_128_5; + cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3); + cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 
4); + cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5); + + /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */ + uint32x4_t cr_l = scaled_128_5; + cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5); + cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6); + cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7); + uint32x4_t cr_h = scaled_128_5; + cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5); + cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6); + cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. */ + uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16), + vrshrn_n_u32(y_h, 16)); + /* Descale Cb values (right shift) and narrow to 16-bit. */ + uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16), + vshrn_n_u32(cb_h, 16)); + /* Descale Cr values (right shift) and narrow to 16-bit. */ + uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16), + vshrn_n_u32(cr_h, 16)); + /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer + * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. + */ + vst1_u8(outptr0, vmovn_u16(y_u16)); + vst1_u8(outptr1, vmovn_u16(cb_u16)); + vst1_u8(outptr2, vmovn_u16(cr_u16)); + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jchuff-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jchuff-neon.c new file mode 100644 index 0000000000..607a116070 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jchuff-neon.c @@ -0,0 +1,411 @@ +/* + * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon) + * + * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * NOTE: All referenced figures are from + * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" +#include "../align.h" +#include "../jchuff.h" +#include "neon-compat.h" + +#include <limits.h> + +#include <arm_neon.h> + + +ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = { + 0, 1, 2, 3, 16, 17, 32, 33, + 18, 19, 4, 5, 6, 7, 20, 21, + 34, 35, 48, 49, 255, 255, 50, 51, + 36, 37, 22, 23, 8, 9, 10, 11, + 255, 255, 6, 7, 20, 21, 34, 35, + 48, 49, 255, 255, 50, 51, 36, 37, + 54, 55, 40, 41, 26, 27, 12, 13, + 14, 15, 28, 29, 42, 43, 56, 57, + 6, 7, 20, 21, 34, 35, 48, 49, + 50, 51, 36, 37, 22, 23, 8, 9, + 26, 27, 12, 13, 255, 255, 14, 15, + 28, 29, 42, 43, 56, 57, 255, 255, + 52, 53, 54, 55, 40, 41, 26, 27, + 12, 13, 255, 255, 14, 15, 28, 29, + 26, 27, 40, 41, 42, 43, 28, 29, + 14, 15, 30, 31, 44, 45, 46, 47 +}; + +/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned + * address warning because the macro sometimes writes a 64-bit value to a + * non-64-bit-aligned address.
That behavior is technically undefined per + * the C specification, but it is supported by the AArch64 architecture and + * compilers. + */ +#if defined(__has_feature) +#if __has_feature(undefined_behavior_sanitizer) +__attribute__((no_sanitize("alignment"))) +#endif +#endif +JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, + JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + uint16_t block_diff[DCTSIZE2]; + + /* Load lookup table indices for rows of zig-zag ordering. */ +#ifdef HAVE_VLD1Q_U8_X4 + const uint8x16x4_t idx_rows_0123 = + vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE); + const uint8x16x4_t idx_rows_4567 = + vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE); +#else + /* GCC does not currently support the vld1q_u8_x4()/vld1q_s8_x4() intrinsics. */ + const uint8x16x4_t idx_rows_0123 = { { + vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE), + vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE), + vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE), + vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE) + } }; + const uint8x16x4_t idx_rows_4567 = { { + vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE), + vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE), + vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE), + vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE) + } }; +#endif + + /* Load 8x8 block of DCT coefficients.
*/ +#ifdef HAVE_VLD1Q_U8_X4 + const int8x16x4_t tbl_rows_0123 = + vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE)); + const int8x16x4_t tbl_rows_4567 = + vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE)); +#else + const int8x16x4_t tbl_rows_0123 = { { + vld1q_s8((int8_t *)(block + 0 * DCTSIZE)), + vld1q_s8((int8_t *)(block + 1 * DCTSIZE)), + vld1q_s8((int8_t *)(block + 2 * DCTSIZE)), + vld1q_s8((int8_t *)(block + 3 * DCTSIZE)) + } }; + const int8x16x4_t tbl_rows_4567 = { { + vld1q_s8((int8_t *)(block + 4 * DCTSIZE)), + vld1q_s8((int8_t *)(block + 5 * DCTSIZE)), + vld1q_s8((int8_t *)(block + 6 * DCTSIZE)), + vld1q_s8((int8_t *)(block + 7 * DCTSIZE)) + } }; +#endif + + /* Initialise extra lookup tables. */ + const int8x16x4_t tbl_rows_2345 = { { + tbl_rows_0123.val[2], tbl_rows_0123.val[3], + tbl_rows_4567.val[0], tbl_rows_4567.val[1] + } }; + const int8x16x3_t tbl_rows_567 = + { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } }; + + /* Shuffle coefficients into zig-zag order. */ + int16x8_t row0 = + vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0])); + int16x8_t row1 = + vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1])); + int16x8_t row2 = + vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2])); + int16x8_t row3 = + vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3])); + int16x8_t row4 = + vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0])); + int16x8_t row5 = + vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1])); + int16x8_t row6 = + vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2])); + int16x8_t row7 = + vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3])); + + /* Compute DC coefficient difference value (F.1.1.5.1). */ + row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0); + /* Initialize AC coefficient lanes not reachable by lookup tables. 
*/ + row1 = + vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]), + 0), row1, 2); + row2 = + vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]), + 4), row2, 0); + row2 = + vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]), + 0), row2, 5); + row5 = + vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]), + 7), row5, 2); + row5 = + vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]), + 3), row5, 7); + row6 = + vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]), + 7), row6, 5); + + /* DCT block is now in zig-zag order; start Huffman encoding process. */ + + /* Construct bitmap to accelerate encoding of AC coefficients. A set bit + * means that the corresponding coefficient != 0. + */ + uint16x8_t row0_ne_0 = vtstq_s16(row0, row0); + uint16x8_t row1_ne_0 = vtstq_s16(row1, row1); + uint16x8_t row2_ne_0 = vtstq_s16(row2, row2); + uint16x8_t row3_ne_0 = vtstq_s16(row3, row3); + uint16x8_t row4_ne_0 = vtstq_s16(row4, row4); + uint16x8_t row5_ne_0 = vtstq_s16(row5, row5); + uint16x8_t row6_ne_0 = vtstq_s16(row6, row6); + uint16x8_t row7_ne_0 = vtstq_s16(row7, row7); + + uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0), + vreinterpretq_u8_u16(row0_ne_0)); + uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0), + vreinterpretq_u8_u16(row2_ne_0)); + uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0), + vreinterpretq_u8_u16(row4_ne_0)); + uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0), + vreinterpretq_u8_u16(row6_ne_0)); + + /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ + const uint8x16_t bitmap_mask = + vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080)); + + uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask); + uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask); + uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask); + uint8x16_t 
bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask); + + uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10); + uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54); + uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654, + bitmap_rows_3210); + uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210), + vget_high_u8(bitmap_rows_76543210)); + + /* Shift left to remove DC bit. */ + bitmap_all = + vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1)); + /* Count bits set (number of non-zero coefficients) in bitmap. */ + unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all)); + /* Move bitmap to 64-bit scalar register. */ + uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); + + /* Set up state and bit buffer for output bitstream. */ + working_state *state_ptr = (working_state *)state; + int free_bits = state_ptr->cur.free_bits; + size_t put_buffer = state_ptr->cur.put_buffer; + + /* Encode DC coefficient. */ + + /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */ + int16x8_t abs_row0 = vabsq_s16(row0); + int16x8_t row0_lz = vclzq_s16(abs_row0); + uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz)); + uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask); + /* Find nbits required to specify sign and amplitude of coefficient. */ + unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0); + unsigned int nbits = 16 - lz; + /* Emit Huffman-coded symbol and additional diff bits. */ + unsigned int diff = vgetq_lane_u16(row0_diff, 0); + PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff) + + /* Encode AC coefficients. 
*/ + + unsigned int r = 0; /* r = run length of zeros */ + unsigned int i = 1; /* i = number of coefficients encoded */ + /* Code and size information for a run length of 16 zero coefficients */ + const unsigned int code_0xf0 = actbl->ehufco[0xf0]; + const unsigned int size_0xf0 = actbl->ehufsi[0xf0]; + + /* The most efficient method of computing nbits and diff depends on the + * number of non-zero coefficients. If the bitmap is not too sparse (> 8 + * non-zero AC coefficients), it is beneficial to do all of the work using + * Neon; else we do some of the work using Neon and the rest on demand using + * scalar code. + */ + if (non_zero_coefficients > 8) { + uint8_t block_nbits[DCTSIZE2]; + + int16x8_t abs_row1 = vabsq_s16(row1); + int16x8_t abs_row2 = vabsq_s16(row2); + int16x8_t abs_row3 = vabsq_s16(row3); + int16x8_t abs_row4 = vabsq_s16(row4); + int16x8_t abs_row5 = vabsq_s16(row5); + int16x8_t abs_row6 = vabsq_s16(row6); + int16x8_t abs_row7 = vabsq_s16(row7); + int16x8_t row1_lz = vclzq_s16(abs_row1); + int16x8_t row2_lz = vclzq_s16(abs_row2); + int16x8_t row3_lz = vclzq_s16(abs_row3); + int16x8_t row4_lz = vclzq_s16(abs_row4); + int16x8_t row5_lz = vclzq_s16(abs_row5); + int16x8_t row6_lz = vclzq_s16(abs_row6); + int16x8_t row7_lz = vclzq_s16(abs_row7); + /* Narrow leading zero count to 8 bits. */ + uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz), + vreinterpretq_u8_s16(row1_lz)); + uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz), + vreinterpretq_u8_s16(row3_lz)); + uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz), + vreinterpretq_u8_s16(row5_lz)); + uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz), + vreinterpretq_u8_s16(row7_lz)); + /* Compute nbits needed to specify magnitude of each coefficient. 
*/ + uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz); + uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz); + uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz); + uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz); + /* Store nbits. */ + vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits); + vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits); + vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits); + vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits); + /* Mask bits not required to specify sign and amplitude of diff. */ + uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz)); + uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz)); + uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz)); + uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz)); + uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz)); + uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz)); + uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz)); + /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */ + uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), + row1_mask); + uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), + row2_mask); + uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), + row3_mask); + uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), + row4_mask); + uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), + row5_mask); + uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), + row6_mask); + uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), + row7_mask); + /* Store diff bits. 
*/ + vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff); + vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff); + vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff); + vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff); + vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff); + vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff); + vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff); + vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff); + + while (bitmap != 0) { + r = BUILTIN_CLZLL(bitmap); + i += r; + bitmap <<= r; + nbits = block_nbits[i]; + diff = block_diff[i]; + while (r > 15) { + /* If run length > 15, emit special run-length-16 codes. */ + PUT_BITS(code_0xf0, size_0xf0) + r -= 16; + } + /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */ + unsigned int rs = (r << 4) + nbits; + PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff) + i++; + bitmap <<= 1; + } + } else if (bitmap != 0) { + uint16_t block_abs[DCTSIZE2]; + /* Compute and store absolute value of coefficients. */ + int16x8_t abs_row1 = vabsq_s16(row1); + int16x8_t abs_row2 = vabsq_s16(row2); + int16x8_t abs_row3 = vabsq_s16(row3); + int16x8_t abs_row4 = vabsq_s16(row4); + int16x8_t abs_row5 = vabsq_s16(row5); + int16x8_t abs_row6 = vabsq_s16(row6); + int16x8_t abs_row7 = vabsq_s16(row7); + vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0)); + vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1)); + vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2)); + vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3)); + vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4)); + vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5)); + vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6)); + vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7)); + /* Compute diff bits (without nbits mask) and store. 
*/ + uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), + vcltzq_s16(row1)); + uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), + vcltzq_s16(row2)); + uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), + vcltzq_s16(row3)); + uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), + vcltzq_s16(row4)); + uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), + vcltzq_s16(row5)); + uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), + vcltzq_s16(row6)); + uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), + vcltzq_s16(row7)); + vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff); + vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff); + vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff); + vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff); + vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff); + vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff); + vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff); + vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff); + + /* Same as above but must mask diff bits and compute nbits on demand. */ + while (bitmap != 0) { + r = BUILTIN_CLZLL(bitmap); + i += r; + bitmap <<= r; + lz = BUILTIN_CLZ(block_abs[i]); + nbits = 32 - lz; + diff = ((unsigned int)block_diff[i] << lz) >> lz; + while (r > 15) { + /* If run length > 15, emit special run-length-16 codes. */ + PUT_BITS(code_0xf0, size_0xf0) + r -= 16; + } + /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */ + unsigned int rs = (r << 4) + nbits; + PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff) + i++; + bitmap <<= 1; + } + } + + /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code. + * The value of RS for the EOB code is 0. 
+ */ + if (i != 64) { + PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]) + } + + state_ptr->cur.put_buffer = put_buffer; + state_ptr->cur.free_bits = free_bits; + + return buffer; +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c new file mode 100644 index 0000000000..604d5472f6 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c @@ -0,0 +1,1058 @@ +/* + * jsimd_arm64.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2020, Arm Limited. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 64-bit Arm architecture. 
+ */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" +#include "jconfigint.h" + +#include +#include +#include + +#define JSIMD_FASTLD3 1 +#define JSIMD_FASTST3 2 +#define JSIMD_FASTTBL 4 + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; +static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 | + JSIMD_FASTTBL; + +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_cpuinfo(char *buffer, const char *field, char *value) +{ + char *p; + + if (*value == 0) + return 0; + if (strncmp(buffer, field, strlen(field)) != 0) + return 0; + buffer += strlen(field); + while (isspace(*buffer)) + buffer++; + + /* Check if 'value' is present in the buffer as a separate word */ + while ((p = strstr(buffer, value))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(value); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo(int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_cpuinfo(buffer, "CPU part", "0xd03") || + check_cpuinfo(buffer, "CPU part", "0xd07")) + /* The Cortex-A53 has a slow tbl implementation. We can gain a few + percent speedup by disabling the use of that instruction. The + speedup on Cortex-A57 is more subtle but still measurable. 
*/ + simd_features &= ~JSIMD_FASTTBL; + else if (check_cpuinfo(buffer, "CPU part", "0x0a1")) + /* The SIMD version of Huffman encoding is slower than the C version on + Cavium ThunderX. Also, ld3 and st3 are abyssmally slow on that + CPU. */ + simd_huffman = simd_features = 0; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ + +/* + * Armv8 architectures support Neon extensions by default. + * It is no longer optional as it was with Armv7. + */ + + +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char env[2] = { 0 }; +#endif +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + + simd_support |= JSIMD_NEON; +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#endif + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1")) + simd_support = JSIMD_NEON; + if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1")) + simd_support = 0; + if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1")) + simd_huffman = 0; + if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1")) + simd_features |= JSIMD_FASTLD3; + if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0")) + simd_features &= ~JSIMD_FASTLD3; + if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1")) + simd_features |= JSIMD_FASTST3; + if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0")) + simd_features &= ~JSIMD_FASTST3; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if 
(BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: +#ifndef NEON_INTRINSICS + if (simd_features & JSIMD_FASTLD3) +#endif + neonfct = jsimd_extrgb_ycc_convert_neon; +#ifndef NEON_INTRINSICS + else + neonfct = jsimd_extrgb_ycc_convert_neon_slowld3; +#endif + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_extrgbx_ycc_convert_neon; + break; + case JCS_EXT_BGR: +#ifndef NEON_INTRINSICS + if (simd_features & JSIMD_FASTLD3) +#endif + neonfct = jsimd_extbgr_ycc_convert_neon; +#ifndef NEON_INTRINSICS + else + neonfct = jsimd_extbgr_ycc_convert_neon_slowld3; +#endif + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = 
jsimd_extbgrx_ycc_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_extxbgr_ycc_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_extxrgb_ycc_convert_neon; + break; + default: +#ifndef NEON_INTRINSICS + if (simd_features & JSIMD_FASTLD3) +#endif + neonfct = jsimd_extrgb_ycc_convert_neon; +#ifndef NEON_INTRINSICS + else + neonfct = jsimd_extrgb_ycc_convert_neon_slowld3; +#endif + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_extrgb_gray_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_extrgbx_gray_convert_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_extbgr_gray_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_extbgrx_gray_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_extxbgr_gray_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_extxrgb_gray_convert_neon; + break; + default: + neonfct = jsimd_extrgb_gray_convert_neon; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: +#ifndef NEON_INTRINSICS + if (simd_features & JSIMD_FASTST3) +#endif + neonfct = jsimd_ycc_extrgb_convert_neon; +#ifndef NEON_INTRINSICS + else + neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; +#endif + break; + case JCS_EXT_RGBX: + 
case JCS_EXT_RGBA: + neonfct = jsimd_ycc_extrgbx_convert_neon; + break; + case JCS_EXT_BGR: +#ifndef NEON_INTRINSICS + if (simd_features & JSIMD_FASTST3) +#endif + neonfct = jsimd_ycc_extbgr_convert_neon; +#ifndef NEON_INTRINSICS + else + neonfct = jsimd_ycc_extbgr_convert_neon_slowst3; +#endif + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_ycc_extbgrx_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_ycc_extxbgr_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_ycc_extxrgb_convert_neon; + break; + default: +#ifndef NEON_INTRINSICS + if (simd_features & JSIMD_FASTST3) +#endif + neonfct = jsimd_ycc_extrgb_convert_neon; +#ifndef NEON_INTRINSICS + else + neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; +#endif + break; + } + + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, 
compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h1v2_fancy_upsample(void) +{ + 
init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_h2v2_extrgb_merged_upsample_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = 
jsimd_h2v2_extrgbx_merged_upsample_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_h2v2_extbgr_merged_upsample_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon; + break; + default: + neonfct = jsimd_h2v2_extrgb_merged_upsample_neon; + break; + } + + neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_h2v1_extrgb_merged_upsample_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_h2v1_extbgr_merged_upsample_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon; + break; + default: + neonfct = jsimd_h2v1_extrgb_merged_upsample_neon; + break; + } + + neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + return 0; +} + +GLOBAL(void) 
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_neon(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + jsimd_fdct_islow_neon(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_neon(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + jsimd_quantize_neon(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if 
(simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_neon(compptr->dct_table, coef_block, 
output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_NEON && simd_huffman) + return 1; + + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ +#ifndef NEON_INTRINSICS + if (simd_features & JSIMD_FASTTBL) +#endif + return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, + dctbl, actbl); +#ifndef NEON_INTRINSICS + else + return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block, + last_dc_val, dctbl, actbl); +#endif +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 8) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ + jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start, + Sl, Al, values, zerobits); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 8) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int 
*jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return jsimd_encode_mcu_AC_refine_prepare_neon(block, + jpeg_natural_order_start, + Sl, Al, absvalues, bits); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd_neon.S b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd_neon.S new file mode 100644 index 0000000000..738a4f0658 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd_neon.S @@ -0,0 +1,2254 @@ +/* + * Armv8 Neon optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). + * All Rights Reserved. + * Author: Siarhei Siamashka + * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved. + * Author: Ragesh Radhakrishnan + * Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. + * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ +#endif + +#if defined(__APPLE__) +.section __DATA, __const +#elif defined(_WIN32) +.section .rdata +#else +.section .rodata, "a", %progbits +#endif + +/* Constants for jsimd_idct_islow_neon() */ + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +.balign 16 +Ljsimd_idct_islow_neon_consts: + .short F_0_298 + .short -F_0_390 + .short F_0_541 + .short F_0_765 + .short - F_0_899 + .short F_1_175 + .short F_1_501 + .short - F_1_847 + .short - F_1_961 + .short F_2_053 + .short - F_2_562 + .short F_3_072 + .short 0 /* padding */ + .short 0 + .short 0 + .short 0 + +#undef F_0_298 +#undef F_0_390 +#undef F_0_541 +#undef F_0_765 +#undef F_0_899 +#undef F_1_175 +#undef F_1_501 +#undef F_1_847 +#undef F_1_961 +#undef F_2_053 +#undef F_2_562 +#undef F_3_072 + +/* Constants for jsimd_ycc_*_neon() */ + +.balign 16 +Ljsimd_ycc_rgb_neon_consts: + .short 0, 0, 0, 0 + .short 22971, -11277, -23401, 29033 + .short -128, -128, -128, -128 + .short -128, -128, -128, -128 + +/* Constants for jsimd_*_ycc_neon() */ + +.balign 16 +Ljsimd_rgb_ycc_neon_consts: + .short 19595, 38470, 7471, 11059 + .short 21709, 32768, 27439, 5329 + .short 32767, 128, 32767, 128 + .short 32767, 128, 32767, 128 + +/* Constants for jsimd_fdct_islow_neon() */ + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ 
+#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ +/* Table layout: 12 FIX() constants in 2^13 fixed point (six of them stored negated) followed by 4 halfwords of padding, 32 bytes in total, aligned to 16 bytes. NOTE(review): the consumer, presumably jsimd_fdct_islow_neon(), is not in this part of the file; confirm the lane order against it before reordering anything. */ +.balign 16 +Ljsimd_fdct_islow_neon_consts: + .short F_0_298 + .short -F_0_390 + .short F_0_541 + .short F_0_765 + .short - F_0_899 + .short F_1_175 + .short F_1_501 + .short - F_1_847 + .short - F_1_961 + .short F_2_053 + .short - F_2_562 + .short F_3_072 + .short 0 /* padding */ + .short 0 + .short 0 + .short 0 + +#undef F_0_298 +#undef F_0_390 +#undef F_0_541 +#undef F_0_765 +#undef F_0_899 +#undef F_1_175 +#undef F_1_501 +#undef F_1_847 +#undef F_1_961 +#undef F_2_053 +#undef F_2_562 +#undef F_3_072 + +/* Constants for jsimd_huff_encode_one_block_neon(). Thirteen rows of 16 bytes each. The first row holds one bit per byte (0x01 .. 0x80), repeated twice. NOTE(review): the other twelve rows look like byte-index vectors for table lookups (TBL), with 255 acting as an out-of-range index that yields a zero byte; the consumer is not in this part of the file, so confirm there. The per-row notes (L0 .. L7) are the original author's. */ + +.balign 16 +Ljsimd_huff_encode_one_block_neon_consts: + .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 + .byte 0, 1, 2, 3, 16, 17, 32, 33, \ + 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ + .byte 34, 35, 48, 49, 255, 255, 50, 51, \ + 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */ + .byte 8, 9, 22, 23, 36, 37, 50, 51, \ + 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */ + .byte 54, 55, 40, 41, 26, 27, 12, 13, \ + 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */ + .byte 6, 7, 20, 21, 34, 35, 48, 49, \ + 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */ + .byte 42, 43, 28, 29, 14, 15, 30, 31, \ + 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */ + .byte 255, 255, 255, 255, 56, 57, 42, 43, \ + 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */ + .byte 26, 27, 40, 41, 42, 43, 28, 29, \ + 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */ + .byte 255, 255, 255, 255, 0, 
1, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 line OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ + .byte 4, 5, 6, 7, 255, 255, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ + +.text + + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes: emits the entry label for fname. On Apple targets the symbol gets a leading underscore and is declared .globl and .private_extern; elsewhere it is declared .global and, on ELF, also .hidden and of type %function. */ +.macro asm_function fname +#ifdef __APPLE__ + .private_extern _\fname + .globl _\fname +_\fname: +#else + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Get symbol location: loads the address of symbol into reg using adrp (address of the 4 KB page) followed by an add of the offset within that page. The relocation operators are @PAGE/@PAGEOFF on Apple targets and :lo12: elsewhere. */ +.macro get_symbol_loc reg, symbol +#ifdef __APPLE__ + adrp \reg, \symbol@PAGE + add \reg, \reg, \symbol@PAGEOFF +#else + adrp \reg, \symbol + add \reg, \reg, :lo12:\symbol +#endif +.endm +/* transpose_8x8: in-place transpose of an 8x8 matrix of 16-bit elements held one row per register in l0..l7. Three rounds of trn1/trn2 interleave the rows at halfword (.8h), word (.4s) and doubleword (.2d) granularity. t0..t3 are scratch registers and are overwritten. Arguments are bare register names (for example v2) because the macro appends the arrangement suffix itself. */ +.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3 + trn1 \t0\().8h, \l0\().8h, \l1\().8h + trn1 \t1\().8h, \l2\().8h, \l3\().8h + trn1 \t2\().8h, \l4\().8h, \l5\().8h + trn1 \t3\().8h, \l6\().8h, \l7\().8h + trn2 \l1\().8h, \l0\().8h, \l1\().8h + trn2 \l3\().8h, \l2\().8h, \l3\().8h + trn2 \l5\().8h, \l4\().8h, \l5\().8h + trn2 \l7\().8h, \l6\().8h, \l7\().8h + + trn1 \l4\().4s, \t2\().4s, \t3\().4s + trn2 \t3\().4s, \t2\().4s, \t3\().4s + trn1 \t2\().4s, \t0\().4s, \t1\().4s + trn2 \l2\().4s, \t0\().4s, \t1\().4s + trn1 \t0\().4s, \l1\().4s, \l3\().4s + trn2 \l3\().4s, \l1\().4s, \l3\().4s + trn2 \t1\().4s, \l5\().4s, \l7\().4s + trn1 \l5\().4s, \l5\().4s, \l7\().4s + + trn2 \l6\().2d, \l2\().2d, \t3\().2d + trn1 \l0\().2d, \t2\().2d, \l4\().2d + trn1 \l1\().2d, \t0\().2d, \l5\().2d + trn2 \l7\().2d, \l3\().2d, \t1\().2d + trn1 \l2\().2d, \l2\().2d, \t3\().2d + trn2 \l4\().2d, \t2\().2d, \l4\().2d + trn1 \l3\().2d, \l3\().2d, \t1\().2d + trn2 \l5\().2d, \t0\().2d, \l5\().2d 
+.endm + + +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + * + * GLOBAL(void) + * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block, + * JSAMPARRAY output_buf, JDIMENSION output_col) + */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define XFIX_P_0_298 v0.h[0] +#define XFIX_N_0_390 v0.h[1] +#define XFIX_P_0_541 v0.h[2] +#define XFIX_P_0_765 v0.h[3] +#define XFIX_N_0_899 v0.h[4] +#define XFIX_P_1_175 v0.h[5] +#define XFIX_P_1_501 v0.h[6] +#define XFIX_N_1_847 v0.h[7] +#define XFIX_N_1_961 v1.h[0] +#define XFIX_P_2_053 v1.h[1] +#define XFIX_N_2_562 v1.h[2] +#define XFIX_P_3_072 v1.h[3] + +asm_function jsimd_idct_islow_neon + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x9 + TMP4 .req x10 + TMP5 .req x11 + TMP6 .req x12 + TMP7 .req x13 + TMP8 .req x14 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. 
*/ + uxtw x3, w3 + + sub sp, sp, #64 + get_symbol_loc x15, Ljsimd_idct_islow_neon_consts + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32 + ld1 {v0.8h, v1.8h}, [x15] + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 + ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64 + ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64 + + cmeq v16.8h, v3.8h, #0 + cmeq v26.8h, v4.8h, #0 + cmeq v27.8h, v5.8h, #0 + cmeq v28.8h, v6.8h, #0 + cmeq v29.8h, v7.8h, #0 + cmeq v30.8h, v8.8h, #0 + cmeq v31.8h, v9.8h, #0 + + and v10.16b, v16.16b, v26.16b + and v11.16b, v27.16b, v28.16b + and v12.16b, v29.16b, v30.16b + and v13.16b, v31.16b, v10.16b + and v14.16b, v11.16b, v12.16b + mul v2.8h, v2.8h, v18.8h + and v15.16b, v13.16b, v14.16b + shl v10.8h, v2.8h, #(PASS1_BITS) + sqxtn v16.8b, v15.8h + mov TMP1, v16.d[0] + mvn TMP2, TMP1 + + cbnz TMP2, 2f + /* case all AC coeffs are zeros */ + dup v2.2d, v10.d[0] + dup v6.2d, v10.d[1] + mov v3.16b, v2.16b + mov v7.16b, v6.16b + mov v4.16b, v2.16b + mov v8.16b, v6.16b + mov v5.16b, v2.16b + mov v9.16b, v6.16b +1: + /* for this transpose, we should organise data like this: + * 00, 01, 02, 03, 40, 41, 42, 43 + * 10, 11, 12, 13, 50, 51, 52, 53 + * 20, 21, 22, 23, 60, 61, 62, 63 + * 30, 31, 32, 33, 70, 71, 72, 73 + * 04, 05, 06, 07, 44, 45, 46, 47 + * 14, 15, 16, 17, 54, 55, 56, 57 + * 24, 25, 26, 27, 64, 65, 66, 67 + * 34, 35, 36, 37, 74, 75, 76, 77 + */ + trn1 v28.8h, v2.8h, v3.8h + trn1 v29.8h, v4.8h, v5.8h + trn1 v30.8h, v6.8h, v7.8h + trn1 v31.8h, v8.8h, v9.8h + trn2 v16.8h, v2.8h, v3.8h + trn2 v17.8h, v4.8h, v5.8h + trn2 v18.8h, v6.8h, v7.8h + trn2 v19.8h, v8.8h, v9.8h + trn1 v2.4s, v28.4s, v29.4s + trn1 v6.4s, v30.4s, v31.4s + trn1 v3.4s, v16.4s, v17.4s + trn1 v7.4s, v18.4s, v19.4s + trn2 v4.4s, v28.4s, v29.4s + trn2 v8.4s, v30.4s, v31.4s + trn2 v5.4s, v16.4s, v17.4s + trn2 v9.4s, v18.4s, v19.4s + /* Even part: 
reverse the even part of the forward DCT. */ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the 
matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + shrn v2.4h, 
v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ + movi v0.16b, #(CENTERJSAMPLE) + /* Prepare pointers (dual-issue with Neon instructions) */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + ldp TMP3, TMP4, [OUTPUT_BUF], 16 + sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + add TMP1, TMP1, OUTPUT_COL + sqrshrn v30.8b, v4.8h, #(CONST_BITS + 
PASS1_BITS + 3 - 16) + add TMP2, TMP2, OUTPUT_COL + sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + add TMP3, TMP3, OUTPUT_COL + sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + add TMP4, TMP4, OUTPUT_COL + sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + ldp TMP5, TMP6, [OUTPUT_BUF], 16 + sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + ldp TMP7, TMP8, [OUTPUT_BUF], 16 + sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + add TMP5, TMP5, OUTPUT_COL + add v16.16b, v28.16b, v0.16b + add TMP6, TMP6, OUTPUT_COL + add v18.16b, v29.16b, v0.16b + add TMP7, TMP7, OUTPUT_COL + add v20.16b, v30.16b, v0.16b + add TMP8, TMP8, OUTPUT_COL + add v22.16b, v31.16b, v0.16b + + /* Transpose the final 8-bit samples */ + trn1 v28.16b, v16.16b, v18.16b + trn1 v30.16b, v20.16b, v22.16b + trn2 v29.16b, v16.16b, v18.16b + trn2 v31.16b, v20.16b, v22.16b + + trn1 v16.8h, v28.8h, v30.8h + trn2 v18.8h, v28.8h, v30.8h + trn1 v20.8h, v29.8h, v31.8h + trn2 v22.8h, v29.8h, v31.8h + + uzp1 v28.4s, v16.4s, v18.4s + uzp2 v30.4s, v16.4s, v18.4s + uzp1 v29.4s, v20.4s, v22.4s + uzp2 v31.4s, v20.4s, v22.4s + + /* Store results to the output buffer */ + st1 {v28.d}[0], [TMP1] + st1 {v29.d}[0], [TMP2] + st1 {v28.d}[1], [TMP3] + st1 {v29.d}[1], [TMP4] + st1 {v30.d}[0], [TMP5] + st1 {v31.d}[0], [TMP6] + st1 {v30.d}[1], [TMP7] + st1 {v31.d}[1], [TMP8] + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 + blr x30 + +.balign 16 +2: + mul v3.8h, v3.8h, v19.8h + mul v4.8h, v4.8h, v20.8h + mul v5.8h, v5.8h, v21.8h + add TMP4, xzr, TMP2, LSL #32 + mul v6.8h, v6.8h, v22.8h + mul v7.8h, v7.8h, v23.8h + adds TMP3, xzr, TMP2, LSR #32 + mul v8.8h, v8.8h, v24.8h + mul v9.8h, v9.8h, v25.8h + b.ne 3f + /* Right AC coef is zero */ + dup v15.2d, v10.d[1] + /* Even part: reverse the even part of the forward DCT. 
*/ + add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
+ */ + + add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, 
v16.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + + rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + mov v6.16b, v15.16b + mov v7.16b, v15.16b + mov v8.16b, v15.16b + mov v9.16b, v15.16b + b 1b + +.balign 16 +3: + cbnz TMP4, 4f + /* Left AC coef is zero */ + dup v14.2d, v10.d[0] + /* Even part: reverse the even part of the forward DCT. 
*/ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
+ */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are 
tmp10..tmp13, tmp0..tmp3 */ + + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + mov v2.16b, v14.16b + mov v3.16b, v14.16b + mov v4.16b, v14.16b + mov v5.16b, v14.16b + rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + b 1b + +.balign 16 +4: + /* "No" AC coef is zero */ + /* Even part: reverse the even part of the forward DCT. 
*/ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * 
transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull v24.4s, v24.4h, 
XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + rshrn v2.4h, v18.4s, #(CONST_BITS - 
PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + b 1b + + .unreq DCT_TABLE + 
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
.unreq OUTPUT_COL
.unreq TMP1
.unreq TMP2
.unreq TMP3
.unreq TMP4
.unreq TMP5
.unreq TMP6
.unreq TMP7
.unreq TMP8

#undef CENTERJSAMPLE
#undef CONST_BITS
#undef PASS1_BITS
#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */

/*
 * do_load size
 *
 * Load "size" bytes from each of the U, V and Y pointers (post-incrementing
 * them) into v4, v5 and v0.  size 8 fills byte lanes 0-7 and prefetches 64
 * bytes ahead of each pointer; sizes 4, 2 and 1 fill byte lanes 0-3, 4-5 and
 * 6 respectively, so that a remainder of up to 7 pixels can be assembled by
 * invoking them in sequence.
 */
.macro do_load size
  .if \size == 8
    ld1     {v4.8b}, [U], 8
    ld1     {v5.8b}, [V], 8
    ld1     {v0.8b}, [Y], 8
    prfm    pldl1keep, [U, #64]
    prfm    pldl1keep, [V, #64]
    prfm    pldl1keep, [Y, #64]
  .elseif \size == 4
    ld1     {v4.b}[0], [U], 1
    ld1     {v4.b}[1], [U], 1
    ld1     {v4.b}[2], [U], 1
    ld1     {v4.b}[3], [U], 1
    ld1     {v5.b}[0], [V], 1
    ld1     {v5.b}[1], [V], 1
    ld1     {v5.b}[2], [V], 1
    ld1     {v5.b}[3], [V], 1
    ld1     {v0.b}[0], [Y], 1
    ld1     {v0.b}[1], [Y], 1
    ld1     {v0.b}[2], [Y], 1
    ld1     {v0.b}[3], [Y], 1
  .elseif \size == 2
    ld1     {v4.b}[4], [U], 1
    ld1     {v4.b}[5], [U], 1
    ld1     {v5.b}[4], [V], 1
    ld1     {v5.b}[5], [V], 1
    ld1     {v0.b}[4], [Y], 1
    ld1     {v0.b}[5], [Y], 1
  .elseif \size == 1
    ld1     {v4.b}[6], [U], 1
    ld1     {v5.b}[6], [V], 1
    ld1     {v0.b}[6], [Y], 1
  .else
    .error unsupported macroblock size
  .endif
.endm

/*
 * do_store bpp, size, fast_st3
 *
 * Store "size" converted pixels to [RGB], post-incrementing the pointer.
 * 24 bpp interleaves v10-v12, 32 bpp interleaves v10-v13, and 16 bpp writes
 * the packed halfwords held in v25.  The lane selection for sizes 4, 2 and 1
 * mirrors do_load (lanes 0-3, 4-5 and 6).  When fast_st3 is 0, the 8-pixel
 * 24-bpp store is issued as 24 single-byte st1 instructions instead of one
 * st3.
 */
.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3     {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
        st1     {v10.b}[0], [RGB], #1
        st1     {v11.b}[0], [RGB], #1
        st1     {v12.b}[0], [RGB], #1

        st1     {v10.b}[1], [RGB], #1
        st1     {v11.b}[1], [RGB], #1
        st1     {v12.b}[1], [RGB], #1

        st1     {v10.b}[2], [RGB], #1
        st1     {v11.b}[2], [RGB], #1
        st1     {v12.b}[2], [RGB], #1

        st1     {v10.b}[3], [RGB], #1
        st1     {v11.b}[3], [RGB], #1
        st1     {v12.b}[3], [RGB], #1

        st1     {v10.b}[4], [RGB], #1
        st1     {v11.b}[4], [RGB], #1
        st1     {v12.b}[4], [RGB], #1

        st1     {v10.b}[5], [RGB], #1
        st1     {v11.b}[5], [RGB], #1
        st1     {v12.b}[5], [RGB], #1

        st1     {v10.b}[6], [RGB], #1
        st1     {v11.b}[6], [RGB], #1
        st1     {v12.b}[6], [RGB], #1

        st1     {v10.b}[7], [RGB], #1
        st1     {v11.b}[7], [RGB], #1
        st1     {v12.b}[7], [RGB], #1
      .endif
    .elseif \size == 4
      st3     {v10.b, v11.b, v12.b}[0], [RGB], 3
      st3     {v10.b, v11.b, v12.b}[1], [RGB], 3
      st3     {v10.b, v11.b, v12.b}[2], [RGB], 3
      st3     {v10.b, v11.b, v12.b}[3], [RGB], 3
    .elseif \size == 2
      st3     {v10.b, v11.b, v12.b}[4], [RGB], 3
      st3     {v10.b, v11.b, v12.b}[5], [RGB], 3
    .elseif \size == 1
      st3     {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
     .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4     {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
      st4     {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
      st4     {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
      st4     {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
      st4     {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
    .elseif \size == 2
      st4     {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
      st4     {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
    .elseif \size == 1
      st4     {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    .if \size == 8
      st1     {v25.8h}, [RGB], 16
    .elseif \size == 4
      st1     {v25.4h}, [RGB], 8
    .elseif \size == 2
      st1     {v25.h}[4], [RGB], 2
      st1     {v25.h}[5], [RGB], 2
    .elseif \size == 1
      st1     {v25.h}[6], [RGB], 2
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm

/*
 * generate_jsimd_ycc_rgb_convert_neon
 *
 * Emits one conversion function per invocation.  colorid selects the symbol
 * name, bpp the output pixel size (16, 24 or 32), and r_offs/g_offs/b_offs
 * pick which of v10-v13 receives each colour channel (the register is formed
 * as v1<offs><defsize>).  fast_st3 selects between the st3-based function and
 * the "_slowst3" variant that stores bytes individually.
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: widen U/V against the bias held in v2 and form the 32-bit chroma
 * products in v20/v22 (green), v24/v26 (red) and v28/v30 (blue). */
.macro do_yuv_to_rgb_stage1
    uaddw   v6.8h, v2.8h, v4.8b           /* v6 = u - 128 */
    uaddw   v8.8h, v2.8h, v5.8b           /* v8 = v - 128 */
    smull   v20.4s, v6.4h, v1.h[1]        /* multiply by -11277 */
    smlal   v20.4s, v8.4h, v1.h[2]        /* multiply by -23401 */
    smull2  v22.4s, v6.8h, v1.h[1]        /* multiply by -11277 */
    smlal2  v22.4s, v8.8h, v1.h[2]        /* multiply by -23401 */
    smull   v24.4s, v8.4h, v1.h[0]        /* multiply by 22971 */
    smull2  v26.4s, v8.8h, v1.h[0]        /* multiply by 22971 */
    smull   v28.4s, v6.4h, v1.h[3]        /* multiply by 29033 */
    smull2  v30.4s, v6.8h, v1.h[3]        /* multiply by 29033 */
.endm

/* Stage 2: round/narrow the products, add luma (v0), then either saturate to
 * bytes in the destination channel registers or pack into RGB565 in v25. */
.macro do_yuv_to_rgb_stage2
    rshrn   v20.4h, v20.4s, #15
    rshrn2  v20.8h, v22.4s, #15
    rshrn   v24.4h, v24.4s, #14
    rshrn2  v24.8h, v26.4s, #14
    rshrn   v28.4h, v28.4s, #14
    rshrn2  v28.8h, v30.4s, #14
    uaddw   v20.8h, v20.8h, v0.8b
    uaddw   v24.8h, v24.8h, v0.8b
    uaddw   v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun  v1\g_offs\defsize, v20.8h
    sqxtun  v1\r_offs\defsize, v24.8h
    sqxtun  v1\b_offs\defsize, v28.8h
  .else
    sqshlu  v21.8h, v20.8h, #8
    sqshlu  v25.8h, v24.8h, #8
    sqshlu  v29.8h, v28.8h, #8
    sri     v25.8h, v21.8h, #5
    sri     v25.8h, v29.8h, #11
  .endif
.endm

/* Steady-state loop body: stage 2 of the current 8 pixels, the store of
 * those pixels, and the loads plus stage 1 of the next 8 pixels, with the
 * instructions hand-interleaved. */
.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn   v20.4h, v20.4s, #15
    rshrn   v24.4h, v24.4s, #14
    rshrn   v28.4h, v28.4s, #14
    ld1     {v4.8b}, [U], 8
    rshrn2  v20.8h, v22.4s, #15
    rshrn2  v24.8h, v26.4s, #14
    rshrn2  v28.8h, v30.4s, #14
    ld1     {v5.8b}, [V], 8
    uaddw   v20.8h, v20.8h, v0.8b
    uaddw   v24.8h, v24.8h, v0.8b
    uaddw   v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun  v1\g_offs\defsize, v20.8h
    ld1     {v0.8b}, [Y], 8
    sqxtun  v1\r_offs\defsize, v24.8h
    prfm    pldl1keep, [U, #64]
    prfm    pldl1keep, [V, #64]
    prfm    pldl1keep, [Y, #64]
    sqxtun  v1\b_offs\defsize, v28.8h
    uaddw   v6.8h, v2.8h, v4.8b           /* v6 = u - 128 */
    uaddw   v8.8h, v2.8h, v5.8b           /* v8 = v - 128 */
    smull   v20.4s, v6.4h, v1.h[1]        /* multiply by -11277 */
    smlal   v20.4s, v8.4h, v1.h[2]        /* multiply by -23401 */
    smull2  v22.4s, v6.8h, v1.h[1]        /* multiply by -11277 */
    smlal2  v22.4s, v8.8h, v1.h[2]        /* multiply by -23401 */
    smull   v24.4s, v8.4h, v1.h[0]        /* multiply by 22971 */
    smull2  v26.4s, v8.8h, v1.h[0]        /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu  v21.8h, v20.8h, #8
    sqshlu  v25.8h, v24.8h, #8
    sqshlu  v29.8h, v28.8h, #8
    uaddw   v6.8h, v2.8h, v4.8b           /* v6 = u - 128 */
    uaddw   v8.8h, v2.8h, v5.8b           /* v8 = v - 128 */
    ld1     {v0.8b}, [Y], 8
    smull   v20.4s, v6.4h, v1.h[1]        /* multiply by -11277 */
    smlal   v20.4s, v8.4h, v1.h[2]        /* multiply by -23401 */
    smull2  v22.4s, v6.8h, v1.h[1]        /* multiply by -11277 */
    smlal2  v22.4s, v8.8h, v1.h[2]        /* multiply by -23401 */
    sri     v25.8h, v21.8h, #5
    smull   v24.4s, v8.4h, v1.h[0]        /* multiply by 22971 */
    smull2  v26.4s, v8.8h, v1.h[0]        /* multiply by 22971 */
    prfm    pldl1keep, [U, #64]
    prfm    pldl1keep, [V, #64]
    prfm    pldl1keep, [Y, #64]
    sri     v25.8h, v29.8h, #11
  .endif
    do_store \bpp, 8, \fast_st3
    smull   v28.4s, v6.4h, v1.h[3]        /* multiply by 29033 */
    smull2  v30.4s, v6.8h, v1.h[3]        /* multiply by 29033 */
.endm

/* Unpipelined conversion of whatever is currently loaded (used for the
 * remainder of fewer than 8 pixels). */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1               /* reuses the INPUT_BUF register */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w15

    /* Reserve 64 bytes of stack for the low halves of v8-v15; x9 is used as
     * a scratch cursor here, before it takes on its role as Y. */
    sub     sp, sp, 64
    mov     x9, sp

    /* Locate the constant table (loaded below into v0.4h/v1.4h and v2.8h;
     * v0.4h is just padding and is overwritten by the first luma load) */
    get_symbol_loc x15, Ljsimd_ycc_rgb_neon_consts

    /* Save Neon registers */
    st1     {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1     {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1     {v0.4h, v1.4h}, [x15], 16
    ld1     {v2.8h}, [x15]

    /* Fetch the three plane pointers; INPUT_BUF is dead afterwards, which is
     * why INPUT_BUF2 can share its register. */
    ldr     INPUT_BUF0, [INPUT_BUF]
    ldr     INPUT_BUF1, [INPUT_BUF, #8]
    ldr     INPUT_BUF2, [INPUT_BUF, #16]
    .unreq  INPUT_BUF

    /* Pre-fill v10 and v13 with 0xFF.  NOTE(review): presumably this supplies
     * the filler byte for the 32-bpp formats, where one of these two
     * registers is not written by the conversion -- confirm. */
    movi    v10.16b, #255
    movi    v13.16b, #255

    /* Outer loop over scanlines */
    cmp     NUM_ROWS, #1
    b.lt    9f
0:
    ldr     Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr     U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov     N, OUTPUT_WIDTH
    ldr     V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add     INPUT_ROW, INPUT_ROW, #1
    ldr     RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs    N, N, #8
    b.lt    3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs    N, N, #8
    b.lt    2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs    N, N, #8
    b.ge    1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8, \fast_st3
    tst     N, #7
    b.eq    8f
3:
    /* Remainder: the low three bits of N give the number of pixels left */
    tst     N, #4
    b.eq    3f
    do_load 4
3:
    tst     N, #2
    b.eq    4f
    do_load 2
4:
    tst     N, #1
    b.eq    5f
    do_load 1
5:
    do_yuv_to_rgb
    tst     N, #4
    b.eq    6f
    do_store \bpp, 4, \fast_st3
6:
    tst     N, #2
    b.eq    7f
    do_store \bpp, 2, \fast_st3
7:
    tst     N, #1
    b.eq    8f
    do_store \bpp, 1, \fast_st3
8:
    subs    NUM_ROWS, NUM_ROWS, #1
    b.gt    0b
9:
    /* Restore all registers and return */
    ld1     {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1     {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* RET (unlike BR x30) carries the subroutine-return hint and is not
     * subject to BTI landing-pad checks at the return site. */
    ret
    .unreq  OUTPUT_WIDTH
    .unreq  INPUT_ROW
    .unreq  OUTPUT_BUF
    .unreq  NUM_ROWS
    .unreq  INPUT_BUF0
    .unreq  INPUT_BUF1
    .unreq  INPUT_BUF2
    .unreq  RGB
    .unreq  Y
    .unreq  U
    .unreq  V
    .unreq  N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1

/* "_slowst3" variants of the two 24-bpp converters */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */

/*
 * do_store size
 *
 * Store "size" bytes from each of v20, v21 and v22 to the Y, U and V
 * pointers (post-incrementing them).  size 8 stores lanes 0-7; sizes 4, 2
 * and 1 store byte lanes 0-3, 4-5 and 6 respectively.
 */
.macro do_store size
  .if \size == 8
    st1     {v20.8b}, [Y], #8
    st1     {v21.8b}, [U], #8
    st1     {v22.8b}, [V], #8
  .elseif \size == 4
    st1     {v20.b}[0], [Y], #1
    st1     {v20.b}[1], [Y], #1
    st1     {v20.b}[2], [Y], #1
    st1     {v20.b}[3], [Y], #1
    st1     {v21.b}[0], [U], #1
    st1     {v21.b}[1], [U], #1
    st1     {v21.b}[2], [U], #1
    st1     {v21.b}[3], [U], #1
    st1     {v22.b}[0], [V], #1
    st1     {v22.b}[1], [V], #1
    st1     {v22.b}[2], [V], #1
    st1     {v22.b}[3], [V], #1
  .elseif \size == 2
    st1     {v20.b}[4], [Y], #1
    st1     {v20.b}[5], [Y], #1
    st1     {v21.b}[4], [U], #1
    st1     {v21.b}[5], [U], #1
    st1     {v22.b}[4], [V], #1
    st1     {v22.b}[5], [V], #1
  .elseif \size == 1
    st1     {v20.b}[6], [Y], #1
    st1     {v21.b}[6], [U], #1
    st1     {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm

/*
 * do_load bpp, size, fast_ld3
 *
 * De-interleave "size" pixels from [RGB] (post-incrementing) into v10-v12
 * (24 bpp) or v10-v13 (32 bpp), using the same lane layout as do_store.
 * The 8-pixel forms also prefetch 128 bytes ahead.  When fast_ld3 is 0, the
 * 8-pixel 24-bpp load is issued as 24 single-byte ld1 instructions instead
 * of one ld3.
 */
.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3     {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        ld1     {v10.b}[0], [RGB], #1
        ld1     {v11.b}[0], [RGB], #1
        ld1     {v12.b}[0], [RGB], #1

        ld1     {v10.b}[1], [RGB], #1
        ld1     {v11.b}[1], [RGB], #1
        ld1     {v12.b}[1], [RGB], #1

        ld1     {v10.b}[2], [RGB], #1
        ld1     {v11.b}[2], [RGB], #1
        ld1     {v12.b}[2], [RGB], #1

        ld1     {v10.b}[3], [RGB], #1
        ld1     {v11.b}[3], [RGB], #1
        ld1     {v12.b}[3], [RGB], #1

        ld1     {v10.b}[4], [RGB], #1
        ld1     {v11.b}[4], [RGB], #1
        ld1     {v12.b}[4], [RGB], #1

        ld1     {v10.b}[5], [RGB], #1
        ld1     {v11.b}[5], [RGB], #1
        ld1     {v12.b}[5], [RGB], #1

        ld1     {v10.b}[6], [RGB], #1
        ld1     {v11.b}[6], [RGB], #1
        ld1     {v12.b}[6], [RGB], #1

        ld1     {v10.b}[7], [RGB], #1
        ld1     {v11.b}[7], [RGB], #1
        ld1     {v12.b}[7], [RGB], #1
      .endif
      prfm    pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld3     {v10.b, v11.b, v12.b}[0], [RGB], #3
      ld3     {v10.b, v11.b, v12.b}[1], [RGB], #3
      ld3     {v10.b, v11.b, v12.b}[2], [RGB], #3
      ld3     {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
      ld3     {v10.b, v11.b, v12.b}[4], [RGB], #3
      ld3     {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
      ld3     {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4     {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm    pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld4     {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
      ld4     {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
      ld4     {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
      ld4     {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
      ld4     {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
      ld4     {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
      ld4     {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm

/*
 * generate_jsimd_rgb_ycc_convert_neon
 *
 * Emits one conversion function per invocation.  colorid selects the symbol
 * name, bpp the input pixel size (24 or 32), and r_offs/g_offs/b_offs select
 * which of v10-v13 holds each colour channel after do_load.  fast_ld3
 * selects between the ld3-based function and the "_slowld3" variant.
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

/* Stage 1: widen R/G/B to 16 bits and accumulate the 32-bit weighted sums:
 * v14/v16 for Y, v18/v26 for U and v28/v30 for V.  The U and V accumulators
 * are seeded from v1 via rev64. */
.macro do_rgb_to_yuv_stage1
    ushll   v4.8h, v1\r_offs\().8b, #0    /* r = v4 */
    ushll   v6.8h, v1\g_offs\().8b, #0    /* g = v6 */
    ushll   v8.8h, v1\b_offs\().8b, #0    /* b = v8 */
    rev64   v18.4s, v1.4s
    rev64   v26.4s, v1.4s
    rev64   v28.4s, v1.4s
    rev64   v30.4s, v1.4s
    umull   v14.4s, v4.4h, v0.h[0]
    umull2  v16.4s, v4.8h, v0.h[0]
    umlsl   v18.4s, v4.4h, v0.h[3]
    umlsl2  v26.4s, v4.8h, v0.h[3]
    umlal   v28.4s, v4.4h, v0.h[5]
    umlal2  v30.4s, v4.8h, v0.h[5]
    umlal   v14.4s, v6.4h, v0.h[1]
    umlal2  v16.4s, v6.8h, v0.h[1]
    umlsl   v18.4s, v6.4h, v0.h[4]
    umlsl2  v26.4s, v6.8h, v0.h[4]
    umlsl   v28.4s, v6.4h, v0.h[6]
    umlsl2  v30.4s, v6.8h, v0.h[6]
    umlal   v14.4s, v8.4h, v0.h[2]
    umlal2  v16.4s, v8.8h, v0.h[2]
    umlal   v18.4s, v8.4h, v0.h[5]
    umlal2  v26.4s, v8.8h, v0.h[5]
    umlsl   v28.4s, v8.4h, v0.h[7]
    umlsl2  v30.4s, v8.8h, v0.h[7]
.endm

/* Stage 2: shift the sums right by 16 (rounding for Y only) and narrow the
 * results to bytes in v20 (Y), v21 (U) and v22 (V). */
.macro do_rgb_to_yuv_stage2
    rshrn   v20.4h, v14.4s, #16
    shrn    v22.4h, v18.4s, #16
    shrn    v24.4h, v28.4s, #16
    rshrn2  v20.8h, v16.4s, #16
    shrn2   v22.8h, v26.4s, #16
    shrn2   v24.8h, v30.4s, #16
    xtn     v20.8b, v20.8h                /* v20 = y */
    xtn     v21.8b, v22.8h                /* v21 = u */
    xtn     v22.8b, v24.8h                /* v22 = v */
.endm

/* Unpipelined conversion of whatever is currently loaded (used for the
 * remainder of fewer than 8 pixels). */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 *       AArch64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load \bpp, 8, \fast_ld3
    st1     {v20.8b}, [Y], #8
    st1     {v21.8b}, [U], #8
    st1     {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2               /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load constants into v0 and v1 */
    get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts
    ld1     {v0.8h, v1.8h}, [x13]

    /* Fetch the three plane pointers; OUTPUT_BUF is dead afterwards, which
     * is why OUTPUT_BUF2 can share its register. */
    ldr     OUTPUT_BUF0, [OUTPUT_BUF]
    ldr     OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr     OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq  OUTPUT_BUF

    /* Save Neon registers (x9 is a scratch cursor here, before it takes on
     * its role as Y) */
    sub     sp, sp, #64
    mov     x9, sp
    st1     {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1     {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp     NUM_ROWS, #1
    b.lt    9f
0:
    ldr     Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr     U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov     N, OUTPUT_WIDTH
    ldr     V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add     OUTPUT_ROW, OUTPUT_ROW, #1
    ldr     RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs    N, N, #8
    b.lt    3f
    do_load \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs    N, N, #8
    b.lt    2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs    N, N, #8
    b.ge    1b
2:
    do_rgb_to_yuv_stage2
    do_store 8
    tst     N, #7
    b.eq    8f
3:
    /* Remainder: the low three bits of N give the number of pixels left */
    tbz     N, #2, 3f
    do_load \bpp, 4, \fast_ld3
3:
    tbz     N, #1, 4f
    do_load \bpp, 2, \fast_ld3
4:
    tbz     N, #0, 5f
    do_load \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz     N, #2, 6f
    do_store 4
6:
    tbz     N, #1, 7f
    do_store 2
7:
    tbz     N, #0, 8f
    do_store 1
8:
    subs    NUM_ROWS, NUM_ROWS, #1
    b.gt    0b
9:
    /* Restore all registers and return */
    ld1     {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1     {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* RET (unlike BR x30) carries the subroutine-return hint and is not
     * subject to BTI landing-pad checks at the return site. */
    ret

    .unreq  OUTPUT_WIDTH
    .unreq  OUTPUT_ROW
    .unreq  INPUT_BUF
    .unreq  NUM_ROWS
    .unreq  OUTPUT_BUF0
    .unreq  OUTPUT_BUF1
    .unreq  OUTPUT_BUF2
    .unreq  RGB
    .unreq  Y
    .unreq  U
    .unreq  V
    .unreq  N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

/* "_slowld3" variants of the two 24-bpp converters */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 *
jsimd_fdct_islow_neon
 *
 * This file contains a slower but more accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform). The following code is based
 * directly on the IJG''s original jfdctint.c; see the jfdctint.c for
 * more details.
 *
 * In:  x0 (DATA) points to 64 16-bit coefficients (128 bytes), which are
 *      transformed in place.
 * The routine runs the same 1-D transform twice, each time preceded by an
 * 8x8 transpose.  The first pass scales its outputs up by PASS1_BITS
 * (descale by CONST_BITS - PASS1_BITS); the second pass removes that scaling
 * again (descale by CONST_BITS + PASS1_BITS).
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */

#define CONST_BITS  13
#define PASS1_BITS  2

/* Right-shift amounts used to descale 32-bit products in pass 1 and pass 2 */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)

/* Lanes of the constant table once it has been loaded into v0/v1 */
#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]

asm_function jsimd_fdct_islow_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
    ld1     {v0.8h, v1.8h}, [TMP]

    /* Save Neon registers (low 64 bits of v8-v15) */
    sub     sp, sp, #64
    mov     x10, sp
    st1     {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
    st1     {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32

    /* Load all DATA into Neon registers with the following allocation
     * (each register holds eight consecutive 16-bit values):
     *       0 1 2 3 4 5 6 7
     *      -----------------
     *   0 |     v16.8h      |
     *   1 |     v17.8h      |
     *   2 |     v18.8h      |
     *   3 |     v19.8h      |
     *   4 |     v20.8h      |
     *   5 |     v21.8h      |
     *   6 |     v22.8h      |
     *   7 |     v23.8h      |
     */

    ld1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    sub     DATA, DATA, #64               /* rewind for the final stores */

    /* Transpose */
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* ---- Pass 1: 1-D FDCT ---- */
    add     v24.8h, v16.8h, v23.8h        /* tmp0 = dataptr[0] + dataptr[7]; */
    sub     v31.8h, v16.8h, v23.8h        /* tmp7 = dataptr[0] - dataptr[7]; */
    add     v25.8h, v17.8h, v22.8h        /* tmp1 = dataptr[1] + dataptr[6]; */
    sub     v30.8h, v17.8h, v22.8h        /* tmp6 = dataptr[1] - dataptr[6]; */
    add     v26.8h, v18.8h, v21.8h        /* tmp2 = dataptr[2] + dataptr[5]; */
    sub     v29.8h, v18.8h, v21.8h        /* tmp5 = dataptr[2] - dataptr[5]; */
    add     v27.8h, v19.8h, v20.8h        /* tmp3 = dataptr[3] + dataptr[4]; */
    sub     v28.8h, v19.8h, v20.8h        /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */

    add     v8.8h, v24.8h, v27.8h         /* tmp10 = tmp0 + tmp3; */
    sub     v9.8h, v24.8h, v27.8h         /* tmp13 = tmp0 - tmp3; */
    add     v10.8h, v25.8h, v26.8h        /* tmp11 = tmp1 + tmp2; */
    sub     v11.8h, v25.8h, v26.8h        /* tmp12 = tmp1 - tmp2; */

    add     v16.8h, v8.8h, v10.8h         /* tmp10 + tmp11 */
    sub     v20.8h, v8.8h, v10.8h         /* tmp10 - tmp11 */

    add     v18.8h, v11.8h, v9.8h         /* tmp12 + tmp13 */

    shl     v16.8h, v16.8h, #PASS1_BITS   /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
    shl     v20.8h, v20.8h, #PASS1_BITS   /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */

    smull2  v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull   v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov     v22.16b, v18.16b              /* second copy of z1 lo for dataptr[6] */
    mov     v25.16b, v24.16b              /* second copy of z1 hi for dataptr[6] */

    smlal   v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2  v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal   v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2  v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn   v18.4h, v18.4s, #DESCALE_P1
    rshrn   v22.4h, v22.4s, #DESCALE_P1
    rshrn2  v18.8h, v24.4s, #DESCALE_P1   /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2  v22.8h, v25.4s, #DESCALE_P1   /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */

    add     v8.8h, v28.8h, v31.8h         /* z1 = tmp4 + tmp7; */
    add     v9.8h, v29.8h, v30.8h         /* z2 = tmp5 + tmp6; */
    add     v10.8h, v28.8h, v30.8h        /* z3 = tmp4 + tmp6; */
    add     v11.8h, v29.8h, v31.8h        /* z4 = tmp5 + tmp7; */
    smull   v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2  v5.4s, v10.8h, XFIX_P_1_175
    smlal   v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2  v5.4s, v11.8h, XFIX_P_1_175

    /* High halves go to v24-v27, low halves stay in v28-v31 */
    smull2  v24.4s, v28.8h, XFIX_P_0_298
    smull2  v25.4s, v29.8h, XFIX_P_2_053
    smull2  v26.4s, v30.8h, XFIX_P_3_072
    smull2  v27.4s, v31.8h, XFIX_P_1_501
    smull   v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull   v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull   v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull   v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    /* High halves go to v12-v15, low halves stay in v8-v11 */
    smull2  v12.4s, v8.8h, XFIX_N_0_899
    smull2  v13.4s, v9.8h, XFIX_N_2_562
    smull2  v14.4s, v10.8h, XFIX_N_1_961
    smull2  v15.4s, v11.8h, XFIX_N_0_390
    smull   v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull   v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull   v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull   v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add     v10.4s, v10.4s, v4.4s         /* z3 += z5 */
    add     v14.4s, v14.4s, v5.4s
    add     v11.4s, v11.4s, v4.4s         /* z4 += z5 */
    add     v15.4s, v15.4s, v5.4s

    add     v28.4s, v28.4s, v8.4s         /* tmp4 += z1 */
    add     v24.4s, v24.4s, v12.4s
    add     v29.4s, v29.4s, v9.4s         /* tmp5 += z2 */
    add     v25.4s, v25.4s, v13.4s
    add     v30.4s, v30.4s, v10.4s        /* tmp6 += z3 */
    add     v26.4s, v26.4s, v14.4s
    add     v31.4s, v31.4s, v11.4s        /* tmp7 += z4 */
    add     v27.4s, v27.4s, v15.4s

    add     v28.4s, v28.4s, v10.4s        /* tmp4 += z3 */
    add     v24.4s, v24.4s, v14.4s
    add     v29.4s, v29.4s, v11.4s        /* tmp5 += z4 */
    add     v25.4s, v25.4s, v15.4s
    add     v30.4s, v30.4s, v9.4s         /* tmp6 += z2 */
    add     v26.4s, v26.4s, v13.4s
    add     v31.4s, v31.4s, v8.4s         /* tmp7 += z1 */
    add     v27.4s, v27.4s, v12.4s

    rshrn   v23.4h, v28.4s, #DESCALE_P1
    rshrn   v21.4h, v29.4s, #DESCALE_P1
    rshrn   v19.4h, v30.4s, #DESCALE_P1
    rshrn   v17.4h, v31.4s, #DESCALE_P1
    rshrn2  v23.8h, v24.4s, #DESCALE_P1   /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2  v21.8h, v25.4s, #DESCALE_P1   /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2  v19.8h, v26.4s, #DESCALE_P1   /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2  v17.8h, v27.4s, #DESCALE_P1   /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* Transpose */
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* ---- Pass 2: 1-D FDCT ---- */
    add     v24.8h, v16.8h, v23.8h        /* tmp0 = dataptr[0] + dataptr[7]; */
    sub     v31.8h, v16.8h, v23.8h        /* tmp7 = dataptr[0] - dataptr[7]; */
    add     v25.8h, v17.8h, v22.8h        /* tmp1 = dataptr[1] + dataptr[6]; */
    sub     v30.8h, v17.8h, v22.8h        /* tmp6 = dataptr[1] - dataptr[6]; */
    add     v26.8h, v18.8h, v21.8h        /* tmp2 = dataptr[2] + dataptr[5]; */
    sub     v29.8h, v18.8h, v21.8h        /* tmp5 = dataptr[2] - dataptr[5]; */
    add     v27.8h, v19.8h, v20.8h        /* tmp3 = dataptr[3] + dataptr[4]; */
    sub     v28.8h, v19.8h, v20.8h        /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */
    add     v8.8h, v24.8h, v27.8h         /* tmp10 = tmp0 + tmp3; */
    sub     v9.8h, v24.8h, v27.8h         /* tmp13 = tmp0 - tmp3; */
    add     v10.8h, v25.8h, v26.8h        /* tmp11 = tmp1 + tmp2; */
    sub     v11.8h, v25.8h, v26.8h        /* tmp12 = tmp1 - tmp2; */

    add     v16.8h, v8.8h, v10.8h         /* tmp10 + tmp11 */
    sub     v20.8h, v8.8h, v10.8h         /* tmp10 - tmp11 */

    add     v18.8h, v11.8h, v9.8h         /* tmp12 + tmp13 */

    srshr   v16.8h, v16.8h, #PASS1_BITS   /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr   v20.8h, v20.8h, #PASS1_BITS   /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2  v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull   v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov     v22.16b, v18.16b              /* second copy of z1 lo for dataptr[6] */
    mov     v25.16b, v24.16b              /* second copy of z1 hi for dataptr[6] */

    smlal   v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2  v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal   v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2  v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn   v18.4h, v18.4s, #DESCALE_P2
    rshrn   v22.4h, v22.4s, #DESCALE_P2
    rshrn2  v18.8h, v24.4s, #DESCALE_P2   /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
    rshrn2  v22.8h, v25.4s, #DESCALE_P2   /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */

    /* Odd part */
    add     v8.8h, v28.8h, v31.8h         /* z1 = tmp4 + tmp7; */
    add     v9.8h, v29.8h, v30.8h         /* z2 = tmp5 + tmp6; */
    add     v10.8h, v28.8h, v30.8h        /* z3 = tmp4 + tmp6; */
    add     v11.8h, v29.8h, v31.8h        /* z4 = tmp5 + tmp7; */

    smull   v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2  v5.4s, v10.8h, XFIX_P_1_175
    smlal   v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2  v5.4s, v11.8h, XFIX_P_1_175

    /* High halves go to v24-v27, low halves stay in v28-v31 */
    smull2  v24.4s, v28.8h, XFIX_P_0_298
    smull2  v25.4s, v29.8h, XFIX_P_2_053
    smull2  v26.4s, v30.8h, XFIX_P_3_072
    smull2  v27.4s, v31.8h, XFIX_P_1_501
    smull   v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull   v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull   v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull   v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    /* High halves go to v12-v15, low halves stay in v8-v11 */
    smull2  v12.4s, v8.8h, XFIX_N_0_899
    smull2  v13.4s, v9.8h, XFIX_N_2_562
    smull2  v14.4s, v10.8h, XFIX_N_1_961
    smull2  v15.4s, v11.8h, XFIX_N_0_390
    smull   v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull   v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull   v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull   v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add     v10.4s, v10.4s, v4.4s         /* z3 += z5 */
    add     v14.4s, v14.4s, v5.4s
    add     v11.4s, v11.4s, v4.4s         /* z4 += z5 */
    add     v15.4s, v15.4s, v5.4s

    add     v28.4s, v28.4s, v8.4s         /* tmp4 += z1 */
    add     v24.4s, v24.4s, v12.4s
    add     v29.4s, v29.4s, v9.4s         /* tmp5 += z2 */
    add     v25.4s, v25.4s, v13.4s
    add     v30.4s, v30.4s, v10.4s        /* tmp6 += z3 */
    add     v26.4s, v26.4s, v14.4s
    add     v31.4s, v31.4s, v11.4s        /* tmp7 += z4 */
    add     v27.4s, v27.4s, v15.4s

    add     v28.4s, v28.4s, v10.4s        /* tmp4 += z3 */
    add     v24.4s, v24.4s, v14.4s
    add     v29.4s, v29.4s, v11.4s        /* tmp5 += z4 */
    add     v25.4s, v25.4s, v15.4s
    add     v30.4s, v30.4s, v9.4s         /* tmp6 += z2 */
    add     v26.4s, v26.4s, v13.4s
    add     v31.4s, v31.4s, v8.4s         /* tmp7 += z1 */
    add     v27.4s, v27.4s, v12.4s

    rshrn   v23.4h, v28.4s, #DESCALE_P2
    rshrn   v21.4h, v29.4s, #DESCALE_P2
    rshrn   v19.4h, v30.4s, #DESCALE_P2
    rshrn   v17.4h, v31.4s, #DESCALE_P2
    rshrn2  v23.8h, v24.4s, #DESCALE_P2   /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2  v21.8h, v25.4s, #DESCALE_P2   /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
    rshrn2  v19.8h, v26.4s, #DESCALE_P2   /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2  v17.8h, v27.4s, #DESCALE_P2   /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */

    /* store results */
    st1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore Neon registers */
    ld1     {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1     {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    /* RET (unlike BR x30) carries the subroutine-return hint and is not
     * subject to BTI landing-pad checks at the return site. */
    ret

    .unreq  DATA
    .unreq  TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


+/*****************************************************************************/ + +/* + * GLOBAL(JOCTET *) + * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, + * JCOEFPTR block, int last_dc_val, + * c_derived_tbl *dctbl, c_derived_tbl *actbl) + * + */ + + BUFFER .req x1 + PUT_BUFFER .req x6 + PUT_BITS .req x7 + PUT_BITSw .req w7 + +.macro emit_byte + sub PUT_BITS, PUT_BITS, #0x8 + lsr x19, PUT_BUFFER, PUT_BITS + uxtb w19, w19 + strb w19, [BUFFER, #1]! + cmp w19, #0xff + b.ne 14f + strb wzr, [BUFFER, #1]! +14: +.endm +.macro put_bits CODE, SIZE + lsl PUT_BUFFER, PUT_BUFFER, \SIZE + add PUT_BITS, PUT_BITS, \SIZE + orr PUT_BUFFER, PUT_BUFFER, \CODE +.endm +.macro checkbuf31 + cmp PUT_BITS, #0x20 + b.lt 31f + emit_byte + emit_byte + emit_byte + emit_byte +31: +.endm +.macro checkbuf47 + cmp PUT_BITS, #0x30 + b.lt 47f + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte +47: +.endm + +.macro generate_jsimd_huff_encode_one_block fast_tbl + +.if \fast_tbl == 1 +asm_function jsimd_huff_encode_one_block_neon +.else +asm_function jsimd_huff_encode_one_block_neon_slowtbl +.endif + sub sp, sp, 272 + sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ + /* Save Arm registers */ + stp x19, x20, [sp] + get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts + ldr PUT_BUFFER, [x0, #0x10] + ldr PUT_BITSw, [x0, #0x18] + ldrsh w12, [x2] /* load DC coeff in w12 */ + /* prepare data */ +.if \fast_tbl == 1 + ld1 {v23.16b}, [x15], #16 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + /* ZigZag 8x8 */ + tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b + tbl 
v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b + tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b + tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b + tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b + ins v0.h[0], w12 + tbx v1.16b, {v28.16b}, v16.16b + tbx v2.16b, {v29.16b, v30.16b}, v17.16b + tbx v5.16b, {v29.16b, v30.16b}, v18.16b + tbx v6.16b, {v31.16b}, v19.16b +.else + add x13, x2, #0x22 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + ld1 {v23.16b}, [x15] + add x14, x2, #0x18 + add x3, x2, #0x36 + ins v0.h[0], w12 + add x9, x2, #0x2 + ld1 {v1.h}[0], [x13] + add x15, x2, #0x30 + ld1 {v2.h}[0], [x14] + add x19, x2, #0x26 + ld1 {v3.h}[0], [x3] + add x20, x2, #0x28 + ld1 {v0.h}[1], [x9] + add x12, x2, #0x10 + ld1 {v1.h}[1], [x15] + add x13, x2, #0x40 + ld1 {v2.h}[1], [x19] + add x14, x2, #0x34 + ld1 {v3.h}[1], [x20] + add x3, x2, #0x1a + ld1 {v0.h}[2], [x12] + add x9, x2, #0x20 + ld1 {v1.h}[2], [x13] + add x15, x2, #0x32 + ld1 {v2.h}[2], [x14] + add x19, x2, #0x42 + ld1 {v3.h}[2], [x3] + add x20, x2, #0xc + ld1 {v0.h}[3], [x9] + add x12, x2, #0x12 + ld1 {v1.h}[3], [x15] + add x13, x2, #0x24 + ld1 {v2.h}[3], [x19] + add x14, x2, #0x50 + ld1 {v3.h}[3], [x20] + add x3, x2, #0xe + ld1 {v0.h}[4], [x12] + add x9, x2, #0x4 + ld1 {v1.h}[4], [x13] + add x15, x2, #0x16 + ld1 {v2.h}[4], [x14] + add x19, x2, #0x60 + ld1 {v3.h}[4], [x3] + add x20, x2, #0x1c + ld1 {v0.h}[5], [x9] + add x12, x2, #0x6 + ld1 {v1.h}[5], [x15] + add x13, x2, #0x8 + ld1 {v2.h}[5], [x19] + add x14, x2, #0x52 + ld1 {v3.h}[5], [x20] + add x3, x2, #0x2a + ld1 {v0.h}[6], [x12] + add x9, x2, #0x14 + ld1 {v1.h}[6], [x13] + add x15, x2, #0xa + ld1 {v2.h}[6], [x14] + add x19, x2, #0x44 + ld1 {v3.h}[6], [x3] + add x20, x2, #0x38 + ld1 {v0.h}[7], [x9] + add x12, x2, #0x46 + ld1 {v1.h}[7], [x15] + add x13, x2, #0x3a + ld1 {v2.h}[7], [x19] + add x14, x2, #0x74 + ld1 {v3.h}[7], [x20] + add x3, x2, #0x6a + ld1 {v4.h}[0], 
[x12] + add x9, x2, #0x54 + ld1 {v5.h}[0], [x13] + add x15, x2, #0x2c + ld1 {v6.h}[0], [x14] + add x19, x2, #0x76 + ld1 {v7.h}[0], [x3] + add x20, x2, #0x78 + ld1 {v4.h}[1], [x9] + add x12, x2, #0x62 + ld1 {v5.h}[1], [x15] + add x13, x2, #0x1e + ld1 {v6.h}[1], [x19] + add x14, x2, #0x68 + ld1 {v7.h}[1], [x20] + add x3, x2, #0x7a + ld1 {v4.h}[2], [x12] + add x9, x2, #0x70 + ld1 {v5.h}[2], [x13] + add x15, x2, #0x2e + ld1 {v6.h}[2], [x14] + add x19, x2, #0x5a + ld1 {v7.h}[2], [x3] + add x20, x2, #0x6c + ld1 {v4.h}[3], [x9] + add x12, x2, #0x72 + ld1 {v5.h}[3], [x15] + add x13, x2, #0x3c + ld1 {v6.h}[3], [x19] + add x14, x2, #0x4c + ld1 {v7.h}[3], [x20] + add x3, x2, #0x5e + ld1 {v4.h}[4], [x12] + add x9, x2, #0x64 + ld1 {v5.h}[4], [x13] + add x15, x2, #0x4a + ld1 {v6.h}[4], [x14] + add x19, x2, #0x3e + ld1 {v7.h}[4], [x3] + add x20, x2, #0x6e + ld1 {v4.h}[5], [x9] + add x12, x2, #0x56 + ld1 {v5.h}[5], [x15] + add x13, x2, #0x58 + ld1 {v6.h}[5], [x19] + add x14, x2, #0x4e + ld1 {v7.h}[5], [x20] + add x3, x2, #0x7c + ld1 {v4.h}[6], [x12] + add x9, x2, #0x48 + ld1 {v5.h}[6], [x13] + add x15, x2, #0x66 + ld1 {v6.h}[6], [x14] + add x19, x2, #0x5c + ld1 {v7.h}[6], [x3] + add x20, x2, #0x7e + ld1 {v4.h}[7], [x9] + ld1 {v5.h}[7], [x15] + ld1 {v6.h}[7], [x19] + ld1 {v7.h}[7], [x20] +.endif + cmlt v24.8h, v0.8h, #0 + cmlt v25.8h, v1.8h, #0 + cmlt v26.8h, v2.8h, #0 + cmlt v27.8h, v3.8h, #0 + cmlt v28.8h, v4.8h, #0 + cmlt v29.8h, v5.8h, #0 + cmlt v30.8h, v6.8h, #0 + cmlt v31.8h, v7.8h, #0 + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + abs v4.8h, v4.8h + abs v5.8h, v5.8h + abs v6.8h, v6.8h + abs v7.8h, v7.8h + eor v24.16b, v24.16b, v0.16b + eor v25.16b, v25.16b, v1.16b + eor v26.16b, v26.16b, v2.16b + eor v27.16b, v27.16b, v3.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b, v29.16b, v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b + cmeq v16.8h, v0.8h, #0 + cmeq v17.8h, v1.8h, #0 + cmeq v18.8h, v2.8h, #0 + cmeq v19.8h, v3.8h, #0 
+ cmeq v20.8h, v4.8h, #0 + cmeq v21.8h, v5.8h, #0 + cmeq v22.8h, v6.8h, #0 + xtn v16.8b, v16.8h + xtn v18.8b, v18.8h + xtn v20.8b, v20.8h + xtn v22.8b, v22.8h + umov w14, v0.h[0] + xtn2 v16.16b, v17.8h + umov w13, v24.h[0] + xtn2 v18.16b, v19.8h + clz w14, w14 + xtn2 v20.16b, v21.8h + lsl w13, w13, w14 + cmeq v17.8h, v7.8h, #0 + sub w12, w14, #32 + xtn2 v22.16b, v17.8h + lsr w13, w13, w14 + and v16.16b, v16.16b, v23.16b + neg w12, w12 + and v18.16b, v18.16b, v23.16b + add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ + and v20.16b, v20.16b, v23.16b + add x15, sp, #0x90 /* x15 = t2 */ + and v22.16b, v22.16b, v23.16b + ldr w10, [x4, x12, lsl #2] + addp v16.16b, v16.16b, v18.16b + ldrb w11, [x3, x12] + addp v20.16b, v20.16b, v22.16b + checkbuf47 + addp v16.16b, v16.16b, v20.16b + put_bits x10, x11 + addp v16.16b, v16.16b, v18.16b + checkbuf47 + umov x9, v16.D[0] + put_bits x13, x12 + cnt v17.8b, v16.8b + mvn x9, x9 + addv B18, v17.8b + add x4, x5, #0x400 /* x4 = actbl->ehufsi */ + umov w12, v18.b[0] + lsr x9, x9, #0x1 /* clear AC coeff */ + ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ + rbit x9, x9 /* x9 = index0 */ + ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ + cmp w12, #(64-8) + add x11, sp, #16 + b.lt 4f + cbz x9, 6f + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w20, [x15, #-126] +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 + b 2b +3: + clz w20, w20 + ldrh w3, [x15, #2]! 
+ sub w11, w20, #32 + lsl w3, w3, w20 + neg w11, w11 + lsr w3, w3, w20 + add x2, x11, x2, lsl #4 + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b + b 6f +4: + movi v21.8h, #0x0010 + clz v0.8h, v0.8h + clz v1.8h, v1.8h + clz v2.8h, v2.8h + clz v3.8h, v3.8h + clz v4.8h, v4.8h + clz v5.8h, v5.8h + clz v6.8h, v6.8h + clz v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h, v6.8h + ushl v31.8h, v31.8h, v7.8h + neg v0.8h, v0.8h + neg v1.8h, v1.8h + neg v2.8h, v2.8h + neg v3.8h, v3.8h + neg v4.8h, v4.8h + neg v5.8h, v5.8h + neg v6.8h, v6.8h + neg v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h, v6.8h + ushl v31.8h, v31.8h, v7.8h + add v0.8h, v21.8h, v0.8h + add v1.8h, v21.8h, v1.8h + add v2.8h, v21.8h, v2.8h + add v3.8h, v21.8h, v3.8h + add v4.8h, v21.8h, v4.8h + add v5.8h, v21.8h, v5.8h + add v6.8h, v21.8h, v6.8h + add v7.8h, v21.8h, v7.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w11, [x15, #-126] +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 + b 2b +3: + ldrh w3, [x15, #2]! 
+ add x2, x11, x2, lsl #4 + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b +6: + add x13, sp, #0x10e + cmp x15, x13 + b.hs 1f + ldr w12, [x5] + ldrb w14, [x4] + checkbuf47 + put_bits x12, x14 +1: + str PUT_BUFFER, [x0, #0x10] + str PUT_BITSw, [x0, #0x18] + ldp x19, x20, [sp], 16 + add x0, BUFFER, #0x1 + add sp, sp, 256 + br x30 + +.endm + +generate_jsimd_huff_encode_one_block 1 +generate_jsimd_huff_encode_one_block 0 + + .unreq BUFFER + .unreq PUT_BUFFER + .unreq PUT_BITS + .unreq PUT_BITSw + +.purgem emit_byte +.purgem put_bits +.purgem checkbuf31 +.purgem checkbuf47 diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/align.h b/3rdparty/libjpeg-turbo/src/simd/arm/align.h new file mode 100644 index 0000000000..cff4241e84 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/align.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* How to obtain memory alignment for structures and variables */ +#if defined(_MSC_VER) +#define ALIGN(alignment) __declspec(align(alignment)) +#elif defined(__clang__) || defined(__GNUC__) +#define ALIGN(alignment) __attribute__((aligned(alignment))) +#else +#error "Unknown compiler" +#endif diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jccolor-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jccolor-neon.c new file mode 100644 index 0000000000..9fcc62dd25 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jccolor-neon.c @@ -0,0 +1,160 @@ +/* + * jccolor-neon.c - colorspace conversion (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" +#include "neon-compat.h" + +#include <arm_neon.h> + + +/* RGB -> YCbCr conversion constants */ + +#define F_0_298 19595 +#define F_0_587 38470 +#define F_0_113 7471 +#define F_0_168 11059 +#define F_0_331 21709 +#define F_0_500 32768 +#define F_0_418 27439 +#define F_0_081 5329 + +ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = { + F_0_298, F_0_587, F_0_113, F_0_168, + F_0_331, F_0_500, F_0_418, F_0_081 +}; + + +/* Include inline routines for colorspace extensions. */ + +#if defined(__aarch64__) || defined(_M_ARM64) +#include "aarch64/jccolext-neon.c" +#else +#include "aarch32/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon +#if defined(__aarch64__) || defined(_M_ARM64) +#include "aarch64/jccolext-neon.c" +#else +#include "aarch32/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon +#if defined(__aarch64__) || defined(_M_ARM64) +#include "aarch64/jccolext-neon.c" +#else +#include "aarch32/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon +#if
defined(__aarch64__) || defined(_M_ARM64) +#include "aarch64/jccolext-neon.c" +#else +#include "aarch32/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon +#if defined(__aarch64__) || defined(_M_ARM64) +#include "aarch64/jccolext-neon.c" +#else +#include "aarch32/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon +#if defined(__aarch64__) || defined(_M_ARM64) +#include "aarch64/jccolext-neon.c" +#else +#include "aarch32/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon +#if defined(__aarch64__) || defined(_M_ARM64) +#include "aarch64/jccolext-neon.c" +#else +#include "aarch32/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jcgray-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jcgray-neon.c new file mode 100644 index 0000000000..71c7b2de21 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jcgray-neon.c @@ -0,0 +1,120 @@ +/* + * jcgray-neon.c - grayscale colorspace conversion (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. 
+ * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" + +#include <arm_neon.h> + + +/* RGB -> Grayscale conversion constants */ + +#define F_0_298 19595 +#define F_0_587 38470 +#define F_0_113 7471 + + +/* Include inline routines for colorspace extensions.
*/ + +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define 
jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jcgryext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jcgryext-neon.c new file mode 100644 index 0000000000..416a7385df --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jcgryext-neon.c @@ -0,0 +1,106 @@ +/* + * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jcgray-neon.c */ + + +/* RGB -> Grayscale conversion is defined by the following equation: + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * + * Avoid floating point arithmetic by using shifted integer constants: + * 0.29899597 = 19595 * 2^-16 + * 0.58700561 = 38470 * 2^-16 + * 0.11399841 = 7471 * 2^-16 + * These constants are defined in jcgray-neon.c + * + * This is the same computation as the RGB -> Y portion of RGB -> YCbCr. 
+ */ + +void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + JSAMPROW inptr; + JSAMPROW outptr; + /* Allocate temporary buffer for final (image_width % 16) pixels in row. */ + ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE]; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr = output_buf[0][output_row]; + output_row++; + + int cols_remaining = image_width; + for (; cols_remaining > 0; cols_remaining -= 16) { + + /* To prevent buffer overread by the vector load instructions, the last + * (image_width % 16) columns of data are first memcopied to a temporary + * buffer large enough to accommodate the vector load. + */ + if (cols_remaining < 16) { + memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); + inptr = tmp_buf; + } + +#if RGB_PIXELSIZE == 4 + uint8x16x4_t input_pixels = vld4q_u8(inptr); +#else + uint8x16x3_t input_pixels = vld3q_u8(inptr); +#endif + uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED])); + uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE])); + uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE])); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298); + uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298); + uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298); + uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298); + y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587); + y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587); + y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587); + y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587); + y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113); + y_lh = 
vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113); + y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113); + y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. */ + uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16), + vrshrn_n_u32(y_lh, 16)); + uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16), + vrshrn_n_u32(y_hh, 16)); + + /* Narrow Y values to 8-bit and store to memory. Buffer overwrite is + * permitted up to the next multiple of ALIGN_SIZE bytes. + */ + vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h))); + + /* Increment pointers. */ + inptr += (16 * RGB_PIXELSIZE); + outptr += 16; + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jchuff.h b/3rdparty/libjpeg-turbo/src/simd/arm/jchuff.h new file mode 100644 index 0000000000..2fbd252b9b --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jchuff.h @@ -0,0 +1,131 @@ +/* + * jchuff.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2009, 2018, 2021, D. R. Commander. + * Copyright (C) 2018, Matthias Räncker. + * Copyright (C) 2020-2021, Arm Limited. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + +/* Expanded entropy encoder object for Huffman encoding. + * + * The savable_state subrecord contains fields that change within an MCU, + * but must not be updated permanently until we complete the MCU. 
+ */ + +#if defined(__aarch64__) || defined(_M_ARM64) +#define BIT_BUF_SIZE 64 +#else +#define BIT_BUF_SIZE 32 +#endif + +typedef struct { + size_t put_buffer; /* current bit accumulation buffer */ + int free_bits; /* # of bits available in it */ + int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ +} savable_state; + +typedef struct { + JOCTET *next_output_byte; /* => next byte to write in buffer */ + size_t free_in_buffer; /* # of byte spaces remaining in buffer */ + savable_state cur; /* Current bit buffer & DC state */ + j_compress_ptr cinfo; /* dump_buffer needs access to this */ + int simd; +} working_state; + +/* Outputting bits to the file */ + +/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded + * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is + * 0xFF. Otherwise, the output buffer pointer is advanced by 1, and the + * speculative 0 byte will be overwritten by the next byte. + */ +#define EMIT_BYTE(b) { \ + buffer[0] = (JOCTET)(b); \ + buffer[1] = 0; \ + buffer -= -2 + ((JOCTET)(b) < 0xFF); \ +} + +/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write + * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to + * encode 0xFF as 0xFF 0x00. 
+ */ +#if defined(__aarch64__) || defined(_M_ARM64) + +#define FLUSH() { \ + if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ + EMIT_BYTE(put_buffer >> 56) \ + EMIT_BYTE(put_buffer >> 48) \ + EMIT_BYTE(put_buffer >> 40) \ + EMIT_BYTE(put_buffer >> 32) \ + EMIT_BYTE(put_buffer >> 24) \ + EMIT_BYTE(put_buffer >> 16) \ + EMIT_BYTE(put_buffer >> 8) \ + EMIT_BYTE(put_buffer ) \ + } else { \ + *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \ + buffer += 8; \ + } \ +} + +#else + +#if defined(_MSC_VER) && !defined(__clang__) +#define SPLAT() { \ + buffer[0] = (JOCTET)(put_buffer >> 24); \ + buffer[1] = (JOCTET)(put_buffer >> 16); \ + buffer[2] = (JOCTET)(put_buffer >> 8); \ + buffer[3] = (JOCTET)(put_buffer ); \ + buffer += 4; \ +} +#else +#define SPLAT() { \ + put_buffer = __builtin_bswap32(put_buffer); \ + __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \ +} +#endif + +#define FLUSH() { \ + if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \ + EMIT_BYTE(put_buffer >> 24) \ + EMIT_BYTE(put_buffer >> 16) \ + EMIT_BYTE(put_buffer >> 8) \ + EMIT_BYTE(put_buffer ) \ + } else { \ + SPLAT(); \ + } \ +} + +#endif + +/* Fill the bit buffer to capacity with the leading bits from code, then output + * the bit buffer and put the remaining bits from code into the bit buffer. + */ +#define PUT_AND_FLUSH(code, size) { \ + put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \ + FLUSH() \ + free_bits += BIT_BUF_SIZE; \ + put_buffer = code; \ +} + +/* Insert code into the bit buffer and output the bit buffer if needed. + * NOTE: We can't flush with free_bits == 0, since the left shift in + * PUT_AND_FLUSH() would have undefined behavior. 
+ */ +#define PUT_BITS(code, size) { \ + free_bits -= size; \ + if (free_bits < 0) \ + PUT_AND_FLUSH(code, size) \ + else \ + put_buffer = (put_buffer << size) | code; \ +} + +#define PUT_CODE(code, size, diff) { \ + diff |= code << nbits; \ + nbits += size; \ + PUT_BITS(diff, nbits) \ +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c new file mode 100644 index 0000000000..b91c5db478 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c @@ -0,0 +1,622 @@ +/* + * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon) + * + * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "jconfigint.h" +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "neon-compat.h" + +#include <arm_neon.h> + + +/* Data preparation for encode_mcu_AC_first(). + * + * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be + * found in jcphuff.c.
+ */ + /* Gathers the first Sl coefficients of block in jpeg_natural_order_start order, writes abs(coef) >> Al to values[], writes the sign-adjusted copies to values[DCTSIZE2..], zero-fills the rows that were not processed, and stores in zerobits a bitmap of the non-zero entries of values[]. */ +void jsimd_encode_mcu_AC_first_prepare_neon + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *values, size_t *zerobits) +{ + JCOEF *values_ptr = values; + JCOEF *diff_values_ptr = values + DCTSIZE2; + + /* Rows of coefficients to zero (since they haven't been processed); one row is 8 coefficients, so each 16-coefficient step below consumes two rows. */ + int i, rows_to_zero = 8; + + for (i = 0; i < Sl / 16; i++) { + int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7); + int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7); + + /* Isolate sign of coefficients (arithmetic shift by 15 leaves 0 or -1 in each lane). */ + int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15); + int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15); + /* Compute absolute value of coefficients and apply point transform Al. The shift count passed to vshlq_s16 is -Al, i.e. a right shift by Al.
*/ + int16x8_t abs_coefs1 = vabsq_s16(coefs1); + int16x8_t abs_coefs2 = vabsq_s16(coefs2); + coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); + coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + + /* Compute diff values: XOR with the 0 / -1 sign mask keeps non-negative lanes unchanged and bit-inverts the lanes that were negative. */ + int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1); + int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2); + + /* Store transformed coefficients and diff values. */ + vst1q_s16(values_ptr, coefs1); + vst1q_s16(values_ptr + DCTSIZE, coefs2); + vst1q_s16(diff_values_ptr, diff1); + vst1q_s16(diff_values_ptr + DCTSIZE, diff2); + values_ptr += 16; + diff_values_ptr += 16; + jpeg_natural_order_start += 16; + rows_to_zero -= 2; + } + + /* Same operation but for remaining partial vector; lanes that are not loaded keep the zero they were initialized with. */ + int remaining_coefs = Sl % 16; + if (remaining_coefs > 8) { + int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7); + int16x8_t coefs2 = vdupq_n_s16(0); + switch (remaining_coefs) { + case 15: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 14: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 13: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 12: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 11: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2); +
FALLTHROUGH /*FALLTHROUGH*/ + case 10: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 9: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } + + /* Isolate sign of coefficients. */ + int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15); + int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15); + /* Compute absolute value of coefficients and apply point transform Al. */ + int16x8_t abs_coefs1 = vabsq_s16(coefs1); + int16x8_t abs_coefs2 = vabsq_s16(coefs2); + coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); + coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + + /* Compute diff values. */ + int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1); + int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2); + + /* Store transformed coefficients and diff values. */ + vst1q_s16(values_ptr, coefs1); + vst1q_s16(values_ptr + DCTSIZE, coefs2); + vst1q_s16(diff_values_ptr, diff1); + vst1q_s16(diff_values_ptr + DCTSIZE, diff2); + values_ptr += 16; + diff_values_ptr += 16; + rows_to_zero -= 2; + + } else if (remaining_coefs > 0) { + int16x8_t coefs = vdupq_n_s16(0); + + switch (remaining_coefs) { + case 8: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7); + FALLTHROUGH /*FALLTHROUGH*/ + case 7: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 6: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 5: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 4: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 3: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 2: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1); + FALLTHROUGH
/*FALLTHROUGH*/ + case 1: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } + + /* Isolate sign of coefficients. */ + int16x8_t sign_coefs = vshrq_n_s16(coefs, 15); + /* Compute absolute value of coefficients and apply point transform Al. */ + int16x8_t abs_coefs = vabsq_s16(coefs); + coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al)); + + /* Compute diff values. */ + int16x8_t diff = veorq_s16(coefs, sign_coefs); + + /* Store transformed coefficients and diff values. */ + vst1q_s16(values_ptr, coefs); + vst1q_s16(diff_values_ptr, diff); + values_ptr += 8; + diff_values_ptr += 8; + rows_to_zero--; + } + + /* Zero remaining memory in the values and diff_values blocks. */ + for (i = 0; i < rows_to_zero; i++) { + vst1q_s16(values_ptr, vdupq_n_s16(0)); + vst1q_s16(diff_values_ptr, vdupq_n_s16(0)); + values_ptr += 8; + diff_values_ptr += 8; + } + + /* Construct zerobits bitmap. A set bit means that the corresponding + * coefficient != 0. The bitmap is built below with one bit set per entry that equals zero and is inverted when it is stored.
+ */ + int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE); + int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE); + int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE); + int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE); + int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE); + int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE); + int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE); + int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE); + + uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0))); + uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0))); + uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0))); + uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0))); + uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0))); + uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0))); + uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0))); + uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0))); + + /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 }: one distinct bit per lane, so the three levels of pairwise adds below pack each 8-lane row into one byte of bitmap_all (byte k holds row k). */ + const uint8x8_t bitmap_mask = + vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201)); + + row0_eq0 = vand_u8(row0_eq0, bitmap_mask); + row1_eq0 = vand_u8(row1_eq0, bitmap_mask); + row2_eq0 = vand_u8(row2_eq0, bitmap_mask); + row3_eq0 = vand_u8(row3_eq0, bitmap_mask); + row4_eq0 = vand_u8(row4_eq0, bitmap_mask); + row5_eq0 = vand_u8(row5_eq0, bitmap_mask); + row6_eq0 = vand_u8(row6_eq0, bitmap_mask); + row7_eq0 = vand_u8(row7_eq0, bitmap_mask); + + uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0); + uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0); + uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0); + uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0); + uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23); + uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); + uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); + +#if defined(__aarch64__) || defined(_M_ARM64) + /* Move bitmap to a
64-bit scalar register. */ + uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); + /* Store zerobits bitmap (inverted, so that a set bit marks a non-zero value). */ + *zerobits = ~bitmap; +#else + /* Move bitmap to two 32-bit scalar registers. */ + uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); + uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); + /* Store zerobits bitmap (inverted; zerobits[0] covers entries 0-31 and zerobits[1] entries 32-63). */ + zerobits[0] = ~bitmap0; + zerobits[1] = ~bitmap1; +#endif +} + + +/* Data preparation for encode_mcu_AC_refine(). + * + * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be + * found in jcphuff.c. Writes abs(coef) >> Al to absvalues, stores the zerobits and signbits bitmaps in bits, and returns the EOB position. + */ + +int jsimd_encode_mcu_AC_refine_prepare_neon + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *absvalues, size_t *bits) +{ + /* Temporary storage buffers for data used to compute the signbits bitmap and + * the end-of-block (EOB) position + */ + uint8_t coef_sign_bits[64]; + uint8_t coef_eq1_bits[64]; + + JCOEF *absvalues_ptr = absvalues; + uint8_t *coef_sign_bits_ptr = coef_sign_bits; + uint8_t *eq1_bits_ptr = coef_eq1_bits; + + /* Rows of coefficients to zero (since they haven't been processed) */ + int i, rows_to_zero = 8; + + for (i = 0; i < Sl / 16; i++) { + int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7); + int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1); + coefs2 = vld1q_lane_s16(block +
jpeg_natural_order_start[10], coefs2, 2); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6); + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7); + + /* Compute and store data for signbits bitmap (one byte per coefficient: 0xFF if the coefficient is negative, 0x00 otherwise). */ + uint8x8_t sign_coefs1 = + vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15))); + uint8x8_t sign_coefs2 = + vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15))); + vst1_u8(coef_sign_bits_ptr, sign_coefs1); + vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); + + /* Compute absolute value of coefficients and apply point transform Al (the shift count passed to vshlq_s16 is -Al, i.e. a right shift by Al). */ + int16x8_t abs_coefs1 = vabsq_s16(coefs1); + int16x8_t abs_coefs2 = vabsq_s16(coefs2); + coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); + coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + vst1q_s16(absvalues_ptr, coefs1); + vst1q_s16(absvalues_ptr + DCTSIZE, coefs2); + + /* Test whether transformed coefficient values == 1 (used to find EOB + * position.)
+ */ + uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1))); + uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1))); + vst1_u8(eq1_bits_ptr, coefs_eq11); + vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); + + absvalues_ptr += 16; + coef_sign_bits_ptr += 16; + eq1_bits_ptr += 16; + jpeg_natural_order_start += 16; + rows_to_zero -= 2; + } + + /* Same operation but for remaining partial vector; lanes that are not loaded keep the zero they were initialized with. */ + int remaining_coefs = Sl % 16; + if (remaining_coefs > 8) { + int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6); + coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7); + int16x8_t coefs2 = vdupq_n_s16(0); + switch (remaining_coefs) { + case 15: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 14: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 13: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 12: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 11: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 10: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 9: + coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } + + /*
Compute and store data for signbits bitmap. */ + uint8x8_t sign_coefs1 = + vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15))); + uint8x8_t sign_coefs2 = + vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15))); + vst1_u8(coef_sign_bits_ptr, sign_coefs1); + vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); + + /* Compute absolute value of coefficients and apply point transform Al. */ + int16x8_t abs_coefs1 = vabsq_s16(coefs1); + int16x8_t abs_coefs2 = vabsq_s16(coefs2); + coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); + coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + vst1q_s16(absvalues_ptr, coefs1); + vst1q_s16(absvalues_ptr + DCTSIZE, coefs2); + + /* Test whether transformed coefficient values == 1 (used to find EOB + * position.) + */ + uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1))); + uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1))); + vst1_u8(eq1_bits_ptr, coefs_eq11); + vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); + + absvalues_ptr += 16; + coef_sign_bits_ptr += 16; + eq1_bits_ptr += 16; + jpeg_natural_order_start += 16; /* this pointer is not read again in the rest of the function */ + rows_to_zero -= 2; + + } else if (remaining_coefs > 0) { + int16x8_t coefs = vdupq_n_s16(0); + + switch (remaining_coefs) { + case 8: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7); + FALLTHROUGH /*FALLTHROUGH*/ + case 7: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 6: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 5: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 4: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 3: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 2: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1); +
FALLTHROUGH /*FALLTHROUGH*/ + case 1: + coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } + + /* Compute and store data for signbits bitmap. */ + uint8x8_t sign_coefs = + vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15))); + vst1_u8(coef_sign_bits_ptr, sign_coefs); + + /* Compute absolute value of coefficients and apply point transform Al. */ + int16x8_t abs_coefs = vabsq_s16(coefs); + coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al)); + vst1q_s16(absvalues_ptr, coefs); + + /* Test whether transformed coefficient values == 1 (used to find EOB + * position.) + */ + uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1))); + vst1_u8(eq1_bits_ptr, coefs_eq1); + + absvalues_ptr += 8; + coef_sign_bits_ptr += 8; + eq1_bits_ptr += 8; + rows_to_zero--; + } + + /* Zero remaining memory in blocks (absvalues and both temporary byte buffers), one 8-element row per iteration. */ + for (i = 0; i < rows_to_zero; i++) { + vst1q_s16(absvalues_ptr, vdupq_n_s16(0)); + vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0)); + vst1_u8(eq1_bits_ptr, vdup_n_u8(0)); + absvalues_ptr += 8; + coef_sign_bits_ptr += 8; + eq1_bits_ptr += 8; + } + + /* Construct zerobits bitmap. It is built with one bit set per absvalues entry that equals zero and is inverted when it is stored.
*/ + int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE); + int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE); + int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE); + int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE); + int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE); + int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE); + int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE); + int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE); + + uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0))); + uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0))); + uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0))); + uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0))); + uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0))); + uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0))); + uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0))); + uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0))); + + /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 }: one distinct bit per lane, so three levels of pairwise adds pack each 8-lane row into one byte of bitmap_all (byte k holds row k). */ + const uint8x8_t bitmap_mask = + vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201)); + + abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask); + abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask); + abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask); + abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask); + abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask); + abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask); + abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask); + abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask); + + uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0); + uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0); + uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0); + uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0); + uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23); +
uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); + uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); + +#if defined(__aarch64__) || defined(_M_ARM64) + /* Move bitmap to a 64-bit scalar register. */ + uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); + /* Store zerobits bitmap (inverted, so that a set bit marks a non-zero absvalues entry). */ + bits[0] = ~bitmap; +#else + /* Move bitmap to two 32-bit scalar registers. */ + uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); + uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); + /* Store zerobits bitmap (inverted) in bits[0] (entries 0-31) and bits[1] (entries 32-63). */ + bits[0] = ~bitmap0; + bits[1] = ~bitmap1; +#endif + + /* Construct signbits bitmap. */ + uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE); + uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE); + uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE); + uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE); + uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE); + uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE); + uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE); + uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE); + + signbits_row0 = vand_u8(signbits_row0, bitmap_mask); + signbits_row1 = vand_u8(signbits_row1, bitmap_mask); + signbits_row2 = vand_u8(signbits_row2, bitmap_mask); + signbits_row3 = vand_u8(signbits_row3, bitmap_mask); + signbits_row4 = vand_u8(signbits_row4, bitmap_mask); + signbits_row5 = vand_u8(signbits_row5, bitmap_mask); + signbits_row6 = vand_u8(signbits_row6, bitmap_mask); + signbits_row7 = vand_u8(signbits_row7, bitmap_mask); + + bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1); + bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3); + bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5); + bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7); + bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23); + bitmap_rows_4567 =
vpadd_u8(bitmap_rows_45, bitmap_rows_67); + bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); + +#if defined(__aarch64__) || defined(_M_ARM64) + /* Move bitmap to a 64-bit scalar register. */ + bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); + /* Store signbits bitmap (inverted, so that a set bit marks a coefficient that is not negative). */ + bits[1] = ~bitmap; +#else + /* Move bitmap to two 32-bit scalar registers. */ + bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); + bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); + /* Store signbits bitmap (inverted) in bits[2] (entries 0-31) and bits[3] (entries 32-63). */ + bits[2] = ~bitmap0; + bits[3] = ~bitmap1; +#endif + + /* Construct bitmap to find EOB position (the index of the last coefficient + * equal to 1.) + */ + uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE); + uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE); + uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE); + uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE); + uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE); + uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE); + uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE); + uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE); + + row0_eq1 = vand_u8(row0_eq1, bitmap_mask); + row1_eq1 = vand_u8(row1_eq1, bitmap_mask); + row2_eq1 = vand_u8(row2_eq1, bitmap_mask); + row3_eq1 = vand_u8(row3_eq1, bitmap_mask); + row4_eq1 = vand_u8(row4_eq1, bitmap_mask); + row5_eq1 = vand_u8(row5_eq1, bitmap_mask); + row6_eq1 = vand_u8(row6_eq1, bitmap_mask); + row7_eq1 = vand_u8(row7_eq1, bitmap_mask); + + bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1); + bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1); + bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1); + bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1); + bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23); + bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67); + bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); + +#if defined(__aarch64__) || defined(_M_ARM64) + /* Move bitmap to a 64-bit
scalar register. */ + bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); + + /* Return EOB position: the index of the highest set bit, assuming BUILTIN_CLZLL counts leading zero bits (the macro is defined outside this view - TODO confirm). */ + if (bitmap == 0) { + /* EOB position is defined to be 0 if all coefficients != 1. */ + return 0; + } else { + return 63 - BUILTIN_CLZLL(bitmap); + } +#else + /* Move bitmap to two 32-bit scalar registers. */ + bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); + bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); + + /* Return EOB position; the upper word (entries 32-63) is checked first because the last matching entry is wanted. */ + if (bitmap0 == 0 && bitmap1 == 0) { + return 0; + } else if (bitmap1 != 0) { + return 63 - BUILTIN_CLZ(bitmap1); + } else { + return 31 - BUILTIN_CLZ(bitmap0); + } +#endif +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jcsample-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jcsample-neon.c new file mode 100644 index 0000000000..8a3e237838 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jcsample-neon.c @@ -0,0 +1,192 @@ +/* + * jcsample-neon.c - downsampling (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution.
+ */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" + +#include <arm_neon.h> + + +/* Byte-index tables for the table-lookup padding of the last DCT block: row "Pad n" (16 bytes) maps the last n positions to the index of the last valid pixel. */ +ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */ + 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */ + 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */ + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */ + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */ + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */ + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */ + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + + +/* Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 1:1 vertical, + * without smoothing. Reads v_samp_factor rows of input_data and writes the same number of rows to output_data, DCTSIZE samples per block for width_in_blocks blocks; max_v_samp_factor is not used. + */ + +void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + JSAMPROW inptr, outptr; + /* Load expansion mask to pad remaining elements of last DCT block. The number of missing columns selects one 16-byte "Pad n" row of the table. */ + const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width); + const uint8x16_t expand_mask = + vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]); + /* Load bias pattern (alternating every pixel.) */ + /* { 0, 1, 0, 1, 0, 1, 0, 1 } */ + const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000)); + unsigned i, outrow; + + for (outrow = 0; outrow < v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr = input_data[outrow]; + + /* Downsample all but the last DCT block of pixels. */ + for (i = 0; i < width_in_blocks - 1; i++) { + uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE); + /* Add adjacent pixel values, widen to 16-bit, and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels); + /* Divide total by 2 and narrow to 8-bit. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1); + /* Store samples to memory. */ + vst1_u8(outptr + i * DCTSIZE, samples_u8); + } + + /* Load pixels in last DCT block into a table. */ + uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE); +#if defined(__aarch64__) || defined(_M_ARM64) + /* Pad the empty elements with the value of the last pixel.
*/ + pixels = vqtbl1q_u8(pixels, expand_mask); +#else + uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } }; + pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)), + vtbl2_u8(table, vget_high_u8(expand_mask))); +#endif + /* Add adjacent pixel values, widen to 16-bit, and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels); + /* Divide total by 2, narrow to 8-bit, and store. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1); + vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8); + } +} + + +/* Downsample pixel values of a single component. + * This version handles the standard case of 2:1 horizontal and 2:1 vertical, + * without smoothing. Output row r averages input rows 2r and 2r+1, so input_data must supply 2 * v_samp_factor rows; max_v_samp_factor is not used. + */ + +void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + JSAMPROW inptr0, inptr1, outptr; + /* Load expansion mask to pad remaining elements of last DCT block. The number of missing columns selects one 16-byte "Pad n" row of the table. */ + const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width); + const uint8x16_t expand_mask = + vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]); + /* Load bias pattern (alternating every pixel.) */ + /* { 1, 2, 1, 2, 1, 2, 1, 2 } */ + const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001)); + unsigned i, outrow; + + for (outrow = 0; outrow < v_samp_factor; outrow++) { + outptr = output_data[outrow]; + /* Each output row consumes its own pair of input rows (2:1 vertical); indexing by outrow alone would reuse a row for v_samp_factor > 1. */ inptr0 = input_data[2 * outrow]; + inptr1 = input_data[2 * outrow + 1]; + + /* Downsample all but the last DCT block of pixels. */ + for (i = 0; i < width_in_blocks - 1; i++) { + uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE); + uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE); + /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0); + /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+ */ + samples_u16 = vpadalq_u8(samples_u16, pixels_r1); + /* Divide total by 4 and narrow to 8-bit. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2); + /* Store samples to memory. */ + vst1_u8(outptr + i * DCTSIZE, samples_u8); + } + + /* Load pixels in last DCT block into a table. */ + uint8x16_t pixels_r0 = + vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE); + uint8x16_t pixels_r1 = + vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE); +#if defined(__aarch64__) || defined(_M_ARM64) + /* Pad the empty elements with the value of the last pixel. */ + pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask); + pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask); +#else + uint8x8x2_t table_r0 = + { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } }; + uint8x8x2_t table_r1 = + { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } }; + pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)), + vtbl2_u8(table_r0, vget_high_u8(expand_mask))); + pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)), + vtbl2_u8(table_r1, vget_high_u8(expand_mask))); +#endif + /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0); + /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */ + samples_u16 = vpadalq_u8(samples_u16, pixels_r1); + /* Divide total by 4, narrow to 8-bit, and store. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2); + vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8); + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdcolext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolext-neon.c new file mode 100644 index 0000000000..c3c07a1964 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolext-neon.c @@ -0,0 +1,374 @@ +/* + * jdcolext-neon.c - colorspace conversion (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander.
All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdcolor-neon.c. */ + + +/* YCbCr -> RGB conversion is defined by the following equations: + * R = Y + 1.40200 * (Cr - 128) + * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) + * B = Y + 1.77200 * (Cb - 128) + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.3441467 = 11277 * 2^-15 + * 0.7141418 = 23401 * 2^-15 + * 1.4020386 = 22971 * 2^-14 + * 1.7720337 = 29033 * 2^-14 + * These constants are defined in jdcolor-neon.c. + * + * To ensure correct results, rounding is used when descaling. + */ + +/* Notes on safe memory access for YCbCr -> RGB conversion routines: + * + * Input memory buffers can be safely overread up to the next multiple of + * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in + * jmemmgr.c. + * + * The output buffer cannot safely be written beyond output_width, since + * output_buf points to a possibly unpadded row in the decompressed image + * buffer allocated by the calling program. 
+ */ + +/* Convert num_rows rows of planar Y, Cb and Cr samples to interleaved pixels. For each row the function reads input_buf[0][input_row], input_buf[1][input_row] and input_buf[2][input_row], advances input_row, and writes output_width pixels through the next row pointer taken from output_buf. The output layout is chosen at compile time: RGB_PIXELSIZE == 4 stores R, G, B plus a 0xFF byte at index RGB_ALPHA, RGB_PIXELSIZE == 3 stores R, G, B, and any other value stores packed 5:6:5 pixels. Each row is processed as blocks of 16 pixels, then at most one block of 8, then a tail of 1-7 pixels stored lane by lane. */ void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + JSAMPROW outptr; + /* Pointers to Y, Cb, and Cr data */ + JSAMPROW inptr0, inptr1, inptr2; + + const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts); + const int16x8_t neg_128 = vdupq_n_s16(-128); + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + int cols_remaining = output_width; + /* Main loop: 16 pixels per iteration, split into low and high halves of 8 for the 16-bit arithmetic. */ for (; cols_remaining >= 16; cols_remaining -= 16) { + uint8x16_t y = vld1q_u8(inptr0); + uint8x16_t cb = vld1q_u8(inptr1); + uint8x16_t cr = vld1q_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + /* vaddw_u8 zero-extends the 8-bit samples and adds them to -128 viewed as uint16; reinterpreting the wrapped sum as int16 yields sample - 128. */ int16x8_t cr_128_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), + vget_low_u8(cr))); + int16x8_t cr_128_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), + vget_high_u8(cr))); + int16x8_t cb_128_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), + vget_low_u8(cb))); + int16x8_t cb_128_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), + vget_high_u8(cb))); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0); + int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l), + consts, 0); + int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0); + int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h), + consts, 0); + g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l), + consts, 1); + g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l), + consts, 1); + g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h), + consts, 1); + g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h), + consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit.
*/ + int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15), + vrshrn_n_s32(g_sub_y_lh, 15)); + int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15), + vrshrn_n_s32(g_sub_y_hh, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + /* vqrdmulh yields round(a * b / 2^15); the operand is doubled first because the constants in lanes 2 and 3 are scaled by 2^14 rather than 2^15. */ int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1), + consts, 2); + int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1), + consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1), + consts, 3); + int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1), + consts, 3); + /* Add Y. */ + int16x8_t r_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l), + vget_low_u8(y))); + int16x8_t r_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h), + vget_high_u8(y))); + int16x8_t b_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l), + vget_low_u8(y))); + int16x8_t b_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h), + vget_high_u8(y))); + int16x8_t g_l = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l), + vget_low_u8(y))); + int16x8_t g_h = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h), + vget_high_u8(y))); + +#if RGB_PIXELSIZE == 4 + uint8x16x4_t rgba; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h)); + rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h)); + rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h)); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + vst4q_u8(outptr, rgba); +#elif RGB_PIXELSIZE == 3 + uint8x16x3_t rgb; + /* Convert each component to unsigned and narrow, clamping to [0-255].
*/ + rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h)); + rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h)); + rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h)); + /* Store RGB pixel data to memory. */ + vst3q_u8(outptr, rgb); +#else + /* Pack R, G, and B values in ratio 5:6:5. */ + /* vqshluq_n_s16(x, 8) saturates x << 8 to the unsigned 16-bit range, leaving the clamped 8-bit value in the high byte. vsriq_n_u16 then keeps the top 5 (and afterwards the top 11) bits of the accumulator and inserts the right-shifted G (and afterwards B) below them. */ uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8); + rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5); + rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11); + uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8); + rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5); + rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11); + /* Store RGB pixel data to memory. */ + vst1q_u16((uint16_t *)outptr, rgb565_l); + vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h); +#endif + + /* Increment pointers. */ + inptr0 += 16; + inptr1 += 16; + inptr2 += 16; + outptr += (RGB_PIXELSIZE * 16); + } + + /* Fewer than 16 columns remain: convert one full block of 8 pixels if there are that many. */ if (cols_remaining >= 8) { + uint8x8_t y = vld1_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); + int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); + g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); + g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit.
*/ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), + consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), + consts, 3); + /* Add Y. */ + int16x8_t r = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y)); + int16x8_t b = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y)); + int16x8_t g = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y)); + +#if RGB_PIXELSIZE == 4 + uint8x8x4_t rgba; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgba.val[RGB_RED] = vqmovun_s16(r); + rgba.val[RGB_GREEN] = vqmovun_s16(g); + rgba.val[RGB_BLUE] = vqmovun_s16(b); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + vst4_u8(outptr, rgba); +#elif RGB_PIXELSIZE == 3 + uint8x8x3_t rgb; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgb.val[RGB_RED] = vqmovun_s16(r); + rgb.val[RGB_GREEN] = vqmovun_s16(g); + rgb.val[RGB_BLUE] = vqmovun_s16(b); + /* Store RGB pixel data to memory. */ + vst3_u8(outptr, rgb); +#else + /* Pack R, G, and B values in ratio 5:6:5. */ + uint16x8_t rgb565 = vqshluq_n_s16(r, 8); + rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5); + rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11); + /* Store RGB pixel data to memory. */ + vst1q_u16((uint16_t *)outptr, rgb565); +#endif + + /* Increment pointers. */ + inptr0 += 8; + inptr1 += 8; + inptr2 += 8; + outptr += (RGB_PIXELSIZE * 8); + cols_remaining -= 8; + } + + /* Handle the tail elements. */ + /* 1-7 columns remain. Eight samples are still loaded from each input row, but only cols_remaining pixels are stored, one lane at a time, so nothing is written past output_width. */ if (cols_remaining > 0) { + uint8x8_t y = vld1_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr.
*/ + int16x8_t cr_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); + int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); + g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); + g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), + consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), + consts, 3); + /* Add Y. */ + int16x8_t r = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y)); + int16x8_t b = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y)); + int16x8_t g = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y)); + +#if RGB_PIXELSIZE == 4 + uint8x8x4_t rgba; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgba.val[RGB_RED] = vqmovun_s16(r); + rgba.val[RGB_GREEN] = vqmovun_s16(g); + rgba.val[RGB_BLUE] = vqmovun_s16(b); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory.
*/ + /* The cases fall through, so entering at case k stores lanes k-1 down to 0, i.e. exactly cols_remaining pixels. */ switch (cols_remaining) { + case 7: + vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 6: + vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 5: + vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 4: + vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 3: + vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 2: + vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 1: + vst4_lane_u8(outptr, rgba, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } +#elif RGB_PIXELSIZE == 3 + uint8x8x3_t rgb; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgb.val[RGB_RED] = vqmovun_s16(r); + rgb.val[RGB_GREEN] = vqmovun_s16(g); + rgb.val[RGB_BLUE] = vqmovun_s16(b); + /* Store RGB pixel data to memory. */ + switch (cols_remaining) { + case 7: + vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 6: + vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 5: + vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 4: + vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 3: + vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 2: + vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 1: + vst3_lane_u8(outptr, rgb, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } +#else + /* Pack R, G, and B values in ratio 5:6:5. */ + uint16x8_t rgb565 = vqshluq_n_s16(r, 8); + rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5); + rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11); + /* Store RGB565 pixel data to memory.
*/ + switch (cols_remaining) { + case 7: + vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 6: + vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 5: + vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 4: + vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 3: + vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 2: + vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 1: + vst1q_lane_u16((uint16_t *)outptr, rgb565, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } +#endif + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c new file mode 100644 index 0000000000..ea4668f1d3 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c @@ -0,0 +1,142 @@ +/* + * jdcolor-neon.c - colorspace conversion (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3.
This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "jconfigint.h" +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" + +#include <arm_neon.h> + + +/* YCbCr -> RGB conversion constants */ + +#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */ +#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */ +#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */ +#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */ + +ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = { + -F_0_344, F_0_714, F_1_402, F_1_772 +}; + + +/* Include inline routines for colorspace extensions. */ + +/* jdcolext-neon.c is included once per output pixel format. The first inclusion uses the RGB_* definitions already in scope; before each later inclusion the RGB_* macros are redefined to select the component offsets and pixel size, and jsimd_ycc_rgb_convert_neon is redefined so that every inclusion emits a distinctly named function. */ #include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +/* YCbCr -> RGB565 Conversion */ + +#define RGB_PIXELSIZE 2 +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon +#include "jdcolext-neon.c" +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c new file mode 100644 index 0000000000..e4f91fdc0e --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c @@ -0,0 +1,145 @@ +/* + * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software.
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "jconfigint.h" +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" + +#include <arm_neon.h> + + +/* YCbCr -> RGB conversion constants */ + +#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */ +#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */ +#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */ +#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */ + +ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = { + -F_0_344, F_0_714, F_1_402, F_1_772 +}; + + +/* Include inline routines for colorspace extensions.
*/ + +/* jdmrgext-neon.c is included once per output pixel format. The first inclusion uses the RGB_* definitions already in scope; before each later inclusion the RGB_* macros are redefined to select the component offsets and pixel size, and both merged-upsample function names are redefined so that every inclusion emits distinctly named functions. */ #include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef
RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdmrgext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdmrgext-neon.c new file mode 100644 index 0000000000..5b89bdb339 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdmrgext-neon.c @@ -0,0 +1,723 @@ +/* + * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1.
The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdmerge-neon.c. */ + + +/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2 + * chroma upsampling and YCbCr -> RGB color conversion into a single function. + * + * As with the standalone functions, YCbCr -> RGB conversion is defined by the + * following equations: + * R = Y + 1.40200 * (Cr - 128) + * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) + * B = Y + 1.77200 * (Cb - 128) + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.3441467 = 11277 * 2^-15 + * 0.7141418 = 23401 * 2^-15 + * 1.4020386 = 22971 * 2^-14 + * 1.7720337 = 29033 * 2^-14 + * These constants are defined in jdmerge-neon.c. + * + * To ensure correct results, rounding is used when descaling. + */ + +/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion + * routines: + * + * Input memory buffers can be safely overread up to the next multiple of + * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in + * jmemmgr.c. + * + * The output buffer cannot safely be written beyond output_width, since + * output_buf points to a possibly unpadded row in the decompressed image + * buffer allocated by the calling program. + */ + +/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 
+ */ + +/* Convert one row. Y is read from input_buf[0][in_row_group_ctr], Cb from input_buf[1][in_row_group_ctr] and Cr from input_buf[2][in_row_group_ctr]; output_width pixels are written to output_buf[0]. Each Cb/Cr sample is applied to two horizontally adjacent Y samples. When RGB_ALPHA is defined, four bytes are stored per pixel with 0xFF at index RGB_ALPHA; otherwise three bytes are stored per pixel. */ void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW outptr; + /* Pointers to Y, Cb, and Cr data */ + JSAMPROW inptr0, inptr1, inptr2; + + const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts); + const int16x8_t neg_128 = vdupq_n_s16(-128); + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + + int cols_remaining = output_width; + /* Main loop: each iteration consumes 16 Y samples and 8 Cb/Cr samples and stores 16 pixels. */ for (; cols_remaining >= 16; cols_remaining -= 16) { + /* De-interleave Y component values into two separate vectors, one + * containing the component values with even-numbered indices and one + * containing the component values with odd-numbered indices. + */ + uint8x8x2_t y = vld2_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); + int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); + g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); + g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit.
*/ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + /* vqrdmulh yields round(a * b / 2^15); the operand is doubled first because the constants in lanes 2 and 3 are scaled by 2^14 rather than 2^15. */ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3); + /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and + * "odd" Y component values. This effectively upsamples the chroma + * components horizontally. + */ + int16x8_t g_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y.val[0])); + int16x8_t r_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y.val[0])); + int16x8_t b_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y.val[0])); + int16x8_t g_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y.val[1])); + int16x8_t r_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y.val[1])); + int16x8_t b_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y.val[1])); + /* Convert each component to unsigned and narrow, clamping to [0-255]. + * Re-interleave the "even" and "odd" component values. + */ + uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd)); + uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd)); + uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd)); + +#ifdef RGB_ALPHA + uint8x16x4_t rgba; + rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]); + rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]); + rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF); + /* Store RGBA pixel data to memory.
*/ + vst4q_u8(outptr, rgba); +#else + uint8x16x3_t rgb; + rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]); + rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]); + rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]); + /* Store RGB pixel data to memory. */ + vst3q_u8(outptr, rgb); +#endif + + /* Increment pointers. */ + inptr0 += 16; + inptr1 += 8; + inptr2 += 8; + outptr += (RGB_PIXELSIZE * 16); + } + + /* Tail: 1-15 columns remain. Sixteen Y samples and eight Cb/Cr samples are still loaded, but only cols_remaining pixels are stored. */ if (cols_remaining > 0) { + /* De-interleave Y component values into two separate vectors, one + * containing the component values with even-numbered indices and one + * containing the component values with odd-numbered indices. + */ + uint8x8x2_t y = vld2_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); + int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); + g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); + g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3); + /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and + * "odd" Y component values. This effectively upsamples the chroma + * components horizontally.
+ */ + int16x8_t g_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y.val[0])); + int16x8_t r_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y.val[0])); + int16x8_t b_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y.val[0])); + int16x8_t g_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y.val[1])); + int16x8_t r_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y.val[1])); + int16x8_t b_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y.val[1])); + /* Convert each component to unsigned and narrow, clamping to [0-255]. + * Re-interleave the "even" and "odd" component values. + */ + uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd)); + uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd)); + uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd)); + +#ifdef RGB_ALPHA + uint8x8x4_t rgba_h; + rgba_h.val[RGB_RED] = r.val[1]; + rgba_h.val[RGB_GREEN] = g.val[1]; + rgba_h.val[RGB_BLUE] = b.val[1]; + /* Set alpha channel to opaque (0xFF). */ + rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF); + uint8x8x4_t rgba_l; + rgba_l.val[RGB_RED] = r.val[0]; + rgba_l.val[RGB_GREEN] = g.val[0]; + rgba_l.val[RGB_BLUE] = b.val[0]; + /* Set alpha channel to opaque (0xFF). */ + rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory.
*/ + /* Cases 15 down to 9 store single lanes of the high half and fall through to case 8, which stores the whole low half and then breaks. Cases 7 down to 1 store single lanes of the low half only. */ switch (cols_remaining) { + case 15: + vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 14: + vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 13: + vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 12: + vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 11: + vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 10: + vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 9: + vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0); + FALLTHROUGH /*FALLTHROUGH*/ + case 8: + vst4_u8(outptr, rgba_l); + break; + case 7: + vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 6: + vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 5: + vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 4: + vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 3: + vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 2: + vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 1: + vst4_lane_u8(outptr, rgba_l, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } +#else + uint8x8x3_t rgb_h; + rgb_h.val[RGB_RED] = r.val[1]; + rgb_h.val[RGB_GREEN] = g.val[1]; + rgb_h.val[RGB_BLUE] = b.val[1]; + uint8x8x3_t rgb_l; + rgb_l.val[RGB_RED] = r.val[0]; + rgb_l.val[RGB_GREEN] = g.val[0]; + rgb_l.val[RGB_BLUE] = b.val[0]; + /* Store RGB pixel data to memory.
*/ + switch (cols_remaining) { + case 15: + vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 14: + vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 13: + vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 12: + vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 11: + vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 10: + vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 9: + vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0); + FALLTHROUGH /*FALLTHROUGH*/ + case 8: + vst3_u8(outptr, rgb_l); + break; + case 7: + vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 6: + vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 5: + vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 4: + vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 3: + vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 2: + vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 1: + vst3_lane_u8(outptr, rgb_l, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } +#endif + } +} + + +/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. + * + * See comments above for details regarding color conversion and safe memory + * access.
+ */ + +void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW outptr0, outptr1; + /* Pointers to Y (both rows), Cb, and Cr data. inptr0_0 and inptr0_1 are luma rows 2 * in_row_group_ctr and 2 * in_row_group_ctr + 1; inptr1 (Cb) and inptr2 (Cr) are the single chroma row in_row_group_ctr that both luma rows share. */ + JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2; + + const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts); + const int16x8_t neg_128 = vdupq_n_s16(-128); + + inptr0_0 = input_buf[0][in_row_group_ctr * 2]; + inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr0 = output_buf[0]; + outptr1 = output_buf[1]; + + int cols_remaining = output_width; + for (; cols_remaining >= 16; cols_remaining -= 16) { + /* For each row, de-interleave Y component values into two separate + * vectors, one containing the component values with even-numbered indices + * and one containing the component values with odd-numbered indices. + */ + uint8x8x2_t y0 = vld2_u8(inptr0_0); + uint8x8x2_t y1 = vld2_u8(inptr0_1); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); + int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); + g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); + g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit.
*/ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3); + /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both + * the "even" and "odd" Y component values. This effectively upsamples the + * chroma components both horizontally and vertically. + */ + int16x8_t g0_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y0.val[0])); + int16x8_t r0_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y0.val[0])); + int16x8_t b0_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y0.val[0])); + int16x8_t g0_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y0.val[1])); + int16x8_t r0_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y0.val[1])); + int16x8_t b0_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y0.val[1])); + int16x8_t g1_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y1.val[0])); + int16x8_t r1_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y1.val[0])); + int16x8_t b1_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y1.val[0])); + int16x8_t g1_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y1.val[1])); + int16x8_t r1_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y1.val[1])); + int16x8_t b1_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y1.val[1])); + /* Convert each component to unsigned and narrow, clamping to [0-255]. + * Re-interleave the "even" and "odd" component values.
+ */ + uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); + uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); + uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); + uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); + uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); + uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); + +#ifdef RGB_ALPHA + uint8x16x4_t rgba0, rgba1; + rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]); + rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]); + rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]); + rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]); + rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]); + rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]); + /* Set alpha channel to opaque (0xFF). */ + rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF); + rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + vst4q_u8(outptr0, rgba0); + vst4q_u8(outptr1, rgba1); +#else + uint8x16x3_t rgb0, rgb1; + rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]); + rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]); + rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]); + rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]); + rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]); + rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]); + /* Store RGB pixel data to memory. */ + vst3q_u8(outptr0, rgb0); + vst3q_u8(outptr1, rgb1); +#endif + + /* Increment pointers. */ + inptr0_0 += 16; + inptr0_1 += 16; + inptr1 += 8; + inptr2 += 8; + outptr0 += (RGB_PIXELSIZE * 16); + outptr1 += (RGB_PIXELSIZE * 16); + } + + if (cols_remaining > 0) { + /* Tail case (1-15 columns left by the loop above): full input vectors are still loaded and converted, and only cols_remaining pixels per row are stored by the switch statement below. For each row, de-interleave Y component values into two separate + * vectors, one containing the component values with even-numbered indices + * and one containing the component values with odd-numbered indices.
+ */ + uint8x8x2_t y0 = vld2_u8(inptr0_0); + uint8x8x2_t y1 = vld2_u8(inptr0_1); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0); + int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0); + g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1); + g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1); + /* Descale G components: shift right 15, round, and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3); + /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both + * the "even" and "odd" Y component values. This effectively upsamples the + * chroma components both horizontally and vertically.
+ */ + int16x8_t g0_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y0.val[0])); + int16x8_t r0_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y0.val[0])); + int16x8_t b0_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y0.val[0])); + int16x8_t g0_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y0.val[1])); + int16x8_t r0_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y0.val[1])); + int16x8_t b0_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y0.val[1])); + int16x8_t g1_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y1.val[0])); + int16x8_t r1_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y1.val[0])); + int16x8_t b1_even = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y1.val[0])); + int16x8_t g1_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), + y1.val[1])); + int16x8_t r1_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), + y1.val[1])); + int16x8_t b1_odd = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), + y1.val[1])); + /* Convert each component to unsigned and narrow, clamping to [0-255]. + * Re-interleave the "even" and "odd" component values.
+ */ + uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); + uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); + uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); + uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); + uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); + uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); + +#ifdef RGB_ALPHA + uint8x8x4_t rgba0_h, rgba1_h; + rgba0_h.val[RGB_RED] = r0.val[1]; + rgba1_h.val[RGB_RED] = r1.val[1]; + rgba0_h.val[RGB_GREEN] = g0.val[1]; + rgba1_h.val[RGB_GREEN] = g1.val[1]; + rgba0_h.val[RGB_BLUE] = b0.val[1]; + rgba1_h.val[RGB_BLUE] = b1.val[1]; + /* Set alpha channel to opaque (0xFF). */ + rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF); + rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF); + + uint8x8x4_t rgba0_l, rgba1_l; + rgba0_l.val[RGB_RED] = r0.val[0]; + rgba1_l.val[RGB_RED] = r1.val[0]; + rgba0_l.val[RGB_GREEN] = g0.val[0]; + rgba1_l.val[RGB_GREEN] = g1.val[0]; + rgba0_l.val[RGB_BLUE] = b0.val[0]; + rgba1_l.val[RGB_BLUE] = b1.val[0]; + /* Set alpha channel to opaque (0xFF). */ + rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF); + rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory. Cases 15 through 9 each store one pixel per row from the _h vectors and fall through to case 8, which stores the eight _l pixels of each row at once; cases 7 through 1 store single pixels from the _l vectors only.
*/ + switch (cols_remaining) { + case 15: + vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6); + vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 14: + vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5); + vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 13: + vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4); + vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 12: + vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3); + vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 11: + vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2); + vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 10: + vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1); + vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 9: + vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0); + vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0); + FALLTHROUGH /*FALLTHROUGH*/ + case 8: + vst4_u8(outptr0, rgba0_l); + vst4_u8(outptr1, rgba1_l); + break; + case 7: + vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6); + vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 6: + vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5); + vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 5: + vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4); + vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 4: + vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3); + vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 3: + vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2); + vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 2: + 
vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1); + vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 1: + vst4_lane_u8(outptr0, rgba0_l, 0); + vst4_lane_u8(outptr1, rgba1_l, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } +#else + uint8x8x3_t rgb0_h, rgb1_h; + rgb0_h.val[RGB_RED] = r0.val[1]; + rgb1_h.val[RGB_RED] = r1.val[1]; + rgb0_h.val[RGB_GREEN] = g0.val[1]; + rgb1_h.val[RGB_GREEN] = g1.val[1]; + rgb0_h.val[RGB_BLUE] = b0.val[1]; + rgb1_h.val[RGB_BLUE] = b1.val[1]; + + uint8x8x3_t rgb0_l, rgb1_l; + rgb0_l.val[RGB_RED] = r0.val[0]; + rgb1_l.val[RGB_RED] = r1.val[0]; + rgb0_l.val[RGB_GREEN] = g0.val[0]; + rgb1_l.val[RGB_GREEN] = g1.val[0]; + rgb0_l.val[RGB_BLUE] = b0.val[0]; + rgb1_l.val[RGB_BLUE] = b1.val[0]; + /* Store RGB pixel data to memory. */ + switch (cols_remaining) { + case 15: + vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6); + vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 14: + vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5); + vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 13: + vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4); + vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 12: + vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3); + vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 11: + vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2); + vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 10: + vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1); + vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 9: + vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0); + vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0); + FALLTHROUGH /*FALLTHROUGH*/ + case 8: + vst3_u8(outptr0, rgb0_l); + vst3_u8(outptr1, rgb1_l); + 
break; + case 7: + vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6); + vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6); + FALLTHROUGH /*FALLTHROUGH*/ + case 6: + vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5); + vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5); + FALLTHROUGH /*FALLTHROUGH*/ + case 5: + vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4); + vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4); + FALLTHROUGH /*FALLTHROUGH*/ + case 4: + vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3); + vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3); + FALLTHROUGH /*FALLTHROUGH*/ + case 3: + vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2); + vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2); + FALLTHROUGH /*FALLTHROUGH*/ + case 2: + vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1); + vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1); + FALLTHROUGH /*FALLTHROUGH*/ + case 1: + vst3_lane_u8(outptr0, rgb0_l, 0); + vst3_lane_u8(outptr1, rgb1_l, 0); + FALLTHROUGH /*FALLTHROUGH*/ + default: + break; + } +#endif + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdsample-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdsample-neon.c new file mode 100644 index 0000000000..90ec6782c4 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdsample-neon.c @@ -0,0 +1,569 @@ +/* + * jdsample-neon.c - upsampling (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1.
The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include <arm_neon.h> + + +/* The diagram below shows a row of samples produced by h2v1 downsampling. + * + * s0 s1 s2 + * +---------+---------+---------+ + * | | | | + * | p0 p1 | p2 p3 | p4 p5 | + * | | | | + * +---------+---------+---------+ + * + * Samples s0-s2 were created by averaging the original pixel component values + * centered at positions p0-p5 above. To approximate those original pixel + * component values, we proportionally blend the adjacent samples in each row. + * + * An upsampled pixel component value is computed by blending the sample + * containing the pixel center with the nearest neighboring sample, in the + * ratio 3:1. For example: + * p1(upsampled) = 3/4 * s0 + 1/4 * s1 + * p2(upsampled) = 3/4 * s1 + 1/4 * s0 + * When computing the first and last pixel component values in the row, there + * is no adjacent sample to blend, so: + * p0(upsampled) = s0 + * p5(upsampled) = s2 One output row of 2 * downsampled_width values is produced for each of the max_v_samp_factor input rows. + */ + +void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow; + unsigned colctr; + /* Set up constants.
*/ + const uint16x8_t one_u16 = vdupq_n_u16(1); + const uint8x8_t three_u8 = vdup_n_u8(3); + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + /* First pixel component value in this row of the original image */ + *outptr = (JSAMPLE)GETJSAMPLE(*inptr); + + /* 3/4 * containing sample + 1/4 * nearest neighboring sample + * For p1: containing sample = s0, nearest neighboring sample = s1 + * For p2: containing sample = s1, nearest neighboring sample = s0 + */ + uint8x16_t s0 = vld1q_u8(inptr); + uint8x16_t s1 = vld1q_u8(inptr + 1); + /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes + * denote low half and high half respectively. + */ + uint16x8_t s1_add_3s0_l = + vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8); + uint16x8_t s1_add_3s0_h = + vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8); + uint16x8_t s0_add_3s1_l = + vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8); + uint16x8_t s0_add_3s1_h = + vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8); + /* Add ordered dithering bias to odd pixel values. */ + s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16); + s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16); + + /* The offset is initially 1, because the first pixel component has already + * been stored. However, in subsequent iterations of the SIMD loop, this + * offset is (2 * colctr - 1) to stay within the bounds of the sample + * buffers without having to resort to a slow scalar tail case for the last + * (downsampled_width % 16) samples. See "Creation of 2-D sample arrays" + * in jmemmgr.c for more details. + */ + unsigned outptr_offset = 1; + uint8x16x2_t output_pixels; + + /* We use software pipelining to maximise performance. The code indented + * an extra two spaces begins the next iteration of the loop.
+ */ + for (colctr = 16; colctr < downsampled_width; colctr += 16) { + + s0 = vld1q_u8(inptr + colctr - 1); + s1 = vld1q_u8(inptr + colctr); + + /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */ + output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2), + vrshrn_n_u16(s1_add_3s0_h, 2)); + output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2), + vshrn_n_u16(s0_add_3s1_h, 2)); + + /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes + * denote low half and high half respectively. + */ + s1_add_3s0_l = + vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8); + s1_add_3s0_h = + vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8); + s0_add_3s1_l = + vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8); + s0_add_3s1_h = + vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8); + /* Add ordered dithering bias to odd pixel values. */ + s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16); + s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16); + + /* Store pixel component values to memory. */ + vst2q_u8(outptr + outptr_offset, output_pixels); + outptr_offset = 2 * colctr - 1; + } + + /* Complete the last iteration of the loop: the sums computed most recently (before the loop if it never ran) have not been narrowed or stored yet. */ + + /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */ + output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2), + vrshrn_n_u16(s1_add_3s0_h, 2)); + output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2), + vshrn_n_u16(s0_add_3s1_h, 2)); + /* Store pixel component values to memory. */ + vst2q_u8(outptr + outptr_offset, output_pixels); + + /* Last pixel component value in this row of the original image */ + outptr[2 * downsampled_width - 1] = + GETJSAMPLE(inptr[downsampled_width - 1]); + } +} + + +/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ * + * s0 s1 s2 + * +---------+---------+---------+ + * | p0 p1 | p2 p3 | p4 p5 | + * sA | | | | + * | p6 p7 | p8 p9 | p10 p11| + * +---------+---------+---------+ + * | p12 p13| p14 p15| p16 p17| + * sB | | | | + * | p18 p19| p20 p21| p22 p23| + * +---------+---------+---------+ + * | p24 p25| p26 p27| p28 p29| + * sC | | | | + * | p30 p31| p32 p33| p34 p35| + * +---------+---------+---------+ + * + * Samples s0A-s2C were created by averaging the original pixel component + * values centered at positions p0-p35 above. To approximate one of those + * original pixel component values, we proportionally blend the sample + * containing the pixel center with the nearest neighboring samples in each + * row, column, and diagonal. + * + * An upsampled pixel component value is computed by first blending the sample + * containing the pixel center with the nearest neighboring samples in the + * same column, in the ratio 3:1, and then blending each column sum with the + * nearest neighboring column sum, in the ratio 3:1. 
For example: + * p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) + + * 1/4 * (3/4 * s0B + 1/4 * s0A) + * = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A + * When computing the first and last pixel component values in the row, there + * is no horizontally adjacent sample to blend, so: + * p12(upsampled) = 3/4 * s0B + 1/4 * s0A + * p23(upsampled) = 3/4 * s2B + 1/4 * s2C + * When computing the first and last pixel component values in the column, + * there is no vertically adjacent sample to blend, so: + * p2(upsampled) = 3/4 * s1A + 1/4 * s0A + * p33(upsampled) = 3/4 * s1C + 1/4 * s2C + * When computing the corner pixel component values, there is no adjacent + * sample to blend, so: + * p0(upsampled) = s0A + * p35(upsampled) = s2C + */ + +void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1; + int inrow, outrow; + unsigned colctr; + /* Set up constants. */ + const uint16x8_t seven_u16 = vdupq_n_u16(7); + const uint8x8_t three_u8 = vdup_n_u8(3); + const uint16x8_t three_u16 = vdupq_n_u16(3); + + inrow = outrow = 0; + while (outrow < max_v_samp_factor) { + inptr0 = input_data[inrow - 1]; + inptr1 = input_data[inrow]; + inptr2 = input_data[inrow + 1]; + /* Suffixes 0 and 1 denote the upper and lower rows of output pixels, + * respectively. + */ + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + /* First pixel component value in this row of the original image. There is no sample to the left, so only the vertical 3:1 blend is applied; the sum is multiplied by 4 so that it can use the same right shift by 4 as the other pixels. */ + int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0); + *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4); + int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2); + *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4); + + /* Step 1: Blend samples vertically in columns s0 and s1.
+ * Leave the divide by 4 until the end, when it can be done for both + * dimensions at once, right-shifting by 4. + */ + + /* Load and compute s0colsum0 and s0colsum1. */ + uint8x16_t s0A = vld1q_u8(inptr0); + uint8x16_t s0B = vld1q_u8(inptr1); + uint8x16_t s0C = vld1q_u8(inptr2); + /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes + * denote low half and high half respectively. + */ + uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), + vget_low_u8(s0B), three_u8); + uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), + vget_high_u8(s0B), three_u8); + uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), + vget_low_u8(s0B), three_u8); + uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), + vget_high_u8(s0B), three_u8); + /* Load and compute s1colsum0 and s1colsum1. */ + uint8x16_t s1A = vld1q_u8(inptr0 + 1); + uint8x16_t s1B = vld1q_u8(inptr1 + 1); + uint8x16_t s1C = vld1q_u8(inptr2 + 1); + uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), + vget_low_u8(s1B), three_u8); + uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), + vget_high_u8(s1B), three_u8); + uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), + vget_low_u8(s1B), three_u8); + uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), + vget_high_u8(s1B), three_u8); + + /* Step 2: Blend the already-blended columns.
*/ + + uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16); + uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16); + uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16); + uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16); + uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16); + uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16); + uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16); + uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16); + /* Add ordered dithering bias to odd pixel values. */ + output0_p1_l = vaddq_u16(output0_p1_l, seven_u16); + output0_p1_h = vaddq_u16(output0_p1_h, seven_u16); + output1_p1_l = vaddq_u16(output1_p1_l, seven_u16); + output1_p1_h = vaddq_u16(output1_p1_h, seven_u16); + /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */ + uint8x16x2_t output_pixels0 = { { + vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)), + vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4)) + } }; + uint8x16x2_t output_pixels1 = { { + vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)), + vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4)) + } }; + + /* Store pixel component values to memory. + * The minimum size of the output buffer for each row is 64 bytes => no + * need to worry about buffer overflow here. See "Creation of 2-D sample + * arrays" in jmemmgr.c for more details. + */ + vst2q_u8(outptr0 + 1, output_pixels0); + vst2q_u8(outptr1 + 1, output_pixels1); + + /* The first pixel of the image shifted our loads and stores by one byte.
+ * We have to re-align on a 32-byte boundary at some point before the end + * of the row (we do it now on the 32/33 pixel boundary) to stay within the + * bounds of the sample buffers without having to resort to a slow scalar + * tail case for the last (downsampled_width % 16) samples. See "Creation + * of 2-D sample arrays" in jmemmgr.c for more details. + */ + for (colctr = 16; colctr < downsampled_width; colctr += 16) { + /* Step 1: Blend samples vertically in columns s0 and s1. */ + + /* Load and compute s0colsum0 and s0colsum1. */ + s0A = vld1q_u8(inptr0 + colctr - 1); + s0B = vld1q_u8(inptr1 + colctr - 1); + s0C = vld1q_u8(inptr2 + colctr - 1); + s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B), + three_u8); + s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B), + three_u8); + s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B), + three_u8); + s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B), + three_u8); + /* Load and compute s1colsum0 and s1colsum1. */ + s1A = vld1q_u8(inptr0 + colctr); + s1B = vld1q_u8(inptr1 + colctr); + s1C = vld1q_u8(inptr2 + colctr); + s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B), + three_u8); + s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B), + three_u8); + s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B), + three_u8); + s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B), + three_u8); + + /* Step 2: Blend the already-blended columns.
*/ + + output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16); + output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16); + output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16); + output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16); + output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16); + output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16); + output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16); + output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16); + /* Add ordered dithering bias to odd pixel values. */ + output0_p1_l = vaddq_u16(output0_p1_l, seven_u16); + output0_p1_h = vaddq_u16(output0_p1_h, seven_u16); + output1_p1_l = vaddq_u16(output1_p1_l, seven_u16); + output1_p1_h = vaddq_u16(output1_p1_h, seven_u16); + /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */ + output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4), + vshrn_n_u16(output0_p1_h, 4)); + output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), + vrshrn_n_u16(output0_p2_h, 4)); + output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4), + vshrn_n_u16(output1_p1_h, 4)); + output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), + vrshrn_n_u16(output1_p2_h, 4)); + /* Store pixel component values to memory.
*/ + vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0); + vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1); + } + + /* Last pixel component value in this row of the original image. The bias of 7 matches the ordered dithering bias added to the other odd-indexed outputs. */ + int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 + + GETJSAMPLE(inptr0[downsampled_width - 1]); + outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4); + int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 + + GETJSAMPLE(inptr2[downsampled_width - 1]); + outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4); + inrow++; + } +} + + +/* The diagram below shows a column of samples produced by h1v2 downsampling + * (or by losslessly rotating or transposing an h2v1-downsampled image.) + * + * +---------+ + * | p0 | + * sA | | + * | p1 | + * +---------+ + * | p2 | + * sB | | + * | p3 | + * +---------+ + * | p4 | + * sC | | + * | p5 | + * +---------+ + * + * Samples sA-sC were created by averaging the original pixel component values + * centered at positions p0-p5 above. To approximate those original pixel + * component values, we proportionally blend the adjacent samples in each + * column. + * + * An upsampled pixel component value is computed by blending the sample + * containing the pixel center with the nearest neighboring sample, in the + * ratio 3:1. For example: + * p1(upsampled) = 3/4 * sA + 1/4 * sB + * p2(upsampled) = 3/4 * sB + 1/4 * sA + * When computing the first and last pixel component values in the column, + * there is no adjacent sample to blend, so: + * p0(upsampled) = sA + * p5(upsampled) = sC + */ + +void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1; + int inrow, outrow; + unsigned colctr; + /* Set up constants.
*/ + const uint16x8_t one_u16 = vdupq_n_u16(1); + const uint8x8_t three_u8 = vdup_n_u8(3); + + inrow = outrow = 0; + while (outrow < max_v_samp_factor) { + inptr0 = input_data[inrow - 1]; + inptr1 = input_data[inrow]; + inptr2 = input_data[inrow + 1]; + /* Suffixes 0 and 1 denote the upper and lower rows of output pixels, + * respectively. + */ + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + inrow++; + + /* The size of the input and output buffers is always a multiple of 32 + * bytes => no need to worry about buffer overflow when reading/writing + * memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more + * details. + */ + for (colctr = 0; colctr < downsampled_width; colctr += 16) { + /* Load samples: sA, sB, and sC come from input rows inrow - 1, inrow, and inrow + 1 as indexed when the row pointers were set above (before inrow++). NOTE(review): for inrow == 0 this reads input_data[-1]; presumably the caller supplies context rows above and below -- confirm. */ + uint8x16_t sA = vld1q_u8(inptr0 + colctr); + uint8x16_t sB = vld1q_u8(inptr1 + colctr); + uint8x16_t sC = vld1q_u8(inptr2 + colctr); + /* Blend samples vertically. */ + uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)), + vget_low_u8(sB), three_u8); + uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)), + vget_high_u8(sB), three_u8); + uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)), + vget_low_u8(sB), three_u8); + uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)), + vget_high_u8(sB), three_u8); + /* Add ordered dithering bias to pixel values in even output rows. */ + colsum0_l = vaddq_u16(colsum0_l, one_u16); + colsum0_h = vaddq_u16(colsum0_h, one_u16); + /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */ + uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2), + vshrn_n_u16(colsum0_h, 2)); + uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2), + vrshrn_n_u16(colsum1_h, 2)); + /* Store pixel component values to memory. */ + vst1q_u8(outptr0 + colctr, output_pixels0); + vst1q_u8(outptr1 + colctr, output_pixels1); + } + } +} + + +/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ * + * s0 s1 + * +---------+---------+ + * | | | + * | p0 p1 | p2 p3 | + * | | | + * +---------+---------+ + * + * Samples s0 and s1 were created by averaging the original pixel component + * values centered at positions p0-p3 above. To approximate those original + * pixel component values, we duplicate the samples horizontally: + * p0(upsampled) = p1(upsampled) = s0 + * p2(upsampled) = p3(upsampled) = s1 + */ + +void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow; + unsigned colctr; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + for (colctr = 0; 2 * colctr < output_width; colctr += 16) { + uint8x16_t samples = vld1q_u8(inptr + colctr); + /* Duplicate the samples. The store operation below interleaves them so + * that adjacent pixel component values take on the same sample value, + * per above. + */ + uint8x16x2_t output_pixels = { { samples, samples } }; + /* Store 32 interleaved pixel component values to memory. + * Due to the way sample buffers are allocated, we don't need to worry + * about tail cases when output_width is not a multiple of 32. See + * "Creation of 2-D sample arrays" in jmemmgr.c for details. + */ + vst2q_u8(outptr + 2 * colctr, output_pixels); + } + } +} + + +/* The diagram below shows an array of samples produced by h2v2 downsampling. + * + * s0 s1 + * +---------+---------+ + * | p0 p1 | p2 p3 | + * sA | | | + * | p4 p5 | p6 p7 | + * +---------+---------+ + * | p8 p9 | p10 p11| + * sB | | | + * | p12 p13| p14 p15| + * +---------+---------+ + * + * Samples s0A-s1B were created by averaging the original pixel component + * values centered at positions p0-p15 above.
To approximate those original + * pixel component values, we duplicate the samples both horizontally and + * vertically: + * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A + * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A + * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B + * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B + */ + +void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr0, outptr1; + int inrow, outrow; + unsigned colctr; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + for (colctr = 0; 2 * colctr < output_width; colctr += 16) { + uint8x16_t samples = vld1q_u8(inptr + colctr); + /* Duplicate the samples. The store operation below interleaves them so + * that adjacent pixel component values take on the same sample value, + * per above. + */ + uint8x16x2_t output_pixels = { { samples, samples } }; + /* Store the same pixel component values to both output rows in memory. + * Due to the way sample buffers are allocated, we don't need to worry + * about tail cases when output_width is not a multiple of 32. See + * "Creation of 2-D sample arrays" in jmemmgr.c for details. + */ + vst2q_u8(outptr0 + 2 * colctr, output_pixels); + vst2q_u8(outptr1 + 2 * colctr, output_pixels); + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jfdctfst-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jfdctfst-neon.c new file mode 100644 index 0000000000..bb371be399 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jfdctfst-neon.c @@ -0,0 +1,214 @@ +/* + * jfdctfst-neon.c - fast integer FDCT (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" + +#include <arm_neon.h> + + +/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT + * (Discrete Cosine Transform) on one block of samples. It uses the same + * calculations and produces exactly the same output as IJG's original + * jpeg_fdct_ifast() function, which can be found in jfdctfst.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.382683433 = 12544 * 2^-15 + * 0.541196100 = 17792 * 2^-15 + * 0.707106781 = 23168 * 2^-15 + * 0.306562965 = 9984 * 2^-15 + * + * See jfdctfst.c for further details of the DCT algorithm. Where possible, + * the variable names and comments here in jsimd_fdct_ifast_neon() match up + * with those in jpeg_fdct_ifast().
+ */ + +#define F_0_382 12544 +#define F_0_541 17792 +#define F_0_707 23168 +#define F_0_306 9984 + + +ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = { + F_0_382, F_0_541, F_0_707, F_0_306 +}; + +void jsimd_fdct_ifast_neon(DCTELEM *data) +{ + /* Load an 8x8 block of samples into Neon registers. De-interleaving loads + * are used, followed by vuzp to transpose the block such that we have a + * column of samples per vector - allowing all rows to be processed at once. + */ + int16x8x4_t data1 = vld4q_s16(data); + int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE); + + int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]); + int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]); + int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]); + int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]); + + int16x8_t col0 = cols_04.val[0]; + int16x8_t col1 = cols_15.val[0]; + int16x8_t col2 = cols_26.val[0]; + int16x8_t col3 = cols_37.val[0]; + int16x8_t col4 = cols_04.val[1]; + int16x8_t col5 = cols_15.val[1]; + int16x8_t col6 = cols_26.val[1]; + int16x8_t col7 = cols_37.val[1]; + + /* Pass 1: process rows. */ + + /* Load DCT conversion constants: F_0_382, F_0_541, F_0_707, F_0_306.
*/ + const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts); + + int16x8_t tmp0 = vaddq_s16(col0, col7); + int16x8_t tmp7 = vsubq_s16(col0, col7); + int16x8_t tmp1 = vaddq_s16(col1, col6); + int16x8_t tmp6 = vsubq_s16(col1, col6); + int16x8_t tmp2 = vaddq_s16(col2, col5); + int16x8_t tmp5 = vsubq_s16(col2, col5); + int16x8_t tmp3 = vaddq_s16(col3, col4); + int16x8_t tmp4 = vsubq_s16(col3, col4); + + /* Even part */ + int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */ + int16x8_t tmp13 = vsubq_s16(tmp0, tmp3); + int16x8_t tmp11 = vaddq_s16(tmp1, tmp2); + int16x8_t tmp12 = vsubq_s16(tmp1, tmp2); + + col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */ + col4 = vsubq_s16(tmp10, tmp11); + + int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2); + col2 = vaddq_s16(tmp13, z1); /* phase 5 */ + col6 = vsubq_s16(tmp13, z1); + + /* Odd part */ + tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */ + tmp11 = vaddq_s16(tmp5, tmp6); + tmp12 = vaddq_s16(tmp6, tmp7); + + int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0); + int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1); + z2 = vaddq_s16(z2, z5); + int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3); + z5 = vaddq_s16(tmp12, z5); + z4 = vaddq_s16(z4, z5); + int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2); + + int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */ + int16x8_t z13 = vsubq_s16(tmp7, z3); + + col5 = vaddq_s16(z13, z2); /* phase 6 */ + col3 = vsubq_s16(z13, z2); + col1 = vaddq_s16(z11, z4); + col7 = vsubq_s16(z11, z4); + + /* Transpose the 8x8 block to work on columns in pass 2.
*/ + int16x8x2_t cols_01 = vtrnq_s16(col0, col1); + int16x8x2_t cols_23 = vtrnq_s16(col2, col3); + int16x8x2_t cols_45 = vtrnq_s16(col4, col5); + int16x8x2_t cols_67 = vtrnq_s16(col6, col7); + + int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]), + vreinterpretq_s32_s16(cols_45.val[0])); + int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]), + vreinterpretq_s32_s16(cols_45.val[1])); + int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]), + vreinterpretq_s32_s16(cols_67.val[0])); + int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]), + vreinterpretq_s32_s16(cols_67.val[1])); + + int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]); + int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]); + int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]); + int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]); + + int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]); + int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]); + int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]); + int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]); + int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]); + int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]); + int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]); + int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]); + + /* Pass 2: process columns, repeating the pass 1 arithmetic on row0-row7.
*/ + + tmp0 = vaddq_s16(row0, row7); + tmp7 = vsubq_s16(row0, row7); + tmp1 = vaddq_s16(row1, row6); + tmp6 = vsubq_s16(row1, row6); + tmp2 = vaddq_s16(row2, row5); + tmp5 = vsubq_s16(row2, row5); + tmp3 = vaddq_s16(row3, row4); + tmp4 = vsubq_s16(row3, row4); + + /* Even part */ + tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */ + tmp13 = vsubq_s16(tmp0, tmp3); + tmp11 = vaddq_s16(tmp1, tmp2); + tmp12 = vsubq_s16(tmp1, tmp2); + + row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */ + row4 = vsubq_s16(tmp10, tmp11); + + z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2); + row2 = vaddq_s16(tmp13, z1); /* phase 5 */ + row6 = vsubq_s16(tmp13, z1); + + /* Odd part */ + tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */ + tmp11 = vaddq_s16(tmp5, tmp6); + tmp12 = vaddq_s16(tmp6, tmp7); + + z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0); + z2 = vqdmulhq_lane_s16(tmp10, consts, 1); + z2 = vaddq_s16(z2, z5); + z4 = vqdmulhq_lane_s16(tmp12, consts, 3); + z5 = vaddq_s16(tmp12, z5); + z4 = vaddq_s16(z4, z5); + z3 = vqdmulhq_lane_s16(tmp11, consts, 2); + + z11 = vaddq_s16(tmp7, z3); /* phase 5 */ + z13 = vsubq_s16(tmp7, z3); + + row5 = vaddq_s16(z13, z2); /* phase 6 */ + row3 = vsubq_s16(z13, z2); + row1 = vaddq_s16(z11, z4); + row7 = vsubq_s16(z11, z4); + + vst1q_s16(data + 0 * DCTSIZE, row0); + vst1q_s16(data + 1 * DCTSIZE, row1); + vst1q_s16(data + 2 * DCTSIZE, row2); + vst1q_s16(data + 3 * DCTSIZE, row3); + vst1q_s16(data + 4 * DCTSIZE, row4); + vst1q_s16(data + 5 * DCTSIZE, row5); + vst1q_s16(data + 6 * DCTSIZE, row6); + vst1q_s16(data + 7 * DCTSIZE, row7); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jfdctint-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jfdctint-neon.c new file mode 100644 index 0000000000..ccfc07b15d --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jfdctint-neon.c @@ -0,0 +1,376 @@ +/* + * jfdctint-neon.c - accurate integer FDCT (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R.
Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" +#include "neon-compat.h" + +#include <arm_neon.h> + + +/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT + * (Discrete Cosine Transform) on one block of samples. It uses the same + * calculations and produces exactly the same output as IJG's original + * jpeg_fdct_islow() function, which can be found in jfdctint.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.298631336 = 2446 * 2^-13 + * 0.390180644 = 3196 * 2^-13 + * 0.541196100 = 4433 * 2^-13 + * 0.765366865 = 6270 * 2^-13 + * 0.899976223 = 7373 * 2^-13 + * 1.175875602 = 9633 * 2^-13 + * 1.501321110 = 12299 * 2^-13 + * 1.847759065 = 15137 * 2^-13 + * 1.961570560 = 16069 * 2^-13 + * 2.053119869 = 16819 * 2^-13 + * 2.562915447 = 20995 * 2^-13 + * 3.072711026 = 25172 * 2^-13 + * + * See jfdctint.c for further details of the DCT algorithm.
Where possible, + * the variable names and comments here in jsimd_fdct_islow_neon() match up + * with those in jpeg_fdct_islow(). + */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +#define F_0_298 2446 +#define F_0_390 3196 +#define F_0_541 4433 +#define F_0_765 6270 +#define F_0_899 7373 +#define F_1_175 9633 +#define F_1_501 12299 +#define F_1_847 15137 +#define F_1_961 16069 +#define F_2_053 16819 +#define F_2_562 20995 +#define F_3_072 25172 + + +ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = { + F_0_298, -F_0_390, F_0_541, F_0_765, + -F_0_899, F_1_175, F_1_501, -F_1_847, + -F_1_961, F_2_053, -F_2_562, F_3_072 +}; + +void jsimd_fdct_islow_neon(DCTELEM *data) +{ + /* Load DCT constants (three vectors of four 16-bit lanes each). */ +#ifdef HAVE_VLD1_S16_X3 + const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts); +#else + /* GCC does not currently support the intrinsic vld1_s16_x3(). */ + const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts); + const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4); + const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8); + const int16x4x3_t consts = { { consts1, consts2, consts3 } }; +#endif + + /* Load an 8x8 block of samples into Neon registers. De-interleaving loads + * are used, followed by vuzp to transpose the block such that we have a + * column of samples per vector - allowing all rows to be processed at once.
+ */ + int16x8x4_t s_rows_0123 = vld4q_s16(data); + int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE); + + int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]); + int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]); + int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]); + int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]); + + int16x8_t col0 = cols_04.val[0]; + int16x8_t col1 = cols_15.val[0]; + int16x8_t col2 = cols_26.val[0]; + int16x8_t col3 = cols_37.val[0]; + int16x8_t col4 = cols_04.val[1]; + int16x8_t col5 = cols_15.val[1]; + int16x8_t col6 = cols_26.val[1]; + int16x8_t col7 = cols_37.val[1]; + + /* Pass 1: process rows. */ + + int16x8_t tmp0 = vaddq_s16(col0, col7); + int16x8_t tmp7 = vsubq_s16(col0, col7); + int16x8_t tmp1 = vaddq_s16(col1, col6); + int16x8_t tmp6 = vsubq_s16(col1, col6); + int16x8_t tmp2 = vaddq_s16(col2, col5); + int16x8_t tmp5 = vsubq_s16(col2, col5); + int16x8_t tmp3 = vaddq_s16(col3, col4); + int16x8_t tmp4 = vsubq_s16(col3, col4); + + /* Even part */ + int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); + int16x8_t tmp13 = vsubq_s16(tmp0, tmp3); + int16x8_t tmp11 = vaddq_s16(tmp1, tmp2); + int16x8_t tmp12 = vsubq_s16(tmp1, tmp2); + + col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS); + col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS); + + int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13); + int32x4_t z1_l = + vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2); + int32x4_t z1_h = + vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2); + + int32x4_t col2_scaled_l = + vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3); + int32x4_t col2_scaled_h = + vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3); + col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1), + vrshrn_n_s32(col2_scaled_h, DESCALE_P1)); + + int32x4_t col6_scaled_l = + vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3); + 
int32x4_t col6_scaled_h = + vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3); + col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1), + vrshrn_n_s32(col6_scaled_h, DESCALE_P1)); + + /* Odd part */ + int16x8_t z1 = vaddq_s16(tmp4, tmp7); + int16x8_t z2 = vaddq_s16(tmp5, tmp6); + int16x8_t z3 = vaddq_s16(tmp4, tmp6); + int16x8_t z4 = vaddq_s16(tmp5, tmp7); + /* sqrt(2) * c3 */ + int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1); + int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1); + z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1); + z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1); + + /* sqrt(2) * (-c1+c3+c5-c7) */ + int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0); + int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0); + /* sqrt(2) * ( c1+c3-c5+c7) */ + int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1); + int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1); + /* sqrt(2) * ( c1+c3+c5-c7) */ + int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3); + int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3); + /* sqrt(2) * ( c1+c3-c5-c7) */ + int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2); + int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2); + + /* sqrt(2) * (c7-c3) */ + z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0); + z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0); + /* sqrt(2) * (-c1-c3) */ + int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2); + int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2); + /* sqrt(2) * (-c3-c5) */ + int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0); + int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0); + /* sqrt(2) * (c5-c3) */ + int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 
1); + int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1); + + z3_l = vaddq_s32(z3_l, z5_l); + z3_h = vaddq_s32(z3_h, z5_h); + z4_l = vaddq_s32(z4_l, z5_l); + z4_h = vaddq_s32(z4_h, z5_h); + + tmp4_l = vaddq_s32(tmp4_l, z1_l); + tmp4_h = vaddq_s32(tmp4_h, z1_h); + tmp4_l = vaddq_s32(tmp4_l, z3_l); + tmp4_h = vaddq_s32(tmp4_h, z3_h); + col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1), + vrshrn_n_s32(tmp4_h, DESCALE_P1)); + + tmp5_l = vaddq_s32(tmp5_l, z2_l); + tmp5_h = vaddq_s32(tmp5_h, z2_h); + tmp5_l = vaddq_s32(tmp5_l, z4_l); + tmp5_h = vaddq_s32(tmp5_h, z4_h); + col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1), + vrshrn_n_s32(tmp5_h, DESCALE_P1)); + + tmp6_l = vaddq_s32(tmp6_l, z2_l); + tmp6_h = vaddq_s32(tmp6_h, z2_h); + tmp6_l = vaddq_s32(tmp6_l, z3_l); + tmp6_h = vaddq_s32(tmp6_h, z3_h); + col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1), + vrshrn_n_s32(tmp6_h, DESCALE_P1)); + + tmp7_l = vaddq_s32(tmp7_l, z1_l); + tmp7_h = vaddq_s32(tmp7_h, z1_h); + tmp7_l = vaddq_s32(tmp7_l, z4_l); + tmp7_h = vaddq_s32(tmp7_h, z4_h); + col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1), + vrshrn_n_s32(tmp7_h, DESCALE_P1)); + + /* Transpose to work on columns in pass 2. 
*/ + int16x8x2_t cols_01 = vtrnq_s16(col0, col1); + int16x8x2_t cols_23 = vtrnq_s16(col2, col3); + int16x8x2_t cols_45 = vtrnq_s16(col4, col5); + int16x8x2_t cols_67 = vtrnq_s16(col6, col7); + + int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]), + vreinterpretq_s32_s16(cols_45.val[0])); + int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]), + vreinterpretq_s32_s16(cols_45.val[1])); + int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]), + vreinterpretq_s32_s16(cols_67.val[0])); + int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]), + vreinterpretq_s32_s16(cols_67.val[1])); + + int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]); + int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]); + int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]); + int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]); + + int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]); + int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]); + int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]); + int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]); + int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]); + int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]); + int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]); + int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]); + + /* Pass 2: process columns. 
*/ + + tmp0 = vaddq_s16(row0, row7); + tmp7 = vsubq_s16(row0, row7); + tmp1 = vaddq_s16(row1, row6); + tmp6 = vsubq_s16(row1, row6); + tmp2 = vaddq_s16(row2, row5); + tmp5 = vsubq_s16(row2, row5); + tmp3 = vaddq_s16(row3, row4); + tmp4 = vsubq_s16(row3, row4); + + /* Even part */ + tmp10 = vaddq_s16(tmp0, tmp3); + tmp13 = vsubq_s16(tmp0, tmp3); + tmp11 = vaddq_s16(tmp1, tmp2); + tmp12 = vsubq_s16(tmp1, tmp2); + + row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS); + row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS); + + tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13); + z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2); + z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2); + + int32x4_t row2_scaled_l = + vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3); + int32x4_t row2_scaled_h = + vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3); + row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2), + vrshrn_n_s32(row2_scaled_h, DESCALE_P2)); + + int32x4_t row6_scaled_l = + vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3); + int32x4_t row6_scaled_h = + vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3); + row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2), + vrshrn_n_s32(row6_scaled_h, DESCALE_P2)); + + /* Odd part */ + z1 = vaddq_s16(tmp4, tmp7); + z2 = vaddq_s16(tmp5, tmp6); + z3 = vaddq_s16(tmp4, tmp6); + z4 = vaddq_s16(tmp5, tmp7); + /* sqrt(2) * c3 */ + z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1); + z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1); + z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1); + z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1); + + /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0); + tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0); + /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1); + 
tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1); + /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3); + tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3); + /* sqrt(2) * ( c1+c3-c5-c7) */ + tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2); + tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2); + + /* sqrt(2) * (c7-c3) */ + z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0); + z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0); + /* sqrt(2) * (-c1-c3) */ + z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2); + z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2); + /* sqrt(2) * (-c3-c5) */ + z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0); + z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0); + /* sqrt(2) * (c5-c3) */ + z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1); + z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1); + + z3_l = vaddq_s32(z3_l, z5_l); + z3_h = vaddq_s32(z3_h, z5_h); + z4_l = vaddq_s32(z4_l, z5_l); + z4_h = vaddq_s32(z4_h, z5_h); + + tmp4_l = vaddq_s32(tmp4_l, z1_l); + tmp4_h = vaddq_s32(tmp4_h, z1_h); + tmp4_l = vaddq_s32(tmp4_l, z3_l); + tmp4_h = vaddq_s32(tmp4_h, z3_h); + row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2), + vrshrn_n_s32(tmp4_h, DESCALE_P2)); + + tmp5_l = vaddq_s32(tmp5_l, z2_l); + tmp5_h = vaddq_s32(tmp5_h, z2_h); + tmp5_l = vaddq_s32(tmp5_l, z4_l); + tmp5_h = vaddq_s32(tmp5_h, z4_h); + row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2), + vrshrn_n_s32(tmp5_h, DESCALE_P2)); + + tmp6_l = vaddq_s32(tmp6_l, z2_l); + tmp6_h = vaddq_s32(tmp6_h, z2_h); + tmp6_l = vaddq_s32(tmp6_l, z3_l); + tmp6_h = vaddq_s32(tmp6_h, z3_h); + row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2), + vrshrn_n_s32(tmp6_h, DESCALE_P2)); + + tmp7_l = vaddq_s32(tmp7_l, z1_l); + tmp7_h = vaddq_s32(tmp7_h, z1_h); + tmp7_l = vaddq_s32(tmp7_l, z4_l); + tmp7_h = vaddq_s32(tmp7_h, z4_h); + row1 = 
vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2), + vrshrn_n_s32(tmp7_h, DESCALE_P2)); + + vst1q_s16(data + 0 * DCTSIZE, row0); + vst1q_s16(data + 1 * DCTSIZE, row1); + vst1q_s16(data + 2 * DCTSIZE, row2); + vst1q_s16(data + 3 * DCTSIZE, row3); + vst1q_s16(data + 4 * DCTSIZE, row4); + vst1q_s16(data + 5 * DCTSIZE, row5); + vst1q_s16(data + 6 * DCTSIZE, row6); + vst1q_s16(data + 7 * DCTSIZE, row7); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jidctfst-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jidctfst-neon.c new file mode 100644 index 0000000000..a91be5362e --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jidctfst-neon.c @@ -0,0 +1,472 @@ +/* + * jidctfst-neon.c - fast integer IDCT (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" + +#include + + +/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate + * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It + * uses the same calculations and produces exactly the same output as IJG's + * original jpeg_idct_ifast() function, which can be found in jidctfst.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.082392200 = 2688 * 2^-15 + * 0.414213562 = 13568 * 2^-15 + * 0.847759065 = 27776 * 2^-15 + * 0.613125930 = 20096 * 2^-15 + * + * See jidctfst.c for further details of the IDCT algorithm. Where possible, + * the variable names and comments here in jsimd_idct_ifast_neon() match up + * with those in jpeg_idct_ifast(). + */ + +#define PASS1_BITS 2 + +#define F_0_082 2688 +#define F_0_414 13568 +#define F_0_847 27776 +#define F_0_613 20096 + + +ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = { + F_0_082, F_0_414, F_0_847, F_0_613 +}; + +void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + IFAST_MULT_TYPE *quantptr = dct_table; + + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE); + int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE); + int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE); + int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE); + int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE); + int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE); + int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE); + int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE); + + /* Load quantization table values for DC coefficients. */ + int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE); + /* Dequantize DC coefficients. 
*/ + row0 = vmulq_s16(row0, quant_row0); + + /* Construct bitmap to test if all AC coefficients are 0. */ + int16x8_t bitmap = vorrq_s16(row1, row2); + bitmap = vorrq_s16(bitmap, row3); + bitmap = vorrq_s16(bitmap, row4); + bitmap = vorrq_s16(bitmap, row5); + bitmap = vorrq_s16(bitmap, row6); + bitmap = vorrq_s16(bitmap, row7); + + int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0); + int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1); + + /* Load IDCT conversion constants. */ + const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts); + + if (left_ac_bitmap == 0 && right_ac_bitmap == 0) { + /* All AC coefficients are zero. + * Compute DC values and duplicate into vectors. + */ + int16x8_t dcval = row0; + row1 = dcval; + row2 = dcval; + row3 = dcval; + row4 = dcval; + row5 = dcval; + row6 = dcval; + row7 = dcval; + } else if (left_ac_bitmap == 0) { + /* AC coefficients are zero for columns 0, 1, 2, and 3. + * Use DC values for these columns. + */ + int16x4_t dcval = vget_low_s16(row0); + + /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */ + + /* Load quantization table. */ + int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4); + int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4); + int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4); + int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4); + int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4); + int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4); + int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4); + + /* Even part: dequantize DCT coefficients. 
*/ + int16x4_t tmp0 = vget_high_s16(row0); + int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2); + int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4); + int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6); + + int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */ + int16x4_t tmp11 = vsub_s16(tmp0, tmp2); + + int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */ + int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3); + int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1); + tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3); + tmp12 = vsub_s16(tmp12, tmp13); + + tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */ + tmp3 = vsub_s16(tmp10, tmp13); + tmp1 = vadd_s16(tmp11, tmp12); + tmp2 = vsub_s16(tmp11, tmp12); + + /* Odd part: dequantize DCT coefficients. */ + int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1); + int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3); + int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5); + int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7); + + int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */ + int16x4_t neg_z10 = vsub_s16(tmp5, tmp6); + int16x4_t z11 = vadd_s16(tmp4, tmp7); + int16x4_t z12 = vsub_s16(tmp4, tmp7); + + tmp7 = vadd_s16(z11, z13); /* phase 5 */ + int16x4_t z11_sub_z13 = vsub_s16(z11, z13); + tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1); + tmp11 = vadd_s16(tmp11, z11_sub_z13); + + int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10); + int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2); + z5 = vadd_s16(z5, z10_add_z12); + tmp10 = vqdmulh_lane_s16(z12, consts, 0); + tmp10 = vadd_s16(tmp10, z12); + tmp10 = vsub_s16(tmp10, z5); + tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3); + tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10)); + tmp12 = vadd_s16(tmp12, z5); + + tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */ + tmp5 = vsub_s16(tmp11, tmp6); + tmp4 = vadd_s16(tmp10, tmp5); + + row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7)); + row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7)); + row1 
= vcombine_s16(dcval, vadd_s16(tmp1, tmp6)); + row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6)); + row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5)); + row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5)); + row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4)); + row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4)); + } else if (right_ac_bitmap == 0) { + /* AC coefficients are zero for columns 4, 5, 6, and 7. + * Use DC values for these columns. + */ + int16x4_t dcval = vget_high_s16(row0); + + /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */ + + /* Load quantization table. */ + int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE); + int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE); + int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE); + int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE); + int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE); + int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE); + int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE); + + /* Even part: dequantize DCT coefficients. */ + int16x4_t tmp0 = vget_low_s16(row0); + int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2); + int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4); + int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6); + + int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */ + int16x4_t tmp11 = vsub_s16(tmp0, tmp2); + + int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */ + int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3); + int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1); + tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3); + tmp12 = vsub_s16(tmp12, tmp13); + + tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */ + tmp3 = vsub_s16(tmp10, tmp13); + tmp1 = vadd_s16(tmp11, tmp12); + tmp2 = vsub_s16(tmp11, tmp12); + + /* Odd part: dequantize DCT coefficients. 
*/ + int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1); + int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3); + int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5); + int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7); + + int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */ + int16x4_t neg_z10 = vsub_s16(tmp5, tmp6); + int16x4_t z11 = vadd_s16(tmp4, tmp7); + int16x4_t z12 = vsub_s16(tmp4, tmp7); + + tmp7 = vadd_s16(z11, z13); /* phase 5 */ + int16x4_t z11_sub_z13 = vsub_s16(z11, z13); + tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1); + tmp11 = vadd_s16(tmp11, z11_sub_z13); + + int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10); + int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2); + z5 = vadd_s16(z5, z10_add_z12); + tmp10 = vqdmulh_lane_s16(z12, consts, 0); + tmp10 = vadd_s16(tmp10, z12); + tmp10 = vsub_s16(tmp10, z5); + tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3); + tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10)); + tmp12 = vadd_s16(tmp12, z5); + + tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */ + tmp5 = vsub_s16(tmp11, tmp6); + tmp4 = vadd_s16(tmp10, tmp5); + + row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval); + row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval); + row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval); + row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval); + row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval); + row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval); + row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval); + row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval); + } else { + /* Some AC coefficients are non-zero; full IDCT calculation required. */ + + /* Load quantization table. 
*/ + int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE); + int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE); + int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE); + int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE); + int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE); + int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE); + int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE); + + /* Even part: dequantize DCT coefficients. */ + int16x8_t tmp0 = row0; + int16x8_t tmp1 = vmulq_s16(row2, quant_row2); + int16x8_t tmp2 = vmulq_s16(row4, quant_row4); + int16x8_t tmp3 = vmulq_s16(row6, quant_row6); + + int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */ + int16x8_t tmp11 = vsubq_s16(tmp0, tmp2); + + int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */ + int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3); + int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1); + tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3); + tmp12 = vsubq_s16(tmp12, tmp13); + + tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */ + tmp3 = vsubq_s16(tmp10, tmp13); + tmp1 = vaddq_s16(tmp11, tmp12); + tmp2 = vsubq_s16(tmp11, tmp12); + + /* Odd part: dequantize DCT coefficients. 
*/ + int16x8_t tmp4 = vmulq_s16(row1, quant_row1); + int16x8_t tmp5 = vmulq_s16(row3, quant_row3); + int16x8_t tmp6 = vmulq_s16(row5, quant_row5); + int16x8_t tmp7 = vmulq_s16(row7, quant_row7); + + int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */ + int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6); + int16x8_t z11 = vaddq_s16(tmp4, tmp7); + int16x8_t z12 = vsubq_s16(tmp4, tmp7); + + tmp7 = vaddq_s16(z11, z13); /* phase 5 */ + int16x8_t z11_sub_z13 = vsubq_s16(z11, z13); + tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1); + tmp11 = vaddq_s16(tmp11, z11_sub_z13); + + int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10); + int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2); + z5 = vaddq_s16(z5, z10_add_z12); + tmp10 = vqdmulhq_lane_s16(z12, consts, 0); + tmp10 = vaddq_s16(tmp10, z12); + tmp10 = vsubq_s16(tmp10, z5); + tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3); + tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10)); + tmp12 = vaddq_s16(tmp12, z5); + + tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */ + tmp5 = vsubq_s16(tmp11, tmp6); + tmp4 = vaddq_s16(tmp10, tmp5); + + row0 = vaddq_s16(tmp0, tmp7); + row7 = vsubq_s16(tmp0, tmp7); + row1 = vaddq_s16(tmp1, tmp6); + row6 = vsubq_s16(tmp1, tmp6); + row2 = vaddq_s16(tmp2, tmp5); + row5 = vsubq_s16(tmp2, tmp5); + row4 = vaddq_s16(tmp3, tmp4); + row3 = vsubq_s16(tmp3, tmp4); + } + + /* Transpose rows to work on columns in pass 2. 
*/ + int16x8x2_t rows_01 = vtrnq_s16(row0, row1); + int16x8x2_t rows_23 = vtrnq_s16(row2, row3); + int16x8x2_t rows_45 = vtrnq_s16(row4, row5); + int16x8x2_t rows_67 = vtrnq_s16(row6, row7); + + int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]), + vreinterpretq_s32_s16(rows_45.val[0])); + int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]), + vreinterpretq_s32_s16(rows_45.val[1])); + int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]), + vreinterpretq_s32_s16(rows_67.val[0])); + int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]), + vreinterpretq_s32_s16(rows_67.val[1])); + + int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]); + int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]); + int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]); + int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]); + + int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]); + int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]); + int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]); + int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]); + int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]); + int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]); + int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]); + int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]); + + /* 1-D IDCT, pass 2 */ + + /* Even part */ + int16x8_t tmp10 = vaddq_s16(col0, col4); + int16x8_t tmp11 = vsubq_s16(col0, col4); + + int16x8_t tmp13 = vaddq_s16(col2, col6); + int16x8_t col2_sub_col6 = vsubq_s16(col2, col6); + int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1); + tmp12 = vaddq_s16(tmp12, col2_sub_col6); + tmp12 = vsubq_s16(tmp12, tmp13); + + int16x8_t tmp0 = vaddq_s16(tmp10, tmp13); + int16x8_t tmp3 = vsubq_s16(tmp10, tmp13); + int16x8_t tmp1 = vaddq_s16(tmp11, tmp12); + int16x8_t tmp2 = vsubq_s16(tmp11, tmp12); + + /* Odd 
part */ + int16x8_t z13 = vaddq_s16(col5, col3); + int16x8_t neg_z10 = vsubq_s16(col3, col5); + int16x8_t z11 = vaddq_s16(col1, col7); + int16x8_t z12 = vsubq_s16(col1, col7); + + int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */ + int16x8_t z11_sub_z13 = vsubq_s16(z11, z13); + tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1); + tmp11 = vaddq_s16(tmp11, z11_sub_z13); + + int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10); + int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2); + z5 = vaddq_s16(z5, z10_add_z12); + tmp10 = vqdmulhq_lane_s16(z12, consts, 0); + tmp10 = vaddq_s16(tmp10, z12); + tmp10 = vsubq_s16(tmp10, z5); + tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3); + tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10)); + tmp12 = vaddq_s16(tmp12, z5); + + int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */ + int16x8_t tmp5 = vsubq_s16(tmp11, tmp6); + int16x8_t tmp4 = vaddq_s16(tmp10, tmp5); + + col0 = vaddq_s16(tmp0, tmp7); + col7 = vsubq_s16(tmp0, tmp7); + col1 = vaddq_s16(tmp1, tmp6); + col6 = vsubq_s16(tmp1, tmp6); + col2 = vaddq_s16(tmp2, tmp5); + col5 = vsubq_s16(tmp2, tmp5); + col4 = vaddq_s16(tmp3, tmp4); + col3 = vsubq_s16(tmp3, tmp4); + + /* Scale down by a factor of 8, narrowing to 8-bit. */ + int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3), + vqshrn_n_s16(col1, PASS1_BITS + 3)); + int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3), + vqshrn_n_s16(col5, PASS1_BITS + 3)); + int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3), + vqshrn_n_s16(col3, PASS1_BITS + 3)); + int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3), + vqshrn_n_s16(col7, PASS1_BITS + 3)); + /* Clamp to range [0-255]. 
*/ + uint8x16_t cols_01 = + vreinterpretq_u8_s8 + (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); + uint8x16_t cols_45 = + vreinterpretq_u8_s8 + (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); + uint8x16_t cols_23 = + vreinterpretq_u8_s8 + (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); + uint8x16_t cols_67 = + vreinterpretq_u8_s8 + (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); + + /* Transpose block to prepare for store. */ + uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01), + vreinterpretq_u32_u8(cols_45)); + uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23), + vreinterpretq_u32_u8(cols_67)); + + uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]), + vreinterpretq_u8_u32(cols_0415.val[1])); + uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]), + vreinterpretq_u8_u32(cols_2637.val[1])); + uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]), + vreinterpretq_u16_u8(cols_2367.val[0])); + uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]), + vreinterpretq_u16_u8(cols_2367.val[1])); + + uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]); + uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]); + uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]); + uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]); + + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + JSAMPROW outptr2 = output_buf[2] + output_col; + JSAMPROW outptr3 = output_buf[3] + output_col; + JSAMPROW outptr4 = output_buf[4] + output_col; + JSAMPROW outptr5 = output_buf[5] + output_col; + JSAMPROW outptr6 = output_buf[6] + output_col; + JSAMPROW outptr7 = output_buf[7] + output_col; + + /* Store DCT block to memory. 
*/ + vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0); + vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0); + vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0); + vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0); + vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1); + vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1); + vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1); + vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c new file mode 100644 index 0000000000..043b652e6c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c @@ -0,0 +1,802 @@ +/* + * jidctint-neon.c - accurate integer IDCT (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#define JPEG_INTERNALS +#include "jconfigint.h" +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" +#include "neon-compat.h" + +#include <arm_neon.h> + + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +/* The computation of the inverse DCT requires the use of constants known at + * compile time. Scaled integer constants are used to avoid floating-point + * arithmetic: + * 0.298631336 = 2446 * 2^-13 + * 0.390180644 = 3196 * 2^-13 + * 0.541196100 = 4433 * 2^-13 + * 0.765366865 = 6270 * 2^-13 + * 0.899976223 = 7373 * 2^-13 + * 1.175875602 = 9633 * 2^-13 + * 1.501321110 = 12299 * 2^-13 + * 1.847759065 = 15137 * 2^-13 + * 1.961570560 = 16069 * 2^-13 + * 2.053119869 = 16819 * 2^-13 + * 2.562915447 = 20995 * 2^-13 + * 3.072711026 = 25172 * 2^-13 + */ + +#define F_0_298 2446 +#define F_0_390 3196 +#define F_0_541 4433 +#define F_0_765 6270 +#define F_0_899 7373 +#define F_1_175 9633 +#define F_1_501 12299 +#define F_1_847 15137 +#define F_1_961 16069 +#define F_2_053 16819 +#define F_2_562 20995 +#define F_3_072 25172 + +#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961) +#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390) +#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847) +#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562) +#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899) +#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899) +#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562) +#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765) + + +/* Constant table for the accurate IDCT. The first twelve entries are read as + * three consecutive groups of four int16_t values (consts.val[0], + * consts.val[1] and consts.val[2] in the pass helpers), so the order of the + * entries is significant. The table ends with four zero entries. + */ +ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = { + F_0_899, F_0_541, + F_2_562, F_0_298_MINUS_0_899, + F_1_501_MINUS_0_899, F_2_053_MINUS_2_562, + F_0_541_PLUS_0_765, F_1_175, + F_1_175_MINUS_0_390, F_0_541_MINUS_1_847, + F_3_072_MINUS_2_562, F_1_175_MINUS_1_961, + 0, 0, 0, 0 +}; + + +/* Forward declaration of regular and sparse IDCT 
helper functions */ + +static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0, + int16x4_t row1, + int16x4_t row2, + int16x4_t row3, + int16x4_t row4, + int16x4_t row5, + int16x4_t row6, + int16x4_t row7, + int16x4_t quant_row0, + int16x4_t quant_row1, + int16x4_t quant_row2, + int16x4_t quant_row3, + int16x4_t quant_row4, + int16x4_t quant_row5, + int16x4_t quant_row6, + int16x4_t quant_row7, + int16_t *workspace_1, + int16_t *workspace_2); + +static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0, + int16x4_t row1, + int16x4_t row2, + int16x4_t row3, + int16x4_t quant_row0, + int16x4_t quant_row1, + int16x4_t quant_row2, + int16x4_t quant_row3, + int16_t *workspace_1, + int16_t *workspace_2); + +static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace, + JSAMPARRAY output_buf, + JDIMENSION output_col, + unsigned buf_offset); + +static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace, + JSAMPARRAY output_buf, + JDIMENSION output_col, + unsigned buf_offset); + + +/* Perform dequantization and inverse DCT on one block of coefficients. For + * reference, the C implementation (jpeg_idct_islow()) can be found in + * jidctint.c. + * + * Optimization techniques used for fast data access: + * + * In each pass, the inverse DCT is computed for the left and right 4x8 halves + * of the DCT block. This avoids spilling due to register pressure, and the + * increased granularity allows for an optimized calculation depending on the + * values of the DCT coefficients. Between passes, intermediate data is stored + * in 4x8 workspace buffers. + * + * Transposing the 8x8 DCT block after each pass can be achieved by transposing + * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the + * diagram below.) Swapping quadrants is cheap, since the second pass can just + * swap the workspace buffer pointers. 
+ * + * +-------+-------+ +-------+-------+ + * | | | | | | + * | 0 | 1 | | 0 | 2 | + * | | | transpose | | | + * +-------+-------+ ------> +-------+-------+ + * | | | | | | + * | 2 | 3 | | 1 | 3 | + * | | | | | | + * +-------+-------+ +-------+-------+ + * + * Optimization techniques used to accelerate the inverse DCT calculation: + * + * In a DCT coefficient block, the coefficients are increasingly likely to be 0 + * as you move diagonally from top left to bottom right. If whole rows of + * coefficients are 0, then the inverse DCT calculation can be simplified. On + * the first pass of the inverse DCT, we test for three special cases before + * defaulting to a full "regular" inverse DCT: + * + * 1) Coefficients in rows 4-7 are all zero. In this case, we perform a + * "sparse" simplified inverse DCT on rows 0-3. + * 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT + * result is equal to the dequantized DC coefficients. + * 3) AC and DC coefficients are all zero. In this case, the inverse DCT + * result is all zero. For the left 4x8 half, this is handled identically + * to Case 2 above. For the right 4x8 half, we do no work and signal that + * the "sparse" algorithm is required for the second pass. + * + * In the second pass, only a single special case is tested: whether the AC and + * DC coefficients were all zero in the right 4x8 block during the first pass + * (refer to Case 3 above.) If this is the case, then a "sparse" variant of + * the second pass is performed for both the left and right halves of the DCT + * block. (The transposition after the first pass means that the right 4x8 + * block during the first pass becomes rows 4-7 during the second pass.) 
+ */ + +void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + ISLOW_MULT_TYPE *quantptr = dct_table; + + int16_t workspace_l[8 * DCTSIZE / 2]; + int16_t workspace_r[8 * DCTSIZE / 2]; /* The pass-1 helpers store their first four output rows through workspace_1 (workspace_l here) and their last four through workspace_2 (workspace_r here); the right 4x8 half is written at offset 4 * DCTSIZE / 2 in each buffer. */ + + /* Compute IDCT first pass on left 4x8 coefficient block. */ + + /* Load DCT coefficients in left 4x8 block. */ + int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE); + int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE); + int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE); + int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE); + int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE); + int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE); + int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE); + int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE); + + /* Load quantization table for left 4x8 block. */ + int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE); + int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE); + int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE); + int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE); + int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE); + int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE); + int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE); + int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE); + + /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. 
*/ + int16x4_t bitmap = vorr_s16(row7, row6); + bitmap = vorr_s16(bitmap, row5); + bitmap = vorr_s16(bitmap, row4); + int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + + if (bitmap_rows_4567 == 0) { + bitmap = vorr_s16(bitmap, row3); + bitmap = vorr_s16(bitmap, row2); + bitmap = vorr_s16(bitmap, row1); + int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + + if (left_ac_bitmap == 0) { /* Rows 1-7 of the left half are all zero: every stored vector is the dequantized row 0 shifted left by PASS1_BITS. */ + int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS); + int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } }; + /* Store 4x4 blocks to workspace, transposing in the process. */ + vst4_s16(workspace_l, quadrant); + vst4_s16(workspace_r, quadrant); + } else { + jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0, + quant_row1, quant_row2, quant_row3, + workspace_l, workspace_r); + } + } else { + jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5, + row6, row7, quant_row0, quant_row1, + quant_row2, quant_row3, quant_row4, + quant_row5, quant_row6, quant_row7, + workspace_l, workspace_r); + } + + /* Compute IDCT first pass on right 4x8 coefficient block. */ + + /* Load DCT coefficients in right 4x8 block. */ + row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4); + row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4); + row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4); + row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4); + row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4); + row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4); + row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4); + row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4); + + /* Load quantization table for right 4x8 block. 
*/ + quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4); + quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4); + quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4); + quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4); + quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4); + quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4); + quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4); + quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4); + + /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */ + bitmap = vorr_s16(row7, row6); + bitmap = vorr_s16(bitmap, row5); + bitmap = vorr_s16(bitmap, row4); + bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + bitmap = vorr_s16(bitmap, row3); + bitmap = vorr_s16(bitmap, row2); + bitmap = vorr_s16(bitmap, row1); + int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + + /* If this remains non-zero, a "regular" second pass will be performed. */ + int64_t right_ac_dc_bitmap = 1; + + if (right_ac_bitmap == 0) { + bitmap = vorr_s16(bitmap, row0); + right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + + if (right_ac_dc_bitmap != 0) { + int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS); + int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } }; + /* Store 4x4 blocks to workspace, transposing in the process. */ + vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant); + vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant); + } /* Otherwise the right half is entirely zero: nothing is stored for it here, and the "sparse" second pass is selected below. */ + } else { + if (bitmap_rows_4567 == 0) { + jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0, + quant_row1, quant_row2, quant_row3, + workspace_l + 4 * DCTSIZE / 2, + workspace_r + 4 * DCTSIZE / 2); + } else { + jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5, + row6, row7, quant_row0, quant_row1, + quant_row2, quant_row3, quant_row4, + quant_row5, quant_row6, quant_row7, + workspace_l + 4 * DCTSIZE / 2, + workspace_r + 4 * DCTSIZE / 2); + } + } + + /* Second pass: compute IDCT on rows in workspace. 
*/ + + /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */ + if (right_ac_dc_bitmap == 0) { + jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0); + jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4); + } else { + jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0); + jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4); + } +} + + +/* Perform dequantization and the first pass of the accurate inverse DCT on a + * 4x8 block of coefficients. (To process the full 8x8 DCT block, this + * function-- or some other optimized variant-- needs to be called for both the + * left and right 4x8 blocks.) + * + * This "regular" version assumes that no optimization can be made to the IDCT + * calculation, since no useful set of AC coefficients is all 0. + * + * The original C implementation of the accurate IDCT (jpeg_idct_islow()) can be + * found in jidctint.c. Algorithmic changes made here are documented inline. + */ + +static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0, + int16x4_t row1, + int16x4_t row2, + int16x4_t row3, + int16x4_t row4, + int16x4_t row5, + int16x4_t row6, + int16x4_t row7, + int16x4_t quant_row0, + int16x4_t quant_row1, + int16x4_t quant_row2, + int16x4_t quant_row3, + int16x4_t quant_row4, + int16x4_t quant_row5, + int16x4_t quant_row6, + int16x4_t quant_row7, + int16_t *workspace_1, + int16_t *workspace_2) +{ + /* Load constants for IDCT computation. 
*/ +#ifdef HAVE_VLD1_S16_X3 + const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts); +#else + const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts); + const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4); + const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8); + const int16x4x3_t consts = { { consts1, consts2, consts3 } }; +#endif /* Lane map: consts.val[0] = { F_0_899, F_0_541, F_2_562, F_0_298_MINUS_0_899 }; consts.val[1] = { F_1_501_MINUS_0_899, F_2_053_MINUS_2_562, F_0_541_PLUS_0_765, F_1_175 }; consts.val[2] = { F_1_175_MINUS_0_390, F_0_541_MINUS_1_847, F_3_072_MINUS_2_562, F_1_175_MINUS_1_961 }. */ + + /* Even part */ /* With z2 = dequantized row 2 and z3 = dequantized row 6: tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100. */ + int16x4_t z2_s16 = vmul_s16(row2, quant_row2); + int16x4_t z3_s16 = vmul_s16(row6, quant_row6); + + int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1); + int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2); + tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1); + tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1); + + z2_s16 = vmul_s16(row0, quant_row0); + z3_s16 = vmul_s16(row4, quant_row4); + + int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS); + int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp3); + int32x4_t tmp13 = vsubq_s32(tmp0, tmp3); + int32x4_t tmp11 = vaddq_s32(tmp1, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp1, tmp2); + + /* Odd part */ + int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7); + int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5); + int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3); + int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1); + + z3_s16 = vadd_s16(tmp0_s16, tmp2_s16); + int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16); + + /* Implementation as per jpeg_idct_islow() in jidctint.c: + * z5 = (z3 + z4) * 1.175875602; + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + * z3 += z5; z4 += z5; + * + * This implementation: + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + */ + + int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3); + int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3); + z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3); + z4 = vmlal_lane_s16(z4, z4_s16, 
consts.val[2], 0); + + /* Implementation as per jpeg_idct_islow() in jidctint.c: + * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + * tmp0 += z1 + z3; tmp1 += z2 + z4; + * tmp2 += z2 + z3; tmp3 += z1 + z4; + * + * This implementation: + * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + * tmp0 += z3; tmp1 += z4; + * tmp2 += z3; tmp3 += z4; + */ + + tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3); + tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1); + tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2); + tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0); + + tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0); + tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2); + tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2); + tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0); + + tmp0 = vaddq_s32(tmp0, z3); + tmp1 = vaddq_s32(tmp1, z4); + tmp2 = vaddq_s32(tmp2, z3); + tmp3 = vaddq_s32(tmp3, z4); + + /* Final output stage: descale by DESCALE_P1 (rounding shift) and narrow to 16-bit. */ + int16x4x4_t rows_0123 = { { + vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1), + vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1), + vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1), + vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1) + } }; + int16x4x4_t rows_4567 = { { + vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1), + vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1), + vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1), + vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1) + } }; + + /* Store 4x4 blocks to the intermediate workspace, ready for the second pass. + * (VST4 transposes the blocks. 
We need to operate on rows in the next + * pass.) + */ + vst4_s16(workspace_1, rows_0123); + vst4_s16(workspace_2, rows_4567); +} + + +/* Perform dequantization and the first pass of the accurate inverse DCT on a + * 4x8 block of coefficients. + * + * This "sparse" version assumes that the AC coefficients in rows 4-7 are all + * 0. This simplifies the IDCT calculation, accelerating overall performance. + */ + +static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0, + int16x4_t row1, + int16x4_t row2, + int16x4_t row3, + int16x4_t quant_row0, + int16x4_t quant_row1, + int16x4_t quant_row2, + int16x4_t quant_row3, + int16_t *workspace_1, + int16_t *workspace_2) +{ + /* Load constants for IDCT computation. */ +#ifdef HAVE_VLD1_S16_X3 + const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts); +#else + const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts); + const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4); + const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8); + const int16x4x3_t consts = { { consts1, consts2, consts3 } }; +#endif + + /* Even part (z3 is all 0) */ + int16x4_t z2_s16 = vmul_s16(row2, quant_row2); + + int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1); + int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2); + + z2_s16 = vmul_s16(row0, quant_row0); + int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS); + int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS); /* tmp0 and tmp1 are equal: with row 4 taken as zero, both are the dequantized row 0 shifted left by CONST_BITS. */ + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp3); + int32x4_t tmp13 = vsubq_s32(tmp0, tmp3); + int32x4_t tmp11 = vaddq_s32(tmp1, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp1, tmp2); + + /* Odd part (tmp0 and tmp1 are both all 0) */ + int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3); + int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1); + + int16x4_t z3_s16 = tmp2_s16; + int16x4_t z4_s16 = tmp3_s16; + + int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3); + int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3); + z3 = 
vmlal_lane_s16(z3, z4_s16, consts.val[1], 3); + z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0); + + /* With the row 5 and row 7 terms dropped: tmp0 = z3 - tmp3_s16 * 0.899976223; tmp1 = z4 - tmp2_s16 * 2.562915447; tmp2 = z3 + tmp2_s16 * (3.072711026 - 2.562915447); tmp3 = z4 + tmp3_s16 * (1.501321110 - 0.899976223). */ + tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0); + tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2); + tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2); + tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0); + + /* Final output stage: descale and narrow to 16-bit. */ + int16x4x4_t rows_0123 = { { + vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1), + vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1), + vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1), + vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1) + } }; + int16x4x4_t rows_4567 = { { + vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1), + vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1), + vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1), + vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1) + } }; + + /* Store 4x4 blocks to the intermediate workspace, ready for the second pass. + * (VST4 transposes the blocks. We need to operate on rows in the next + * pass.) + */ + vst4_s16(workspace_1, rows_0123); + vst4_s16(workspace_2, rows_4567); +} + + +/* Perform the second pass of the accurate inverse DCT on a 4x8 block of + * coefficients. (To process the full 8x8 DCT block, this function-- or some + * other optimized variant-- needs to be called for both the right and left 4x8 + * blocks.) + * + * This "regular" version assumes that no optimization can be made to the IDCT + * calculation, since no useful set of coefficient values are all 0 after the + * first pass. + * + * Again, the original C implementation of the accurate IDCT (jpeg_idct_islow()) + * can be found in jidctint.c. Algorithmic changes made here are documented + * inline. + */ + +static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace, + JSAMPARRAY output_buf, + JDIMENSION output_col, + unsigned buf_offset) +{ + /* Load constants for IDCT computation. 
*/ +#ifdef HAVE_VLD1_S16_X3 + const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts); +#else + const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts); + const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4); + const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8); + const int16x4x3_t consts = { { consts1, consts2, consts3 } }; +#endif + + /* Even part */ + int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2); + int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2); + + int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1); + int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2); + tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1); + tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1); + + z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2); + z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2); + + int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS); + int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp3); + int32x4_t tmp13 = vsubq_s32(tmp0, tmp3); + int32x4_t tmp11 = vaddq_s32(tmp1, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp1, tmp2); + + /* Odd part */ + int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2); + int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2); + int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2); + int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2); + + z3_s16 = vadd_s16(tmp0_s16, tmp2_s16); + int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16); + + /* Implementation as per jpeg_idct_islow() in jidctint.c: + * z5 = (z3 + z4) * 1.175875602; + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + * z3 += z5; z4 += z5; + * + * This implementation: + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + */ + + int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3); + int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3); + 
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3); + z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0); + + /* Implementation as per jpeg_idct_islow() in jidctint.c: + * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + * tmp0 += z1 + z3; tmp1 += z2 + z4; + * tmp2 += z2 + z3; tmp3 += z1 + z4; + * + * This implementation: + * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + * tmp0 += z3; tmp1 += z4; + * tmp2 += z3; tmp3 += z4; + */ + + tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3); + tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1); + tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2); + tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0); + + tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0); + tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2); + tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2); + tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0); + + tmp0 = vaddq_s32(tmp0, z3); + tmp1 = vaddq_s32(tmp1, z4); + tmp2 = vaddq_s32(tmp2, z3); + tmp3 = vaddq_s32(tmp3, z4); + + /* Final output stage: descale and narrow to 16-bit. */ + int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3), + vaddhn_s32(tmp12, tmp1)); + int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2), + vaddhn_s32(tmp13, tmp0)); + int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0), + vsubhn_s32(tmp11, tmp2)); + int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1), + vsubhn_s32(tmp10, tmp3)); + /* Descale and narrow to 8-bit. 
*/ + int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16); + int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16); + int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16); + int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16); + /* Clamp to range [0-255]. */ + uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8), + vdup_n_u8(CENTERJSAMPLE)); + uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8), + vdup_n_u8(CENTERJSAMPLE)); + uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8), + vdup_n_u8(CENTERJSAMPLE)); + uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8), + vdup_n_u8(CENTERJSAMPLE)); + + /* Transpose 4x8 block and store to memory. (Zipping adjacent columns + * together allows us to store 16-bit elements.) + */ + uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8); + uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8); + uint16x4x4_t cols_01_23_45_67 = { { + vreinterpret_u16_u8(cols_01_23.val[0]), + vreinterpret_u16_u8(cols_01_23.val[1]), + vreinterpret_u16_u8(cols_45_67.val[0]), + vreinterpret_u16_u8(cols_45_67.val[1]) + } }; + + JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col; + JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col; + JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col; + JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col; + /* VST4 of 16-bit elements completes the transpose. */ + vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0); + vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1); + vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2); + vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3); +} + + +/* Performs the second pass of the accurate inverse DCT on a 4x8 block + * of coefficients. + * + * This "sparse" version assumes that the coefficient values (after the first + * pass) in rows 4-7 are all 0. This simplifies the IDCT calculation, + * accelerating overall performance. 
+ */ + +static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace, + JSAMPARRAY output_buf, + JDIMENSION output_col, + unsigned buf_offset) +{ + /* Load constants for IDCT computation. */ +#ifdef HAVE_VLD1_S16_X3 + const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts); +#else + const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts); + const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4); + const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8); + const int16x4x3_t consts = { { consts1, consts2, consts3 } }; +#endif + + /* Even part (z3 is all 0) */ + int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2); + + int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1); + int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2); + + z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2); + int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS); + int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp3); + int32x4_t tmp13 = vsubq_s32(tmp0, tmp3); + int32x4_t tmp11 = vaddq_s32(tmp1, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp1, tmp2); + + /* Odd part (tmp0 and tmp1 are both all 0) */ + int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2); + int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2); + + int16x4_t z3_s16 = tmp2_s16; + int16x4_t z4_s16 = tmp3_s16; + + int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3); + z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3); + int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3); + z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0); + + tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0); + tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2); + tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2); + tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0); + + /* Final output stage: descale and narrow to 16-bit. 
*/ + int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3), + vaddhn_s32(tmp12, tmp1)); + int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2), + vaddhn_s32(tmp13, tmp0)); + int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0), + vsubhn_s32(tmp11, tmp2)); + int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1), + vsubhn_s32(tmp10, tmp3)); + /* Descale and narrow to 8-bit. */ + int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16); + int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16); + int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16); + int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16); + /* Clamp to range [0-255]. */ + uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8), + vdup_n_u8(CENTERJSAMPLE)); + uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8), + vdup_n_u8(CENTERJSAMPLE)); + uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8), + vdup_n_u8(CENTERJSAMPLE)); + uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8), + vdup_n_u8(CENTERJSAMPLE)); + + /* Transpose 4x8 block and store to memory. (Zipping adjacent columns + * together allows us to store 16-bit elements.) + */ + uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8); + uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8); + uint16x4x4_t cols_01_23_45_67 = { { + vreinterpret_u16_u8(cols_01_23.val[0]), + vreinterpret_u16_u8(cols_01_23.val[1]), + vreinterpret_u16_u8(cols_45_67.val[0]), + vreinterpret_u16_u8(cols_45_67.val[1]) + } }; + + JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col; + JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col; + JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col; + JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col; + /* VST4 of 16-bit elements completes the transpose. 
*/ + vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0); + vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1); + vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2); + vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jidctred-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jidctred-neon.c new file mode 100644 index 0000000000..be9627e61d --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jidctred-neon.c @@ -0,0 +1,486 @@ +/* + * jidctred-neon.c - reduced-size IDCT (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "align.h" +#include "neon-compat.h" + +#include <arm_neon.h> + + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define F_0_211 1730 +#define F_0_509 4176 +#define F_0_601 4926 +#define F_0_720 5906 +#define F_0_765 6270 +#define F_0_850 6967 +#define F_0_899 7373 +#define F_1_061 8697 +#define F_1_272 10426 +#define F_1_451 11893 +#define F_1_847 15137 +#define F_2_172 17799 +#define F_2_562 20995 +#define F_3_624 29692 + + +/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size + * 2x2 output from an 8x8 DCT block. It uses the same calculations and + * produces exactly the same output as IJG's original jpeg_idct_2x2() function + * from jpeg-6b, which can be found in jidctred.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.720959822 = 5906 * 2^-13 + * 0.850430095 = 6967 * 2^-13 + * 1.272758580 = 10426 * 2^-13 + * 3.624509785 = 29692 * 2^-13 + * + * See jidctred.c for further details of the 2x2 IDCT algorithm. Where + * possible, the variable names and comments here in jsimd_idct_2x2_neon() + * match up with those in jpeg_idct_2x2(). + */ + +ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = { + -F_0_720, F_0_850, -F_1_272, F_3_624 +}; + +void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + ISLOW_MULT_TYPE *quantptr = dct_table; + + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE); + int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE); + int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE); + int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE); + int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE); + + /* Load quantization table values.
*/ + int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE); + int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE); + int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE); + int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE); + int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE); + + /* Dequantize DCT coefficients. */ + row0 = vmulq_s16(row0, quant_row0); + row1 = vmulq_s16(row1, quant_row1); + row3 = vmulq_s16(row3, quant_row3); + row5 = vmulq_s16(row5, quant_row5); + row7 = vmulq_s16(row7, quant_row7); + + /* Load IDCT conversion constants. */ + const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts); + + /* Pass 1: process columns from input, put results in vectors row0 and + * row1. + */ + + /* Even part */ + int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2); + int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2); + + /* Odd part */ + int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3); + tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2); + tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1); + tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0); + int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3); + tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2); + tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1); + tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0); + + /* Final output stage: descale and narrow to 16-bit. */ + row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS), + vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS)); + row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS), + vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS)); + + /* Transpose two rows, ready for second pass. 
*/ + int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1); + int16x8_t cols_0246 = cols_0246_1357.val[0]; + int16x8_t cols_1357 = cols_0246_1357.val[1]; + /* Duplicate columns such that each is accessible in its own vector. */ + int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357), + vreinterpretq_s32_s16(cols_1357)); + int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]); + int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]); + + /* Pass 2: process two rows, store to output array. */ + + /* Even part: we're only interested in col0; the top half of tmp10 is "don't + * care." + */ + int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2); + + /* Odd part: we're only interested in the bottom half of tmp0. */ + int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3); + tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2); + tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1); + tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0); + + /* Final output stage: descale and clamp to range [0-255]. */ + int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0), + vsubhn_s32(tmp10, tmp0)); + output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16, + CONST_BITS + PASS1_BITS + 3 + 2 - 16); + /* Narrow to 8-bit and convert to unsigned. */ + uint8x8_t output_u8 = vqmovun_s16(output_s16); + + /* Store 2x2 block to memory. */ + vst1_lane_u8(output_buf[0] + output_col, output_u8, 0); + vst1_lane_u8(output_buf[1] + output_col, output_u8, 1); + vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4); + vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5); +} + + +/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size + * 4x4 output from an 8x8 DCT block. It uses the same calculations and + * produces exactly the same output as IJG's original jpeg_idct_4x4() function + * from jpeg-6b, which can be found in jidctred.c. 
+ * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.211164243 = 1730 * 2^-13 + * 0.509795579 = 4176 * 2^-13 + * 0.601344887 = 4926 * 2^-13 + * 0.765366865 = 6270 * 2^-13 + * 0.899976223 = 7373 * 2^-13 + * 1.061594337 = 8697 * 2^-13 + * 1.451774981 = 11893 * 2^-13 + * 1.847759065 = 15137 * 2^-13 + * 2.172734803 = 17799 * 2^-13 + * 2.562915447 = 20995 * 2^-13 + * + * See jidctred.c for further details of the 4x4 IDCT algorithm. Where + * possible, the variable names and comments here in jsimd_idct_4x4_neon() + * match up with those in jpeg_idct_4x4(). + */ + +ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = { + F_1_847, -F_0_765, -F_0_211, F_1_451, + -F_2_172, F_1_061, -F_0_509, -F_0_601, + F_0_899, F_2_562, 0, 0 +}; + +void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + ISLOW_MULT_TYPE *quantptr = dct_table; + + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE); + int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE); + int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE); + int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE); + int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE); + int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE); + int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE); + + /* Load quantization table values for DC coefficients. */ + int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE); + /* Dequantize DC coefficients. */ + row0 = vmulq_s16(row0, quant_row0); + + /* Construct bitmap to test if all AC coefficients are 0. 
*/ + int16x8_t bitmap = vorrq_s16(row1, row2); + bitmap = vorrq_s16(bitmap, row3); + bitmap = vorrq_s16(bitmap, row5); + bitmap = vorrq_s16(bitmap, row6); + bitmap = vorrq_s16(bitmap, row7); + + int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0); + int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1); + + /* Load constants for IDCT computation. */ +#ifdef HAVE_VLD1_S16_X3 + const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts); +#else + /* GCC does not currently support the intrinsic vld1_<type>_x3(). */ + const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts); + const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4); + const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8); + const int16x4x3_t consts = { { consts1, consts2, consts3 } }; +#endif + + if (left_ac_bitmap == 0 && right_ac_bitmap == 0) { + /* All AC coefficients are zero. + * Compute DC values and duplicate into row vectors 0, 1, 2, and 3. + */ + int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS); + row0 = dcval; + row1 = dcval; + row2 = dcval; + row3 = dcval; + } else if (left_ac_bitmap == 0) { + /* AC coefficients are zero for columns 0, 1, 2, and 3. + * Compute DC values for these columns. + */ + int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS); + + /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */ + + /* Load quantization table.
*/ + int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4); + int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4); + int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4); + int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4); + int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4); + int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4); + + /* Even part */ + int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1); + + int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2); + int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6); + + int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0); + tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp0, tmp2); + + /* Odd part */ + int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7); + z2 = vmul_s16(vget_high_s16(row5), quant_row5); + z3 = vmul_s16(vget_high_s16(row3), quant_row3); + int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1); + + tmp0 = vmull_lane_s16(z1, consts.val[0], 2); + tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3); + tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0); + tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1); + + tmp2 = vmull_lane_s16(z1, consts.val[1], 2); + tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3); + tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0); + tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1); + + /* Final output stage: descale and narrow to 16-bit. */ + row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2), + CONST_BITS - PASS1_BITS + 1)); + row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2), + CONST_BITS - PASS1_BITS + 1)); + row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0), + CONST_BITS - PASS1_BITS + 1)); + row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0), + CONST_BITS - PASS1_BITS + 1)); + } else if (right_ac_bitmap == 0) { + /* AC coefficients are zero for columns 4, 5, 6, and 7. 
+ * Compute DC values for these columns. + */ + int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS); + + /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */ + + /* Load quantization table. */ + int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE); + int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE); + int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE); + int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE); + int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE); + int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE); + + /* Even part */ + int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1); + + int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2); + int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6); + + int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0); + tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp0, tmp2); + + /* Odd part */ + int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7); + z2 = vmul_s16(vget_low_s16(row5), quant_row5); + z3 = vmul_s16(vget_low_s16(row3), quant_row3); + int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1); + + tmp0 = vmull_lane_s16(z1, consts.val[0], 2); + tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3); + tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0); + tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1); + + tmp2 = vmull_lane_s16(z1, consts.val[1], 2); + tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3); + tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0); + tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1); + + /* Final output stage: descale and narrow to 16-bit. 
*/ + row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2), + CONST_BITS - PASS1_BITS + 1), dcval); + row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2), + CONST_BITS - PASS1_BITS + 1), dcval); + row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0), + CONST_BITS - PASS1_BITS + 1), dcval); + row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0), + CONST_BITS - PASS1_BITS + 1), dcval); + } else { + /* Both halves have non-zero AC coefficients; full IDCT required. */ + int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE); + int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE); + int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE); + int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE); + int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE); + int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE); + + /* Even part */ + int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1); + int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1); + + int16x8_t z2 = vmulq_s16(row2, quant_row2); + int16x8_t z3 = vmulq_s16(row6, quant_row6); + + int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0); + int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0); + tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1); + tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1); + + int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l); + int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h); + int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l); + int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h); + + /* Odd part */ + int16x8_t z1 = vmulq_s16(row7, quant_row7); + z2 = vmulq_s16(row5, quant_row5); + z3 = vmulq_s16(row3, quant_row3); + int16x8_t z4 = vmulq_s16(row1, quant_row1); + + tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2); + tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3); + tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0); + tmp0_l =
vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1); + tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2); + tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3); + tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0); + tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1); + + tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2); + tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3); + tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0); + tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1); + tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2); + tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3); + tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0); + tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1); + + /* Final output stage: descale and narrow to 16-bit. */ + row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l), + CONST_BITS - PASS1_BITS + 1), + vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h), + CONST_BITS - PASS1_BITS + 1)); + row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l), + CONST_BITS - PASS1_BITS + 1), + vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h), + CONST_BITS - PASS1_BITS + 1)); + row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l), + CONST_BITS - PASS1_BITS + 1), + vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h), + CONST_BITS - PASS1_BITS + 1)); + row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l), + CONST_BITS - PASS1_BITS + 1), + vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h), + CONST_BITS - PASS1_BITS + 1)); + } + + /* Transpose 8x4 block to perform IDCT on rows in second pass. 
*/ + int16x8x2_t row_01 = vtrnq_s16(row0, row1); + int16x8x2_t row_23 = vtrnq_s16(row2, row3); + + int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]), + vreinterpretq_s32_s16(row_23.val[0])); + int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]), + vreinterpretq_s32_s16(row_23.val[1])); + + int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0])); + int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0])); + int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1])); + int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1])); + int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0])); + int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1])); + int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1])); + + /* Commence second pass of IDCT. */ + + /* Even part */ + int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1); + int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0); + tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp0, tmp2); + + /* Odd part */ + tmp0 = vmull_lane_s16(col7, consts.val[0], 2); + tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3); + tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0); + tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1); + + tmp2 = vmull_lane_s16(col7, consts.val[1], 2); + tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3); + tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0); + tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1); + + /* Final output stage: descale and clamp to range [0-255]. 
*/ + int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2), + vsubhn_s32(tmp12, tmp0)); + int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0), + vsubhn_s32(tmp10, tmp2)); + output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02, + CONST_BITS + PASS1_BITS + 3 + 1 - 16); + output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13, + CONST_BITS + PASS1_BITS + 3 + 1 - 16); + /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements. + * An interleaving store completes the transpose. + */ + uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02), + vqmovun_s16(output_cols_13)); + uint16x4x2_t output_01_23 = { { + vreinterpret_u16_u8(output_0123.val[0]), + vreinterpret_u16_u8(output_0123.val[1]) + } }; + + /* Store 4x4 block to memory. */ + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + JSAMPROW outptr2 = output_buf[2] + output_col; + JSAMPROW outptr3 = output_buf[3] + output_col; + vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0); + vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1); + vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2); + vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jquanti-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jquanti-neon.c new file mode 100644 index 0000000000..d5d95d89f6 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/jquanti-neon.c @@ -0,0 +1,193 @@ +/* + * jquanti-neon.c - sample data conversion and quantization (Arm Neon) + * + * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include <arm_neon.h> + + +/* After downsampling, the resulting sample values are in the range [0, 255], + * but the Discrete Cosine Transform (DCT) operates on values centered around + * 0. + * + * To prepare sample values for the DCT, load samples into a DCT workspace, + * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127], + * are also widened from 8- to 16-bit. + * + * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */ + +void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col); + uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col); + uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col); + uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col); + uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col); + uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col); + uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col); + uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col); + + int16x8_t row0 = + vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row1 = + vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row2 = + vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row3 = + vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row4 = + vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row5 = + vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row6 = + vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row7 = + vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE))); + + vst1q_s16(workspace + 0 * DCTSIZE, row0); + vst1q_s16(workspace + 1 * DCTSIZE, row1); + vst1q_s16(workspace + 2 * DCTSIZE, row2); + vst1q_s16(workspace + 3 * DCTSIZE, row3); + vst1q_s16(workspace + 4 * DCTSIZE, row4); + vst1q_s16(workspace + 5 * DCTSIZE, row5); + vst1q_s16(workspace + 6 * DCTSIZE, row6); + vst1q_s16(workspace + 7 * DCTSIZE, row7); +} + + +/* After the DCT, the resulting array of coefficient values needs to be divided + * by an array of quantization values. + * + * To avoid a slow division operation, the DCT coefficients are multiplied by + * the (scaled) reciprocals of the quantization values and then right-shifted. 
+ * + * The equivalent scalar C function quantize() can be found in jcdctmgr.c. + */ + +void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + JCOEFPTR out_ptr = coef_block; + UDCTELEM *recip_ptr = (UDCTELEM *)divisors; + UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2; + DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2; + int i; + +#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64)) +#pragma unroll +#endif + for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) { + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE); + int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE); + int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE); + int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE); + /* Load reciprocals of quantization values. */ + uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE); + uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE); + uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE); + uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE); + uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE); + uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE); + uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE); + uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE); + int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE); + int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE); + int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE); + int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE); + + /* Extract sign from coefficients. */ + int16x8_t sign_row0 = vshrq_n_s16(row0, 15); + int16x8_t sign_row1 = vshrq_n_s16(row1, 15); + int16x8_t sign_row2 = vshrq_n_s16(row2, 15); + int16x8_t sign_row3 = vshrq_n_s16(row3, 15); + /* Get absolute value of DCT coefficients. 
*/ + uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0)); + uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1)); + uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2)); + uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3)); + /* Add correction. */ + abs_row0 = vaddq_u16(abs_row0, corr0); + abs_row1 = vaddq_u16(abs_row1, corr1); + abs_row2 = vaddq_u16(abs_row2, corr2); + abs_row3 = vaddq_u16(abs_row3, corr3); + + /* Multiply DCT coefficients by quantization reciprocals. */ + int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0), + vget_low_u16(recip0))); + int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0), + vget_high_u16(recip0))); + int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1), + vget_low_u16(recip1))); + int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1), + vget_high_u16(recip1))); + int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2), + vget_low_u16(recip2))); + int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2), + vget_high_u16(recip2))); + int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3), + vget_low_u16(recip3))); + int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3), + vget_high_u16(recip3))); + /* Narrow back to 16-bit. */ + row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16)); + row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16)); + row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16)); + row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16)); + + /* Since VSHR only supports an immediate as its second argument, negate the + * shift value and shift left. 
+ */ + row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0), + vnegq_s16(shift0))); + row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1), + vnegq_s16(shift1))); + row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2), + vnegq_s16(shift2))); + row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3), + vnegq_s16(shift3))); + + /* Restore sign to original product. */ + row0 = veorq_s16(row0, sign_row0); + row0 = vsubq_s16(row0, sign_row0); + row1 = veorq_s16(row1, sign_row1); + row1 = vsubq_s16(row1, sign_row1); + row2 = veorq_s16(row2, sign_row2); + row2 = vsubq_s16(row2, sign_row2); + row3 = veorq_s16(row3, sign_row3); + row3 = vsubq_s16(row3, sign_row3); + + /* Store quantized coefficients to memory. */ + vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0); + vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1); + vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2); + vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3); + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/neon-compat.h.in b/3rdparty/libjpeg-turbo/src/simd/arm/neon-compat.h.in new file mode 100644 index 0000000000..d403f2289f --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/arm/neon-compat.h.in @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. 
If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#cmakedefine HAVE_VLD1_S16_X3 +#cmakedefine HAVE_VLD1_U16_X2 +#cmakedefine HAVE_VLD1Q_U8_X4 + +/* Define compiler-independent count-leading-zeros and byte-swap macros */ +#if defined(_MSC_VER) && !defined(__clang__) +#define BUILTIN_CLZ(x) _CountLeadingZeros(x) +#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x) +#define BUILTIN_BSWAP64(x) _byteswap_uint64(x) +#elif defined(__clang__) || defined(__GNUC__) +#define BUILTIN_CLZ(x) __builtin_clz(x) +#define BUILTIN_CLZLL(x) __builtin_clzll(x) +#define BUILTIN_BSWAP64(x) __builtin_bswap64(x) +#else +#error "Unknown compiler" +#endif diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm new file mode 100644 index 0000000000..c46d684436 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm @@ -0,0 +1,578 @@ +; +; jccolext.asm - colorspace conversion (AVX2) +; +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2) + +EXTN(jsimd_rgb_ycc_convert_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte 
SIZEOF_YMMWORD + jae near .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + vmovd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + vmovd xmmF, XMM_DWORD [esi+ecx] + vpslldq xmmA, xmmA, SIZEOF_DWORD + vpor xmmA, xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + vmovq xmmB, XMM_MMWORD [esi+ecx] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + sub ecx, byte SIZEOF_XMMWORD + vmovdqu xmmB, XMM_MMWORD [esi+ecx] + vperm2i128 ymmA, ymmA, ymmA, 1 + vpor ymmA, ymmB +.column_ld32: + test cl, SIZEOF_YMMWORD + jz short .column_ld64 + sub ecx, byte SIZEOF_YMMWORD + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] +.column_ld64: + test cl, 2*SIZEOF_YMMWORD + mov ecx, SIZEOF_YMMWORD + jz short .rgb_ycc_cnv + vmovdqa ymmB, ymmA + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD] + +.rgb_ycc_cnv: + ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vmovdqu ymmC, ymmA + vinserti128 ymmA, ymmF, 
xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + + vmovdqa ymmG, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12 + ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I) + vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I + ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A + ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q) + vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27 + ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N) + + vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D + ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T) + vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F + ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V) + + vmovdqa ymmD, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09 + ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P) + vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P + ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T) + vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B + ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R) + + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 
1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E + ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F + ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V) + + vmovdqa ymmE, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C + ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S) + vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S + ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D + ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T) + + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F + ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F + ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V) + + vpxor ymmH, ymmH, ymmH + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmB, ymmE + vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + + vmovdqa ymmF, ymmD + vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + vmovq xmmF, 
XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmF +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + vmovdqa xmmF, xmmA + vperm2i128 ymmF, ymmF, ymmF, 1 + vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] + vpor ymmA, ymmA, ymmF +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + jz short .column_ld16 + sub ecx, byte SIZEOF_XMMWORD/2 + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld16: + test cl, SIZEOF_XMMWORD + mov ecx, SIZEOF_YMMWORD + jz short .rgb_ycc_cnv + vmovdqa ymmE, ymmA + vmovdqa ymmH, ymmF + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD] + vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD] + +.rgb_ycc_cnv: + ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmB, ymmA + vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + + vmovdqa ymmB, ymmF + vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 
2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmD, ymmA + vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35 + ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L) + vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37 + ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N) + + vmovdqa ymmC, ymmF + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D + ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F + ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V) + + vmovdqa ymmB, ymmA + vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S) + vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D + ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T) + + vmovdqa ymmG, ymmD + vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E + ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U) + vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F + ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V) + + vmovdqa ymmE, ymmA + vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E + ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U) + + vmovdqa ymmH, ymmB + vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F + ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F + ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V) + + vpxor ymmF, ymmF, ymmF + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw 
ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmD, ymmB + vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + + vmovdqa ymmG, ymmE + vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U) + + vpunpcklbw ymmF, ymmF, ymmH + vpunpckhbw ymmH, ymmH, ymmH + vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE + ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE + vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO + vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE + vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO + + vmovdqa ymm6, ymm1 + vpunpcklwd ymm1, ymm1, ymm3 + vpunpckhwd ymm6, ymm6, ymm3 + vmovdqa ymm7, ymm1 + vmovdqa ymm4, ymm6 + vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337) + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337) + vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; 
ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + vpxor ymm1, ymm1, ymm1 + vpxor ymm6, ymm6, ymm6 + vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL + vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH + vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500) + vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500) + + vmovdqa ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm5=[PD_ONEHALFM1_CJ] + + vpaddd ymm7, ymm7, ymm1 + vpaddd ymm4, ymm4, ymm6 + vpaddd ymm7, ymm7, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH + vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO + + vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE + + vmovdqa ymm6, ymm0 + vpunpcklwd ymm0, ymm0, ymm2 + vpunpckhwd ymm6, ymm6, ymm2 + vmovdqa ymm5, ymm0 + vmovdqa ymm4, ymm6 + vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337) + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337) + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + vpxor ymm0, ymm0, ymm0 + vpxor ymm6, ymm6, ymm6 + vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL + vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH + vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500) + vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500) + + vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ] + + vpaddd ymm5, ymm5, ymm0 + vpaddd ymm4, ymm4, ymm6 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm4, ymm4, ymm1 + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH + vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE + + vpsllw ymm7, ymm7, BYTE_BIT + vpor ymm5, ymm5, ymm7 ; ymm5=Cb + vmovdqu YMMWORD [ebx], 
ymm5 ; Save Cb + + vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO + vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE + vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO + + vmovdqa ymm4, ymm0 + vpunpcklwd ymm0, ymm0, ymm3 + vpunpckhwd ymm4, ymm4, ymm3 + vmovdqa ymm7, ymm0 + vmovdqa ymm5, ymm4 + vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250) + vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF] + + vpaddd ymm0, ymm0, YMMWORD [wk(4)] + vpaddd ymm4, ymm4, YMMWORD [wk(5)] + vpaddd ymm0, ymm0, ymm3 + vpaddd ymm4, ymm4, ymm3 + vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH + vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO + + vpxor ymm3, ymm3, ymm3 + vpxor ymm4, ymm4, ymm4 + vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL + vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH + vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500) + vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500) + + vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ] + + vpaddd ymm7, ymm7, ymm3 + vpaddd ymm5, ymm5, ymm4 + vpaddd ymm7, ymm7, ymm1 + vpaddd ymm5, ymm5, ymm1 + vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH + vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO + + vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE + + vmovdqa ymm4, ymm6 + vpunpcklwd ymm6, ymm6, ymm2 + vpunpckhwd ymm4, ymm4, ymm2 + vmovdqa ymm1, ymm6 + vmovdqa ymm5, ymm4 + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250) + vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) 
+ + vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF] + + vpaddd ymm6, ymm6, YMMWORD [wk(6)] + vpaddd ymm4, ymm4, YMMWORD [wk(7)] + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm4, ymm4, ymm2 + vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH + vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE + + vpsllw ymm0, ymm0, BYTE_BIT + vpor ymm6, ymm6, ymm0 ; ymm6=Y + vmovdqu YMMWORD [edi], ymm6 ; Save Y + + vpxor ymm2, ymm2, ymm2 + vpxor ymm4, ymm4, ymm4 + vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL + vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH + vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500) + vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500) + + vmovdqa ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm0=[PD_ONEHALFM1_CJ] + + vpaddd ymm1, ymm1, ymm2 + vpaddd ymm5, ymm5, ymm4 + vpaddd ymm1, ymm1, ymm0 + vpaddd ymm5, ymm5, ymm0 + vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH + vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE + + vpsllw ymm7, ymm7, BYTE_BIT + vpor ymm1, ymm1, ymm7 ; ymm1=Cr + vmovdqu YMMWORD [edx], ymm1 ; Save Cr + + sub ecx, byte SIZEOF_YMMWORD + add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr + add edi, byte SIZEOF_YMMWORD ; outptr0 + add ebx, byte SIZEOF_YMMWORD ; outptr1 + add edx, byte SIZEOF_YMMWORD ; outptr2 + cmp ecx, byte SIZEOF_YMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm new file mode 100644 index 0000000000..6357a42b2c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm @@ -0,0 +1,476 @@ +; +; jccolext.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx) + +EXTN(jsimd_rgb_ycc_convert_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte 
SIZEOF_MMWORD + jae short .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax, eax + mov al, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx, edx + mov dx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + movd mmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, dword [esi+ecx] + psllq mmA, DWORD_BIT + por mmA, mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG, mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_ycc_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmF, mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_ycc_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD, mmA + psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE, mmA + psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 
25 27) + + pxor mmH, mmH + + movq mmC, mmA + punpcklbw mmA, mmH ; mmA=(00 02 04 06) + punpckhbw mmC, mmH ; mmC=(10 12 14 16) + + movq mmB, mmE + punpcklbw mmE, mmH ; mmE=(20 22 24 26) + punpckhbw mmB, mmH ; mmB=(01 03 05 07) + + movq mmF, mmD + punpcklbw mmD, mmH ; mmD=(11 13 15 17) + punpckhbw mmF, mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, dword [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF, mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmD, mmA + movq mmC, mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_ycc_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB, mmA + punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG, mmD + punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE, mmA + punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36) + + movq mmH, mmB + punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF, mmF + + movq mmC, mmA + punpcklbw mmA, mmF ; mmA=(00 02 04 06) + punpckhbw mmC, mmF ; mmC=(10 12 14 16) + + movq mmD, mmB + punpcklbw mmB, mmF ; mmB=(01 03 05 07) + punpckhbw mmD, mmF ; mmD=(11 13 15 17) + + movq mmG, mmE + punpcklbw mmE, mmF ; 
mmE=(20 22 24 26) + punpckhbw mmG, mmF ; mmG=(30 32 34 36) + + punpcklbw mmF, mmH + punpckhbw mmH, mmH + psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movq MMWORD [wk(0)], mm0 ; wk(0)=RE + movq MMWORD [wk(1)], mm1 ; wk(1)=RO + movq MMWORD [wk(2)], mm4 ; wk(2)=BE + movq MMWORD [wk(3)], mm5 ; wk(3)=BO + + movq mm6, mm1 + punpcklwd mm1, mm3 + punpckhwd mm6, mm3 + movq mm7, mm1 + movq mm4, mm6 + pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd mm7, [GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor mm1, mm1 + pxor mm6, mm6 + punpcklwd mm1, mm5 ; mm1=BOL + punpckhwd mm6, mm5 ; mm6=BOH + psrld mm1, 1 ; mm1=BOL*FIX(0.500) + psrld mm6, 1 ; mm6=BOH*FIX(0.500) + + movq mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ] + + paddd mm7, mm1 + paddd mm4, mm6 + paddd mm7, mm5 + paddd mm4, mm5 + psrld mm7, SCALEBITS ; mm7=CbOL + psrld mm4, SCALEBITS ; mm4=CbOH + packssdw mm7, mm4 ; mm7=CbO + + movq mm1, MMWORD [wk(2)] ; mm1=BE + + movq mm6, mm0 + punpcklwd mm0, mm2 + punpckhwd mm6, mm2 + 
movq mm5, mm0 + movq mm4, mm6 + pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd mm5, [GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor mm0, mm0 + pxor mm6, mm6 + punpcklwd mm0, mm1 ; mm0=BEL + punpckhwd mm6, mm1 ; mm6=BEH + psrld mm0, 1 ; mm0=BEL*FIX(0.500) + psrld mm6, 1 ; mm6=BEH*FIX(0.500) + + movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm5, mm0 + paddd mm4, mm6 + paddd mm5, mm1 + paddd mm4, mm1 + psrld mm5, SCALEBITS ; mm5=CbEL + psrld mm4, SCALEBITS ; mm4=CbEH + packssdw mm5, mm4 ; mm5=CbE + + psllw mm7, BYTE_BIT + por mm5, mm7 ; mm5=Cb + movq MMWORD [ebx], mm5 ; Save Cb + + movq mm0, MMWORD [wk(3)] ; mm0=BO + movq mm6, MMWORD [wk(2)] ; mm6=BE + movq mm1, MMWORD [wk(1)] ; mm1=RO + + movq mm4, mm0 + punpcklwd mm0, mm3 + punpckhwd mm4, mm3 + movq mm7, mm0 + movq mm5, mm4 + pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd mm7, [GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, MMWORD [wk(4)] + paddd mm4, MMWORD [wk(5)] + paddd mm0, mm3 + paddd mm4, mm3 + psrld mm0, SCALEBITS ; mm0=YOL + psrld mm4, SCALEBITS ; mm4=YOH + packssdw mm0, mm4 ; mm0=YO + + pxor mm3, mm3 + pxor mm4, mm4 + punpcklwd mm3, mm1 ; mm3=ROL + punpckhwd mm4, mm1 ; mm4=ROH + psrld mm3, 1 ; mm3=ROL*FIX(0.500) + psrld mm4, 1 ; mm4=ROH*FIX(0.500) + + movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm7, mm3 
+ paddd mm5, mm4 + paddd mm7, mm1 + paddd mm5, mm1 + psrld mm7, SCALEBITS ; mm7=CrOL + psrld mm5, SCALEBITS ; mm5=CrOH + packssdw mm7, mm5 ; mm7=CrO + + movq mm3, MMWORD [wk(0)] ; mm3=RE + + movq mm4, mm6 + punpcklwd mm6, mm2 + punpckhwd mm4, mm2 + movq mm1, mm6 + movq mm5, mm4 + pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd mm1, [GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(6)] + paddd mm4, MMWORD [wk(7)] + paddd mm6, mm2 + paddd mm4, mm2 + psrld mm6, SCALEBITS ; mm6=YEL + psrld mm4, SCALEBITS ; mm4=YEH + packssdw mm6, mm4 ; mm6=YE + + psllw mm0, BYTE_BIT + por mm6, mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + pxor mm2, mm2 + pxor mm4, mm4 + punpcklwd mm2, mm3 ; mm2=REL + punpckhwd mm4, mm3 ; mm4=REH + psrld mm2, 1 ; mm2=REL*FIX(0.500) + psrld mm4, 1 ; mm4=REH*FIX(0.500) + + movq mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ] + + paddd mm1, mm2 + paddd mm5, mm4 + paddd mm1, mm0 + paddd mm5, mm0 + psrld mm1, SCALEBITS ; mm1=CrEL + psrld mm5, SCALEBITS ; mm5=CrEH + packssdw mm1, mm5 ; mm1=CrE + + psllw mm7, BYTE_BIT + por mm1, mm7 ; mm1=Cr + movq MMWORD [edx], mm1 ; Save Cr + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + add ebx, byte SIZEOF_MMWORD ; outptr1 + add edx, byte SIZEOF_MMWORD ; outptr2 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + 
+.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm new file mode 100644 index 0000000000..c6c80852ac --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm @@ -0,0 +1,503 @@ +; +; jccolext.asm - colorspace conversion (SSE2) +; +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2) + +EXTN(jsimd_rgb_ycc_convert_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte 
SIZEOF_XMMWORD + jae near .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + movd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + 
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH, xmmH + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, 
XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + 
movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + movdqa xmm7, xmm1 + movdqa xmm4, xmm6 + pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1, xmm1 + pxor xmm6, xmm6 + punpcklwd xmm1, xmm5 ; xmm1=BOL + punpckhwd xmm6, xmm5 ; xmm6=BOH + psrld xmm1, 1 ; xmm1=BOL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] 
; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm1 + paddd xmm4, xmm6 + paddd xmm7, xmm5 + paddd xmm4, xmm5 + psrld xmm7, SCALEBITS ; xmm7=CbOL + psrld xmm4, SCALEBITS ; xmm4=CbOH + packssdw xmm7, xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + movdqa xmm5, xmm0 + movdqa xmm4, xmm6 + pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0, xmm0 + pxor xmm6, xmm6 + punpcklwd xmm0, xmm1 ; xmm0=BEL + punpckhwd xmm6, xmm1 ; xmm6=BEH + psrld xmm0, 1 ; xmm0=BEL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5, xmm0 + paddd xmm4, xmm6 + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrld xmm5, SCALEBITS ; xmm5=CbEL + psrld xmm4, SCALEBITS ; xmm4=CbEH + packssdw xmm5, xmm4 ; xmm5=CbE + + psllw xmm7, BYTE_BIT + por xmm5, xmm7 ; xmm5=Cb + movdqa XMMWORD [ebx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + movdqa xmm7, xmm0 + movdqa xmm5, xmm4 + pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd 
xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + pxor xmm3, xmm3 + pxor xmm4, xmm4 + punpcklwd xmm3, xmm1 ; xmm3=ROL + punpckhwd xmm4, xmm1 ; xmm4=ROH + psrld xmm3, 1 ; xmm3=ROL*FIX(0.500) + psrld xmm4, 1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm3 + paddd xmm5, xmm4 + paddd xmm7, xmm1 + paddd xmm5, xmm1 + psrld xmm7, SCALEBITS ; xmm7=CrOL + psrld xmm5, SCALEBITS ; xmm5=CrOH + packssdw xmm7, xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + pxor xmm2, xmm2 + pxor xmm4, xmm4 + punpcklwd xmm2, xmm3 ; xmm2=REL + punpckhwd xmm4, xmm3 ; xmm4=REH + psrld xmm2, 1 ; xmm2=REL*FIX(0.500) + psrld xmm4, 1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1, xmm2 + paddd xmm5, xmm4 + paddd xmm1, xmm0 + paddd xmm5, xmm0 + psrld xmm1, SCALEBITS ; xmm1=CrEL + psrld xmm5, SCALEBITS ; xmm5=CrEH + packssdw xmm1, xmm5 ; xmm1=CrE + + psllw xmm7, BYTE_BIT + por xmm1, xmm7 ; xmm1=Cr + movdqa XMMWORD [edx], xmm1 ; Save Cr + + 
sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + add ebx, byte SIZEOF_XMMWORD ; outptr1 + add edx, byte SIZEOF_XMMWORD ; outptr2 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm new file mode 100644 index 0000000000..14944e952f --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm @@ -0,0 +1,121 @@ +; +; jccolor.asm - colorspace conversion (AVX2) +; +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_ycc_convert_avx2) + +EXTN(jconst_rgb_ycc_convert_avx2): + +PW_F0299_F0337 times 8 dw F_0_299, F_0_337 +PW_F0114_F0250 times 8 dw F_0_114, F_0_250 +PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331 +PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418 +PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \ + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN 
EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2 +%include "jccolext-avx2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm new file mode 100644 index 0000000000..8cb399bdc4 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm @@ -0,0 +1,121 @@ +; +; jccolor.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 ; scale of the F_* constants below: coefficient * 2^SCALEBITS, rounded (e.g. 16384 = 0.25 * 65536) + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086 + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_ycc_convert_mmx) + +EXTN(jconst_rgb_ycc_convert_mmx): ; start of the constant table; every "times 2" row below occupies 8 bytes + +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 ; word pairs (0.299, 0.337) +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 ; word pairs (0.114, 0.250) +PW_MF016_MF033 times 2 dw -F_0_168, -F_0_331 ; word pairs (-0.168, -0.331) +PW_MF008_MF041 times 2 dw -F_0_081, -F_0_418 ; word pairs (-0.081, -0.418) +PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS - 1)) - 1 + \ + (CENTERJSAMPLE << SCALEBITS) ; (2^(SCALEBITS-1) - 1) plus CENTERJSAMPLE scaled by 2^SCALEBITS +PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) ; 0.5 scaled by 2^SCALEBITS + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx ; NOTE(review): jccolext-mmx.asm is not visible here; presumably it emits its function under the name jsimd_rgb_ycc_convert_mmx, so each stanza yields one layout-specific function - confirm +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN 
+%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx +%include "jccolext-mmx.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm new file mode 100644 index 0000000000..686d222ff7 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm @@ -0,0 +1,120 @@ +; +; jccolor-sse2.asm - colorspace conversion (SSE2) +; +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 ; scale of the F_* constants below: coefficient * 2^SCALEBITS, rounded (e.g. 16384 = 0.25 * 65536) + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086 + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_ycc_convert_sse2) + +EXTN(jconst_rgb_ycc_convert_sse2): ; start of the constant table; every "times 4" row below occupies 16 bytes + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 ; word pairs (0.299, 0.337) +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 ; word pairs (0.114, 0.250) +PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331 ; word pairs (-0.168, -0.331) +PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418 ; word pairs (-0.081, -0.418) +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \ + (CENTERJSAMPLE << SCALEBITS) ; (2^(SCALEBITS-1) - 1) plus CENTERJSAMPLE scaled by 2^SCALEBITS +PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) ; 0.5 scaled by 2^SCALEBITS + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 ; NOTE(review): jccolext-sse2.asm is not visible here; presumably it emits its function under the name jsimd_rgb_ycc_convert_sse2, so each stanza yields one layout-specific function - confirm +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN 
EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 +%include "jccolext-sse2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm new file mode 100644 index 0000000000..560ee0c71e --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm @@ -0,0 +1,113 @@ +; +; jcgray-avx2.asm - grayscale colorspace conversion (AVX2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 ; scale of the F_* constants below: coefficient * 2^SCALEBITS, rounded (e.g. 16384 = 0.25 * 65536) + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086; the green weight is applied as 0.337*G + 0.250*G + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_avx2) + +EXTN(jconst_rgb_gray_convert_avx2): ; start of the constant table; every "times 8" row below occupies 32 bytes + +PW_F0299_F0337 times 8 dw F_0_299, F_0_337 ; word pairs (0.299, 0.337); vpmaddwd operand in jcgryext-avx2.asm +PW_F0114_F0250 times 8 dw F_0_114, F_0_250 ; word pairs (0.114, 0.250); vpmaddwd operand in jcgryext-avx2.asm +PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) ; 0.5 scaled by 2^SCALEBITS; added before the SCALEBITS right shift + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2 ; jcgryext-avx2.asm defines EXTN(jsimd_rgb_gray_convert_avx2) and branches on RGB_PIXELSIZE, so each stanza emits one layout-specific function +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN 
EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2 +%include "jcgryext-avx2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm new file mode 100644 index 0000000000..79fdf082a8 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm @@ -0,0 +1,113 @@ +; +; jcgray-mmx.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 ; scale of the F_* constants below: coefficient * 2^SCALEBITS, rounded (e.g. 16384 = 0.25 * 65536) + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086 + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_mmx) + +EXTN(jconst_rgb_gray_convert_mmx): ; start of the constant table; every "times 2" row below occupies 8 bytes + +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 ; word pairs (0.299, 0.337) +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 ; word pairs (0.114, 0.250) +PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) ; 0.5 scaled by 2^SCALEBITS + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx ; jcgryext-mmx.asm defines EXTN(jsimd_rgb_gray_convert_mmx) and branches on RGB_PIXELSIZE, so each stanza emits one layout-specific function +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN 
+%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx +%include "jcgryext-mmx.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm new file mode 100644 index 0000000000..cb4b28e8f4 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm @@ -0,0 +1,112 @@ +; +; jcgray-sse2.asm - grayscale colorspace conversion (SSE2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 ; scale of the F_* constants below: coefficient * 2^SCALEBITS, rounded (e.g. 16384 = 0.25 * 65536) + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086 + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): ; start of the constant table; every "times 4" row below occupies 16 bytes + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 ; word pairs (0.299, 0.337) +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 ; word pairs (0.114, 0.250) +PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) ; 0.5 scaled by 2^SCALEBITS + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 ; NOTE(review): jcgryext-sse2.asm is not visible here; presumably it emits its function under the name jsimd_rgb_gray_convert_sse2, so each stanza yields one layout-specific function - confirm +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN 
EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryext-sse2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm new file mode 100644 index 0000000000..3fa7973d72 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm @@ -0,0 +1,457 @@ +; +; jcgryext-avx2.asm - grayscale colorspace conversion (AVX2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; Only plane 0 of output_buf is written (one Y byte per input pixel), starting at row output_row. +; GLOBAL(void) +; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2) + +EXTN(jsimd_rgb_gray_convert_avx2): + push ebp + mov eax, esp ; eax = original ebp (unaligned frame base: [eax]=saved ebp, arguments start at [eax+8]) + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax ; keep the unaligned frame base at [aligned ebp]; .return reloads it with "pop esp" + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] ; reserve WK_NUM YMMWORDs of scratch below the aligned ebp + pushpic eax ; make room for the GOT address (gotptr slot) + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx, ecx + jz near .return ; zero-width image: nothing to convert + + push ecx ; save img_width while ecx temporarily holds output_row + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; edi = &output_buf[0][output_row] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return ; num_rows <= 0: nothing to convert + alignx 16, 7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_YMMWORD + jae near .columnloop ; at least SIZEOF_YMMWORD pixels left: take the full-width path + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: ; fewer than SIZEOF_YMMWORD pixels left: assemble the tail from progressively larger loads + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx 
eax, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + vmovd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + vmovd xmmF, XMM_DWORD [esi+ecx] + vpslldq xmmA, xmmA, SIZEOF_DWORD + vpor xmmA, xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + vmovq xmmB, XMM_MMWORD [esi+ecx] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + sub ecx, byte SIZEOF_XMMWORD + vmovdqu xmmB, XMM_MMWORD [esi+ecx] + vperm2i128 ymmA, ymmA, ymmA, 1 + vpor ymmA, ymmB +.column_ld32: + test cl, SIZEOF_YMMWORD + jz short .column_ld64 + sub ecx, byte SIZEOF_YMMWORD + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] +.column_ld64: + test cl, 2*SIZEOF_YMMWORD + mov ecx, SIZEOF_YMMWORD ; mov leaves the flags intact, so the jz below still reflects the test above + jz short .rgb_gray_cnv + vmovdqa ymmB, ymmA + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD] + +.rgb_gray_cnv: + ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vmovdqu ymmC, ymmA + vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + 
vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + + vmovdqa ymmG, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12 + ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I) + vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I + ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A + ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q) + vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27 + ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N) + + vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D + ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T) + vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F + ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V) + + vmovdqa ymmD, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09 + ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P) + vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P + ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T) + vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B + ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R) + + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E + ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F + ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V) + + vmovdqa ymmE, ymmA + 
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C + ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S) + vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S + ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D + ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T) + + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F + ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F + ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V) + + vpxor ymmH, ymmH, ymmH + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmB, ymmE + vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + + vmovdqa ymmF, ymmD + vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmF +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + vmovdqa xmmF, xmmA + vperm2i128 ymmF, ymmF, ymmF, 1 + vmovdqu 
xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] + vpor ymmA, ymmA, ymmF +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + jz short .column_ld16 + sub ecx, byte SIZEOF_XMMWORD/2 + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld16: + test cl, SIZEOF_XMMWORD + mov ecx, SIZEOF_YMMWORD ; mov leaves the flags intact, so the jz below still reflects the test above + jz short .rgb_gray_cnv + vmovdqa ymmE, ymmA + vmovdqa ymmH, ymmF + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD] + vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD] + +.rgb_gray_cnv: + ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmB, ymmA + vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + + vmovdqa ymmB, ymmF + vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmD, ymmA + vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35 + ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L) + vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 
22 26 32 36 03 07 13 17 23 27 33 37 + ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N) + + vmovdqa ymmC, ymmF + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D + ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F + ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V) + + vmovdqa ymmB, ymmA + vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S) + vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D + ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T) + + vmovdqa ymmG, ymmD + vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E + ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U) + vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F + ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V) + + vmovdqa ymmE, ymmA + vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E + ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U) + + vmovdqa ymmH, ymmB + vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F + ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F + ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V) + + vpxor ymmF, ymmF, ymmF + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmD, ymmB + vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 
1N 1P 1R 1T 1V) + + vmovdqa ymmG, ymmE + vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U) + + vpunpcklbw ymmF, ymmF, ymmH + vpunpckhbw ymmH, ymmH, ymmH + vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE + ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + vmovdqa ymm6, ymm1 + vpunpcklwd ymm1, ymm1, ymm3 + vpunpckhwd ymm6, ymm6, ymm3 + vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337) + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + vmovdqa ymm6, ymm0 + vpunpcklwd ymm0, ymm0, ymm2 + vpunpckhwd ymm6, ymm6, ymm2 + vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337) + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337) + + vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + vmovdqa ymm0, ymm5 ; ymm0=BO + vmovdqa ymm6, ymm4 ; ymm6=BE + + vmovdqa ymm4, ymm0 + vpunpcklwd ymm0, ymm0, ymm3 + vpunpckhwd ymm4, ymm4, ymm3 + vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF] + + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm4, ymm4, ymm7 + 
vpaddd ymm0, ymm0, ymm3 ; add PD_ONEHALF before the SCALEBITS shift below + vpaddd ymm4, ymm4, ymm3 + vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH + vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO + + vmovdqa ymm4, ymm6 + vpunpcklwd ymm6, ymm6, ymm2 + vpunpckhwd ymm4, ymm4, ymm2 + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF] + + vpaddd ymm6, ymm6, YMMWORD [wk(0)] + vpaddd ymm4, ymm4, YMMWORD [wk(1)] + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm4, ymm4, ymm2 + vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH + vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE + + vpsllw ymm0, ymm0, BYTE_BIT ; move the odd-pixel Y values into the high byte of each word + vpor ymm6, ymm6, ymm0 ; ymm6=Y + vmovdqu YMMWORD [edi], ymm6 ; Save Y + + sub ecx, byte SIZEOF_YMMWORD + add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr + add edi, byte SIZEOF_YMMWORD ; outptr0 + cmp ecx, byte SIZEOF_YMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 ; 1..SIZEOF_YMMWORD-1 pixels remain: handle them via the partial-load path + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW ; next entry of the output row-pointer array + dec eax ; num_rows + jg near .rowloop + +.return: + vzeroupper ; zero the upper halves of all YMM registers before returning + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm new file mode 100644 index 0000000000..8af42e5a33 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm @@ -0,0 +1,355 @@ +; +; jcgryext-mmx.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2011, 2016, D. R. Commander. 
+; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx) + +EXTN(jsimd_rgb_gray_convert_mmx): + push ebp + mov eax, esp ; eax = original ebp (unaligned frame base; the arguments are read at eax+8 and up) + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax ; store the unaligned frame base; the epilogue pops it back into esp + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make room for the GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx, ecx + jz near .return ; zero-width image: nothing to convert + + push ecx ; save num_cols while ecx holds output_row + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY 
[esi+0*SIZEOF_JSAMPARRAY] ; edi = output_buf[0] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; edi = &output_buf[0][output_row] + + pop ecx ; restore num_cols + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return ; num_rows <= 0: nothing to convert + alignx 16, 7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: ; fewer than SIZEOF_MMWORD pixels remain: load the partial group piecewise + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax, eax + mov al, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx, edx + mov dx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + movd mmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, dword [esi+ecx] + psllq mmA, DWORD_BIT + por mmA, mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG, mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_gray_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmF, mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD, mmA + psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD, mmF ; mmD=(11 15 21 
25 02 06 12 16) + punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE, mmA + psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27) + + pxor mmH, mmH + + movq mmC, mmA + punpcklbw mmA, mmH ; mmA=(00 02 04 06) + punpckhbw mmC, mmH ; mmC=(10 12 14 16) + + movq mmB, mmE + punpcklbw mmE, mmH ; mmE=(20 22 24 26) + punpckhbw mmB, mmH ; mmB=(01 03 05 07) + + movq mmF, mmD + punpcklbw mmD, mmH ; mmD=(11 13 15 17) + punpckhbw mmF, mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, dword [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF, mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmD, mmA + movq mmC, mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB, mmA + punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG, mmD + punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE, mmA + punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE, mmD ; 
mmE=(20 22 24 26 30 32 34 36) + + movq mmH, mmB + punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF, mmF + + movq mmC, mmA + punpcklbw mmA, mmF ; mmA=(00 02 04 06) + punpckhbw mmC, mmF ; mmC=(10 12 14 16) + + movq mmD, mmB + punpcklbw mmB, mmF ; mmB=(01 03 05 07) + punpckhbw mmD, mmF ; mmD=(11 13 15 17) + + movq mmG, mmE + punpcklbw mmE, mmF ; mmE=(20 22 24 26) + punpckhbw mmG, mmF ; mmG=(30 32 34 36) + + punpcklbw mmF, mmH + punpckhbw mmH, mmH + psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movq mm6, mm1 + punpcklwd mm1, mm3 + punpckhwd mm6, mm3 + pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm6, mm0 + punpcklwd mm0, mm2 + punpckhwd mm6, mm2 + pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movq mm0, mm5 ; mm0=BO + movq mm6, mm4 ; mm6=BE + + movq mm4, mm0 + punpcklwd mm0, mm3 + punpckhwd mm4, mm3 + pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, mm1 + paddd mm4, mm7 + paddd mm0, mm3 + paddd mm4, mm3 + psrld mm0, SCALEBITS ; mm0=YOL 
+ psrld mm4, SCALEBITS ; mm4=YOH + packssdw mm0, mm4 ; mm0=YO + + movq mm4, mm6 + punpcklwd mm6, mm2 + punpckhwd mm4, mm2 + pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(0)] + paddd mm4, MMWORD [wk(1)] + paddd mm6, mm2 + paddd mm4, mm2 + psrld mm6, SCALEBITS ; mm6=YEL + psrld mm4, SCALEBITS ; mm4=YEH + packssdw mm6, mm4 ; mm6=YE + + psllw mm0, BYTE_BIT + por mm6, mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 ; leftover (< SIZEOF_MMWORD) pixels take the partial-load path + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm new file mode 100644 index 0000000000..c9d6ff1e35 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm @@ -0,0 +1,382 @@ +; +; jcgryext.asm - grayscale colorspace conversion (SSE2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push ebp + mov eax, esp ; eax = original ebp (unaligned frame base; the arguments are read at eax+8 and up) + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax ; store the unaligned frame base; the epilogue pops it back into esp + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make room for the GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx, ecx + jz near .return ; zero-width image: nothing to convert + + push ecx ; save num_cols while ecx holds output_row + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] ; edi = output_buf[0] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; edi = &output_buf[0][output_row] + + pop ecx ; restore num_cols + + mov esi, JSAMPARRAY [input_buf(eax)] + mov 
eax, INT [num_rows(eax)] + test eax, eax + jle near .return ; num_rows <= 0: nothing to convert + alignx 16, 7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: ; fewer than SIZEOF_XMMWORD pixels remain: load the partial group piecewise + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + movd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; 
xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH, xmmH + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, 
SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw 
xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; 
xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y (movdqa is an aligned store: outptr0 must be 16-byte aligned) + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 ; leftover (< SIZEOF_XMMWORD) pixels take the partial-load path + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm new file mode 100644 index 0000000000..278cf5e83a --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm @@ -0,0 +1,761 @@ +; +; jchuff-sse2.asm - Huffman entropy encoding (SSE2) +; +; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander. +; Copyright (C) 2015, Matthieu Darbois. 
+; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation for Huffman coding of one block. +; The following code is based on jchuff.c; see jchuff.c for more details. + +%include "jsimdext.inc" + +struc working_state +.next_output_byte: resp 1 ; => next byte to write in buffer +.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer +.cur.put_buffer.simd resq 1 ; current bit accumulation buffer +.cur.free_bits resd 1 ; # of bits available in it +.cur.last_dc_val resd 4 ; last DC coef for each component +.cinfo: resp 1 ; dump_buffer needs access to this +endstruc + +struc c_derived_tbl +.ehufco: resd 256 ; code for each symbol +.ehufsi: resb 256 ; length of code for each symbol +; If no code has been allocated for a symbol S, ehufsi[S] contains 0 +endstruc + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + GLOBAL_DATA(jconst_huff_encode_one_block) + +EXTN(jconst_huff_encode_one_block): + + alignz 32 + +jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007 ; qword entry n = (1 << n) - 1, n = 0..15 + dq 0x000f, 0x001f, 0x003f, 0x007f + dq 0x00ff, 0x01ff, 0x03ff, 0x07ff + dq 0x0fff, 0x1fff, 0x3fff, 0x7fff + +times 1 << 14 db 15 ; bytes at negative offsets from jpeg_nbits_table start here +times 1 << 13 db 14 +times 1 << 12 db 13 +times 1 << 11 db 12 +times 1 << 10 db 11 +times 1 << 9 db 10 +times 1 << 8 db 9 +times 1 << 7 db 8 +times 1 << 6 db 7 +times 1 << 5 db 6 +times 1 << 4 db 5 +times 1 << 3 db 4 +times 1 << 2 db 3 +times 1 << 1 db 2 +times 1 << 0 db 1 +times 1 db 0 +jpeg_nbits_table: ; offset 0; the runs above mirror the runs below +times 1 db 0 
+times 1 << 0 db 1 +times 1 << 1 db 2 +times 1 << 2 db 3 +times 1 << 3 db 4 +times 1 << 4 db 5 +times 1 << 5 db 6 +times 1 << 6 db 7 +times 1 << 7 db 8 +times 1 << 8 db 9 +times 1 << 9 db 10 +times 1 << 10 db 11 +times 1 << 11 db 12 +times 1 << 12 db 13 +times 1 << 13 db 14 +times 1 << 14 db 15 + + alignz 32 + +%ifdef PIC +%define NBITS(x) nbits_base + x +%else +%define NBITS(x) jpeg_nbits_table + x +%endif +%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table) ; (x) * 8: jpeg_mask_bits entries are qwords + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%define mm_put_buffer mm0 +%define mm_all_0xff mm1 +%define mm_temp mm2 +%define mm_nbits mm3 +%define mm_code_bits mm3 +%define mm_code mm4 +%define mm_overflow_bits mm5 +%define mm_save_nbits mm6 + +; Shorthand used to describe SIMD operations: +; wN: xmmN treated as eight signed 16-bit values +; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7 +; bN: xmmN treated as 16 unsigned 8-bit values, or +; mmN treated as eight unsigned 8-bit values +; bN[i]: perform the same operation on all unsigned 8-bit values, +; i=0..15 (SSE register) or i=0..7 (MMX register) +; Contents of SIMD registers are shown in memory order. + +; Fill the bit buffer to capacity with the leading bits from code, then output +; the bit buffer and put the remaining bits from code into the bit buffer. +; +; Usage: +; code - contains the bits to shift into the bit buffer (LSB-aligned) +; %1 - temp register +; %2 - low byte of temp register +; %3 - second byte of temp register +; %4-%8 (optional) - extra instructions to execute before the macro completes +; %9 - the label to which to jump when the macro completes +; +; Upon completion, free_bits will be set to the number of remaining bits from +; code, and put_buffer will contain those remaining bits. temp and code will +; be clobbered. +; +; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE() +; macro in jchuff.c. 
+ +%macro EMIT_QWORD 9 +%define %%temp %1 +%define %%tempb %2 +%define %%temph %3 + add nbits, free_bits ; nbits += free_bits; + neg free_bits ; free_bits = -free_bits; + movq mm_temp, mm_code ; temp = code; + movd mm_nbits, nbits ; nbits --> MMX register + movd mm_overflow_bits, free_bits ; overflow_bits (temp register) = free_bits; + neg free_bits ; free_bits = -free_bits; + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + psrlq mm_temp, mm_overflow_bits ; temp >>= overflow_bits; + add free_bits, 64 ; free_bits += 64; + por mm_temp, mm_put_buffer ; temp |= put_buffer; +%ifidn %%temp, nbits_base + movd mm_save_nbits, nbits_base ; save nbits_base +%endif + movq mm_code_bits, mm_temp ; code_bits (temp register) = temp; + movq mm_put_buffer, mm_code ; put_buffer = code; + pcmpeqb mm_temp, mm_all_0xff ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0); + movq mm_code, mm_code_bits ; code = code_bits; + psrlq mm_code_bits, 32 ; code_bits >>= 32; + pmovmskb nbits, mm_temp ; nbits = 0; nbits |= ((b_temp[i] >> 7) << i); + movd %%temp, mm_code_bits ; temp = code_bits; + bswap %%temp ; temp = htonl(temp); + test nbits, nbits ; if (nbits != 0) /* Some 0xFF bytes */ + jnz %%.SLOW ; goto %%.SLOW + mov dword [buffer], %%temp ; *(uint32_t *)buffer = temp; +%ifidn %%temp, nbits_base + movd nbits_base, mm_save_nbits ; restore nbits_base +%endif + %4 + movd nbits, mm_code ; nbits = (uint32_t)(code); + %5 + bswap nbits ; nbits = htonl(nbits); + mov dword [buffer + 4], nbits ; *(uint32_t *)(buffer + 4) = nbits; + lea buffer, [buffer + 8] ; buffer += 8; + %6 + %7 + %8 + jmp %9 ; return +%%.SLOW: + ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8 + ; bytes in the qword. + mov byte [buffer], %%tempb ; buffer[0] = temp[0]; + cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 
1 : 0)); + mov byte [buffer], %%temph ; buffer[0] = temp[1]; + cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + shr %%temp, 16 ; temp >>= 16; + mov byte [buffer], %%tempb ; buffer[0] = temp[0]; + cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], %%temph ; buffer[0] = temp[1]; + cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + movd nbits, mm_code ; nbits (temp register) = (uint32_t)(code) +%ifidn %%temp, nbits_base + movd nbits_base, mm_save_nbits ; restore nbits_base +%endif + bswap nbits ; nbits = htonl(nbits) + mov byte [buffer], nbitsb ; buffer[0] = nbits[0]; + cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0)); + mov byte [buffer], nbitsh ; buffer[0] = nbits[1]; + cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0)); + shr nbits, 16 ; nbits >>= 16; + mov byte [buffer], nbitsb ; buffer[0] = nbits[0]; + cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0)); + mov byte [buffer], nbitsh ; buffer[0] = nbits[1]; + %4 + cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0)); + %5 + %6 + %7 + %8 + jmp %9 ; return; +%endmacro + +%macro PUSH 1 + push %1 ; the %assign below records the 4 bytes pushed in stack_offset +%assign stack_offset stack_offset + 4 +%endmacro + +%macro POP 1 + pop %1 ; the %assign below records the 4 bytes popped in stack_offset +%assign stack_offset stack_offset - 4 +%endmacro + +; If PIC is defined, load the address of a symbol defined in this file into a +; register. 
Equivalent to +; get_GOT %1 +; lea %1, [GOTOFF(%1, %2)] +; without using the GOT. +; +; Usage: +; %1 - register into which to load the address of the symbol +; %2 - symbol whose address should be loaded +; %3 - optional multi-line macro to execute before the symbol address is loaded (invoked with the stack pointer adjustment, 4 or 0, as its argument) +; %4 - optional multi-line macro to execute after the symbol address is loaded +; +; If PIC is not defined, then %3 and %4 are executed in order. + +%macro GET_SYM 2-4 +%ifdef PIC + call %%.geteip ; pushes the address of %%.ref as the return address +%%.ref: + %4 + add %1, %2 - %%.ref ; %1 = address of %%.ref + (%2 - %%.ref) = address of %2 + jmp short %%.done + align 32 +%%.geteip: + %3 4 ; must adjust stack pointer because of call + mov %1, POINTER [esp] ; %1 = return address, i.e. the address of %%.ref + ret + align 32 +%%.done: +%else + %3 0 + %4 +%endif +%endmacro + +; +; Encode a single block's worth of coefficients. +; +; GLOBAL(JOCTET *) +; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer, +; JCOEFPTR block, int last_dc_val, +; c_derived_tbl *dctbl, c_derived_tbl *actbl) +; +; Stack layout: +; Function args +; Return address +; Saved ebx +; Saved ebp +; Saved esi +; Saved edi <-- esp_save +; ... +; esp_save +; t_ 64*2 bytes (aligned to 128 bytes) +; +; esp is used (as t) to point into t_ (data in lower indices is not used once +; esp passes over them, so this is signal-safe.) Aligning to 128 bytes allows +; us to find the rest of the data again. +; +; NOTES: +; When shuffling data, we try to avoid pinsrw as much as possible, since it is +; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on +; modern CPUs, so chains of pinsrw instructions (even with different outputs) +; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and +; requires 2 µops (with memory operand) on Intel. In either case, only one +; pinsrw instruction can be decoded per cycle (and nothing else if they are +; back-to-back), so out-of-order execution cannot be used to work around long +; pinsrw chains (though for Sandy Bridge and later, this may be less of a +; problem if the code runs from the µop cache.) 
+; +; We use tzcnt instead of bsf without checking for support. The instruction is +; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to +; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is +; an input dependency (although the behavior is not formally defined, Intel +; CPUs usually leave the destination unmodified if the source is zero.) This +; can prevent out-of-order execution, so we clear the destination before +; invoking tzcnt. +; +; Initial register allocation +; eax - frame --> buffer +; ebx - nbits_base (PIC) / emit_temp +; ecx - dctbl --> size --> state +; edx - block --> nbits +; esi - code_temp --> state --> actbl +; edi - index_temp --> free_bits +; esp - t +; ebp - index + +%define frame eax +%ifdef PIC +%define nbits_base ebx +%endif +%define emit_temp ebx +%define emit_tempb bl +%define emit_temph bh +%define dctbl ecx +%define block edx +%define code_temp esi +%define index_temp edi +%define t esp +%define index ebp + +%assign save_frame DCTSIZE2 * SIZEOF_WORD + +; Step 1: Re-arrange input data according to jpeg_natural_order +; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10 +; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05 +; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34 +; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28 +; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36 +; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51 +; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46 +; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63 + + align 32 + GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) + +EXTN(jsimd_huff_encode_one_block_sse2): + +%assign stack_offset 0 +%define arg_state 4 + stack_offset +%define arg_buffer 8 + stack_offset +%define arg_block 12 + stack_offset +%define arg_last_dc_val 16 + stack_offset +%define arg_dctbl 20 + stack_offset +%define arg_actbl 24 + stack_offset + + ;X: X = code stream + mov block, [esp + arg_block] + PUSH ebx + PUSH ebp + movups xmm3, XMMWORD [block + 0 * 
SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 + PUSH esi + PUSH edi + movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 + mov frame, esp + lea t, [frame - (save_frame + 4)] + movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 + and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0] + mov [t + save_frame], frame + pxor xmm4, xmm4 ;A: w4[i] = 0; + punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 + pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11 + pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11 + punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15 + punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13 + pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17 + ;A: (Row 0, offset 1) + pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0); + paddw xmm0, xmm4 ;A: w0[i] += w4[i]; + movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i]; + + movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- -- + pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- -- + pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12 + movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55 + movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12 + punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51 + pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12 + pxor xmm4, xmm4 ;A: w4[i] = 0; + psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- -- + pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0); + pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12 + ; (Row 1, offset 1) + pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0); + paddw xmm1, xmm4 ;B: w1[i] += w4[i]; + movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i]; + pxor xmm4, xmm4 ;B: w4[i] = 0; + pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? 
-1 : 0); + + packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i] + ; w/ signed saturation + + pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- -- + pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- -- + pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 -- + pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35 + ; (Row 3, offset 1) + pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0); + paddw xmm3, xmm4 ;D: w3[i] += w4[i]; + movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i]; + pxor xmm4, xmm4 ;D: w4[i] = 0; + pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0); + + pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51 + pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51 + pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51 + pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51 + pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51 + pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27 + ; (Row 2, offset 1) + pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0); + paddw xmm2, xmm4 ;C: w2[i] += w4[i]; + movsx code_temp, word [block] ;Z: code_temp = block[0]; + +; %1 - stack pointer adjustment +%macro GET_SYM_BEFORE 1 + movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2 + ;C: t[i+16] = w2[i]; + pxor xmm4, xmm4 ;C: w4[i] = 0; + pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? 
-1 : 0); + sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val; + + packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i] + ; w/ signed saturation + + movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55 + pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i); + pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i); + movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63 + punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63 + shl index_temp, 16 ;Z: index_temp <<= 16; + psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 -- + pxor xmm2, xmm2 ;H: w2[i] = 0; + pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 -- + or index, index_temp ;Z: index |= index_temp; +%undef index_temp +%define free_bits edi +%endmacro + +%macro GET_SYM_AFTER 0 + movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- -- + unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59 + pxor xmm0, xmm0 ;H: w0[i] = 0; + not index ;Z: index = ~index; + pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 -- + ; (Row 7, offset 1) + pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0); + mov dctbl, [frame + arg_dctbl] + paddw xmm3, xmm2 ;H: w3[i] += w2[i]; + movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i]; + movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- -- + pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? 
-1 : 0); + punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47 + movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47 + pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF; +%endmacro + + GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER + + psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 -- + shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59 + pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 -- + pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58 + pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 -- + pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58 + cmp code_temp, 1 << 31 ;Z: Set CF if code_temp < 0x80000000, + ;Z: i.e. if code_temp is positive + pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 -- + movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58 + pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 -- + pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58 + pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53 + ; (Row 6, offset 1) + adc code_temp, -1 ;Z: code_temp += -1 + (code_temp >= 0 ? 1 : 0); + pxor xmm2, xmm2 ;G: w2[i] = 0; + pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0); + pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58 + paddw xmm4, xmm0 ;G: w4[i] += w0[i]; + movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i]; + movd mm_temp, code_temp ;Z: temp = code_temp + pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58 + ; (Row 5, offset 1) + pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0); + + packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i] + ; w/ signed saturation + + lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1] + pxor xmm0, xmm0 ;F: w0[i] = 0; + pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? 
-1 : 0); + paddw xmm1, xmm2 ;F: w1[i] += w2[i]; + movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i]; + pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0); + pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59 + pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59 + pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59 + pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29 + ; (Row 4, offset 1) +%undef block +%define nbits edx +%define nbitsb dl +%define nbitsh dh + movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp); +%undef code_temp +%define state esi + pxor xmm2, xmm2 ;E: w2[i] = 0; + mov state, [frame + arg_state] + movd mm_nbits, nbits ;Z: nbits --> MMX register + pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0); + movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4] + ;Z: code = dctbl->ehufco[nbits]; +%define size ecx +%define sizeb cl +%define sizeh ch + paddw xmm5, xmm0 ;E: w5[i] += w0[i]; + movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i]; + movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits] + ;Z: size = dctbl->ehufsi[nbits]; +%undef dctbl + pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? 
-1 : 0); + + packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i] + ; w/ signed saturation + + movq mm_put_buffer, [state + working_state.cur.put_buffer.simd] + ;Z: put_buffer = state->cur.put_buffer.simd; + mov free_bits, [state + working_state.cur.free_bits] + ;Z: free_bits = state->cur.free_bits; +%undef state +%define actbl esi + mov actbl, [frame + arg_actbl] +%define buffer eax + mov buffer, [frame + arg_buffer] +%undef frame + jmp .BEGIN + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +; size <= 32, so this is not really a loop +.BRLOOP1: ; .BRLOOP1: + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0] + ; nbits = actbl->ehufsi[0xf0]; + movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4] + ; code = actbl->ehufco[0xf0]; + and index, 0x7ffffff ; clear index if size == 32 + sub size, 16 ; size -= 16; + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1; + movd mm_nbits, nbits ; nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + por mm_put_buffer, mm_code ; put_buffer |= code; + jmp .ERLOOP1 ; goto .ERLOOP1; + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +%ifdef PIC + times 6 nop +%else + times 2 nop +%endif +.BLOOP1: ; do { /* size = # of zero bits/elements to skip */ +; if size == 32, index remains unchanged. Correct in .BRLOOP. 
+ shr index, sizeb ; index >>= size; + lea t, [t + size * SIZEOF_WORD] ; t += size; + cmp size, 16 ; if (size > 16) + jg .BRLOOP1 ; goto .BRLOOP1; +.ERLOOP1: ; .ERLOOP1: + movsx nbits, word [t] ; nbits = *t; +%ifdef PIC + add size, size ; size += size; +%else + lea size, [size * 2] ; size += size; +%endif + movd mm_temp, nbits ; temp = nbits; + movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits); + lea size, [size * 8 + nbits] ; size = size * 8 + nbits; + movd mm_nbits, nbits ; nbits --> MMX register + movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4] + ; code = actbl->ehufco[size-16]; + movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)] + ; size = actbl->ehufsi[size-16]; +.BEGIN: ; .BEGIN: + pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1; + psllq mm_code, mm_nbits ; code <<= nbits; + add nbits, size ; nbits += size; + por mm_code, mm_temp ; code |= temp; + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1 + xor size, size ; size = 0; /* kill tzcnt input dependency */ + tzcnt size, index ; size = # of trailing 0 bits in index + movd mm_nbits, nbits ; nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + inc size ; ++size; + por mm_put_buffer, mm_code ; put_buffer |= code; + test index, index + jnz .BLOOP1 ; } while (index != 0); +; Round 2 +; t points to the last used word, possibly below t_ if the previous index had 32 zero bits. 
+.ELOOP1: ; .ELOOP1: + pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i); + pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i); + shl size, 16 ; size <<= 16; + or index, size ; index |= size; + not index ; index = ~index; + lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD] + ; nbits = t + 1 + 64; + and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */ + sub nbits, t ; nbits -= t; + shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */ + tzcnt size, index ; size = # of trailing 0 bits in index + inc size ; ++size; + test index, index ; if (index == 0) + jz .ELOOP2 ; goto .ELOOP2; +; NOTE: size == 32 cannot happen, since the last element is always 0. + shr index, sizeb ; index >>= size; + lea size, [size + nbits - 33] ; size = size + nbits - 33; + lea t, [t + size * SIZEOF_WORD] ; t += size; + cmp size, 16 ; if (size <= 16) + jle .ERLOOP2 ; goto .ERLOOP2; +.BRLOOP2: ; do { + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0] + ; nbits = actbl->ehufsi[0xf0]; + sub size, 16 ; size -= 16; + movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4] + ; code = actbl->ehufco[0xf0]; + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jle .EMIT_BRLOOP2 ; insert code and flush put_buffer + movd mm_nbits, nbits ; else { nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + por mm_put_buffer, mm_code ; put_buffer |= code; + cmp size, 16 ; if (size <= 16) + jle .ERLOOP2 ; goto .ERLOOP2; + jmp .BRLOOP2 ; } while (1); + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.BLOOP2: ; do { /* size = # of zero bits/elements to skip */ + shr index, sizeb ; index >>= size; + lea t, [t + size * SIZEOF_WORD] ; t += size; + cmp size, 16 ; if (size > 16) + jg .BRLOOP2 ; goto .BRLOOP2; +.ERLOOP2: ; .ERLOOP2: + movsx nbits, word [t] ; nbits = *t; + add size, size ; size += size; + movd mm_temp, nbits ; temp = nbits; + movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits); + movd 
mm_nbits, nbits ; nbits --> MMX register + lea size, [size * 8 + nbits] ; size = size * 8 + nbits; + movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4] + ; code = actbl->ehufco[size-16]; + movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)] + ; size = actbl->ehufsi[size-16]; + psllq mm_code, mm_nbits ; code <<= nbits; + pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1; + lea nbits, [nbits + size] ; nbits += size; + por mm_code, mm_temp ; code |= temp; + xor size, size ; size = 0; /* kill tzcnt input dependency */ + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2 + tzcnt size, index ; size = # of trailing 0 bits in index + movd mm_nbits, nbits ; nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + inc size ; ++size; + por mm_put_buffer, mm_code ; put_buffer |= code; + test index, index + jnz .BLOOP2 ; } while (index != 0); +.ELOOP2: ; .ELOOP2: + mov nbits, t ; nbits = t; + lea t, [t + SIZEOF_WORD] ; t = &t[1]; + and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127; + and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */ + cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2) + je .EFN ; { + movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0] + ; code = actbl->ehufco[0]; + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0] + ; nbits = actbl->ehufsi[0]; + sub free_bits, nbits ; if ((free_bits -= nbits) <= 0) + jg .EFN_SKIP_EMIT_CODE ; { + EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer + align 16 +.EFN_SKIP_EMIT_CODE: ; } else { + movd mm_nbits, nbits ; nbits --> MMX register + psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits; + por mm_put_buffer, mm_code ; put_buffer |= code; +.EFN: ; } } +%define frame esp + mov frame, [t + save_frame] +%define state ecx + mov state, [frame + arg_state] + movq [state + working_state.cur.put_buffer.simd], mm_put_buffer + ; 
state->cur.put_buffer.simd = put_buffer; + emms + mov [state + working_state.cur.free_bits], free_bits + ; state->cur.free_bits = free_bits; + POP edi + POP esi + POP ebp + POP ebx + ret + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_BRLOOP1: + EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , , , \ + .ERLOOP1 + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_ERLOOP1: + EMIT_QWORD size, sizeb, sizeh, \ + { xor size, size }, \ + { tzcnt size, index }, \ + { inc size }, \ + { test index, index }, \ + { jnz .BLOOP1 }, \ + .ELOOP1 + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_BRLOOP2: + EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , \ + { cmp size, 16 }, \ + { jle .ERLOOP2 }, \ + .BRLOOP2 + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_ERLOOP2: + EMIT_QWORD size, sizeb, sizeh, \ + { xor size, size }, \ + { tzcnt size, index }, \ + { inc size }, \ + { test index, index }, \ + { jnz .BLOOP2 }, \ + .ELOOP2 + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcphuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcphuff-sse2.asm new file mode 100644 index 0000000000..c26b48a47d --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcphuff-sse2.asm @@ -0,0 +1,662 @@ +; +; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2) +; +; Copyright (C) 2016, 2018, Matthieu Darbois +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation of data preparation for progressive +; Huffman encoding. See jcphuff.c for more details. + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +; -------------------------------------------------------------------------- +; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and +; jsimd_encode_mcu_AC_refine_prepare_sse2() + +%macro LOAD16 0 + pxor N0, N0 + pxor N1, N1 + + mov T0, INT [LUT + 0*SIZEOF_INT] + mov T1, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0, INT [LUT + 1*SIZEOF_INT] + mov T1, INT [LUT + 9*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + pinsrw X1, word [BLOCK + T1 * 2], 1 + + mov T0, INT [LUT + 2*SIZEOF_INT] + mov T1, INT [LUT + 10*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + pinsrw X1, word [BLOCK + T1 * 2], 2 + + mov T0, INT [LUT + 3*SIZEOF_INT] + mov T1, INT [LUT + 11*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + pinsrw X1, word [BLOCK + T1 * 2], 3 + + mov T0, INT [LUT + 4*SIZEOF_INT] + mov T1, INT [LUT + 12*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + pinsrw X1, word [BLOCK + T1 * 2], 4 + + mov T0, INT [LUT + 5*SIZEOF_INT] + mov T1, INT [LUT + 13*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + pinsrw X1, word [BLOCK + T1 * 2], 5 + + mov T0, INT [LUT + 6*SIZEOF_INT] + mov T1, INT [LUT + 14*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + pinsrw X1, word [BLOCK + T1 * 2], 6 + + mov T0, INT [LUT + 7*SIZEOF_INT] + mov T1, INT [LUT + 15*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 + pinsrw X1, word [BLOCK + T1 * 2], 7 +%endmacro + +%macro LOAD15 0 + pxor N0, N0 + pxor N1, N1 + pxor X1, X1 + + mov T0, INT [LUT + 0*SIZEOF_INT] + mov T1, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + 
T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 + + cmp LENEND, 2 + jl %%.ELOAD15 + mov T1, INT [LUT + 9*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD15 + mov T1, INT [LUT + 10*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 2 + + cmp LENEND, 4 + jl %%.ELOAD15 + mov T1, INT [LUT + 11*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD15 + mov T1, INT [LUT + 12*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD15 + mov T1, INT [LUT + 13*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD15 + mov T1, INT [LUT + 14*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 6 +%%.ELOAD15: +%endmacro + +%macro LOAD8 0 + pxor N0, N0 + + mov T0, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + + mov T0, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 +%endmacro + +%macro LOAD7 0 + pxor N0, N0 + pxor X0, X0 + + mov T1, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + 
T1 * 2], 0 + + cmp LENEND, 2 + jl %%.ELOAD7 + mov T1, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD7 + mov T1, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 2 + + cmp LENEND, 4 + jl %%.ELOAD7 + mov T1, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD7 + mov T1, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD7 + mov T1, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD7 + mov T1, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 6 +%%.ELOAD7: +%endmacro + +%macro REDUCE0 0 + movdqa xmm0, XMMWORD [VALUES + ( 0*2)] + movdqa xmm1, XMMWORD [VALUES + ( 8*2)] + movdqa xmm2, XMMWORD [VALUES + (16*2)] + movdqa xmm3, XMMWORD [VALUES + (24*2)] + movdqa xmm4, XMMWORD [VALUES + (32*2)] + movdqa xmm5, XMMWORD [VALUES + (40*2)] + movdqa xmm6, XMMWORD [VALUES + (48*2)] + + pcmpeqw xmm0, ZERO + pcmpeqw xmm1, ZERO + pcmpeqw xmm2, ZERO + pcmpeqw xmm3, ZERO + pcmpeqw xmm4, ZERO + pcmpeqw xmm5, ZERO + pcmpeqw xmm6, ZERO + pcmpeqw xmm7, XMMWORD [VALUES + (56*2)] + + packsswb xmm0, xmm1 + packsswb xmm2, xmm3 + packsswb xmm4, xmm5 + packsswb xmm6, xmm7 + + pmovmskb eax, xmm0 + pmovmskb ecx, xmm2 + pmovmskb edx, xmm4 + pmovmskb esi, xmm6 + + shl ecx, 16 + shl esi, 16 + + or eax, ecx + or edx, esi + + not eax + not edx + + mov edi, ZEROBITS + + mov INT [edi], eax + mov INT [edi+SIZEOF_INT], edx +%endmacro + +; +; Prepare data for jsimd_encode_mcu_AC_first(). 
+; +; GLOBAL(void) +; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *values, +; size_t *zerobits) +; +; eax + 8 = const JCOEF *block +; eax + 12 = const int *jpeg_natural_order_start +; eax + 16 = int Sl +; eax + 20 = int Al +; eax + 24 = JCOEF *values +; eax + 28 = size_t *zerobits + +%define ZERO xmm7 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define LENEND eax +%define LUT ebx +%define T0 ecx +%define T1 edx +%define BLOCK esi +%define VALUES edi +%define LEN ebp + +%define ZEROBITS INT [esp + 5 * 4] + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + sub esp, 4 + push ebx + push ecx +; push edx ; need not be preserved + push esi + push edi + push ebp + + mov BLOCK, INT [eax + 8] + mov LUT, INT [eax + 12] + mov VALUES, INT [eax + 24] + movd AL, INT [eax + 20] + mov T0, INT [eax + 28] + mov ZEROBITS, T0 + mov LEN, INT [eax + 16] + pxor ZERO, ZERO + mov K, LEN + and K, -16 + shr K, 4 + jz .ELOOP16 +.BLOOP16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + dec K + jnz .BLOOP16 + test LEN, 15 + je .PADDING +.ELOOP16: + mov LENEND, LEN + and LENEND, 7 + + test LEN, 8 + jz .TRY7 + test LEN, 7 + jz .TRY8 + + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + 
movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + jmp .PADDING +.TRY8: + LOAD8 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 + jmp .PADDING +.TRY7: + LOAD7 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 +.PADDING: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDING + align 16 +.ZEROLOOP: + movdqa XMMWORD [VALUES + 0], ZERO + add VALUES, 8*2 + inc K + jnz .ZEROLOOP +.EPADDING: + sub VALUES, DCTSIZE2*2 + + REDUCE0 + + pop ebp + pop edi + pop esi +; pop edx ; need not be preserved + pop ecx + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +%undef ZERO +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef LUT +%undef T0 +%undef T1 +%undef BLOCK +%undef VALUES +%undef LEN + +; +; Prepare data for jsimd_encode_mcu_AC_refine(). 
+; +; GLOBAL(int) +; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *absvalues, +; size_t *bits) +; +; eax + 8 = const JCOEF *block +; eax + 12 = const int *jpeg_natural_order_start +; eax + 16 = int Sl +; eax + 20 = int Al +; eax + 24 = JCOEF *values +; eax + 28 = size_t *bits + +%define ZERO xmm7 +%define ONE xmm5 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define LENEND eax +%define LUT ebx +%define T0 ecx +%define T0w cx +%define T1 edx +%define BLOCK esi +%define VALUES edi +%define KK ebp + +%define ZEROBITS INT [esp + 5 * 4] +%define EOB INT [esp + 5 * 4 + 4] +%define LEN INT [esp + 5 * 4 + 8] + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + sub esp, 16 + push ebx + push ecx +; push edx ; need not be preserved + push esi + push edi + push ebp + + pcmpeqw ONE, ONE + psrlw ONE, 15 + mov BLOCK, INT [eax + 8] + mov LUT, INT [eax + 12] + mov VALUES, INT [eax + 24] + movd AL, INT [eax + 20] + mov T0, INT [eax + 28] + mov K, INT [eax + 16] + mov INT [T0 + 2 * SIZEOF_INT], -1 + mov INT [T0 + 3 * SIZEOF_INT], -1 + mov ZEROBITS, T0 + mov LEN, K + pxor ZERO, ZERO + and K, -16 + mov EOB, 0 + xor KK, KK + shr K, 4 + jz .ELOOPR16 +.BLOOPR16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = 
_mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER16 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER16: + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + add KK, 2 + dec K + jnz .BLOOPR16 + test LEN, 15 + je .PADDINGR +.ELOOPR16: + mov LENEND, LEN + + test LENEND, 8 + jz .TRYR7 + test LENEND, 7 + jz .TRYR8 + + and LENEND, 7 + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER15 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER15: + add VALUES, 16*2 + jmp .PADDINGR +.TRYR8: + LOAD8 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER8 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER8: + add VALUES, 8*2 + jmp .PADDINGR +.TRYR7: + and LENEND, 7 + LOAD7 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - 
(__builtin_clz(idx)>>1); + jz .CONTINUER7 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER7: + add VALUES, 8*2 +.PADDINGR: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDINGR + align 16 +.ZEROLOOPR: + movdqa XMMWORD [VALUES + 0], ZERO + add VALUES, 8*2 + inc K + jnz .ZEROLOOPR +.EPADDINGR: + sub VALUES, DCTSIZE2*2 + + REDUCE0 + + mov eax, EOB + + pop ebp + pop edi + pop esi +; pop edx ; need not be preserved + pop ecx + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +%undef ZERO +%undef ONE +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef KK +%undef EOB +%undef SIGN +%undef LUT +%undef T0 +%undef T1 +%undef BLOCK +%undef VALUES +%undef LEN +%undef LENEND + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm new file mode 100644 index 0000000000..0a20802dd8 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm @@ -0,0 +1,388 @@ +; +; jcsample.asm - downsampling (AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2) + +EXTN(jsimd_h2v1_downsample_avx2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00010000 ; bias 
pattern + vmovd xmm7, edx + vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7} + vpcmpeqw ymm6, ymm6, ymm6 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_YMMWORD + jae short .columnloop + alignx 16, 7 + +.columnloop_r24: + ; ecx can possibly be 8, 16, 24 + cmp ecx, 24 + jne .columnloop_r16 + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD] + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r16: + cmp ecx, 16 + jne .columnloop_r8 + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + vpxor ymm1, ymm1, ymm1 + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r8: + vmovdqu xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD] + vpxor ymm1, ymm1, ymm1 + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + alignx 16, 7 + +.columnloop: + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD] + +.downsample: + vpsrlw ymm2, ymm0, BYTE_BIT + vpand ymm0, ymm0, ymm6 + vpsrlw ymm3, ymm1, BYTE_BIT + vpand ymm1, ymm1, ymm6 + + vpaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm3 + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm1, ymm1, ymm7 + vpsrlw ymm0, ymm0, 1 + vpsrlw ymm1, ymm1, 1 + + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 + + sub ecx, byte SIZEOF_YMMWORD ; outcol + add esi, byte 2*SIZEOF_YMMWORD ; inptr + add edi, byte 1*SIZEOF_YMMWORD ; outptr + cmp ecx, byte SIZEOF_YMMWORD + jae short .columnloop + test ecx, ecx + jnz near .columnloop_r24 + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + 
vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2) + +EXTN(jsimd_h2v2_downsample_avx2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + vmovd xmm7, edx + vpcmpeqw 
ymm6, ymm6, ymm6 + vpshufd xmm7, xmm7, 0x00 ; ymm7={1, 2, 1, 2, 1, 2, 1, 2} + vperm2i128 ymm7, ymm7, ymm7, 0 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_YMMWORD + jae short .columnloop + alignx 16, 7 + +.columnloop_r24: + cmp ecx, 24 + jne .columnloop_r16 + vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD] + vmovdqu xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD] + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r16: + cmp ecx, 16 + jne .columnloop_r8 + vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] + vpxor ymm2, ymm2, ymm2 + vpxor ymm3, ymm3, ymm3 + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r8: + vmovdqu xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + vmovdqu xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + vpxor ymm2, ymm2, ymm2 + vpxor ymm3, ymm3, ymm3 + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + alignx 16, 7 + +.columnloop: + vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD] + vmovdqu ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD] + +.downsample: + vpand ymm4, ymm0, ymm6 + vpsrlw ymm0, ymm0, BYTE_BIT + vpand ymm5, ymm1, ymm6 + vpsrlw ymm1, ymm1, BYTE_BIT + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + + vpand ymm4, ymm2, ymm6 + vpsrlw ymm2, ymm2, BYTE_BIT + vpand ymm5, ymm3, ymm6 + vpsrlw ymm3, ymm3, BYTE_BIT + vpaddw ymm2, ymm2, ymm4 + vpaddw ymm3, ymm3, ymm5 + + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm7 + vpsrlw ymm0, ymm0, 
2 + vpsrlw ymm2, ymm2, 2 + + vpackuswb ymm0, ymm0, ymm2 + vpermq ymm0, ymm0, 0xd8 + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 + + sub ecx, byte SIZEOF_YMMWORD ; outcol + add edx, byte 2*SIZEOF_YMMWORD ; inptr0 + add esi, byte 2*SIZEOF_YMMWORD ; inptr1 + add edi, byte 1*SIZEOF_YMMWORD ; outptr + cmp ecx, byte SIZEOF_YMMWORD + jae near .columnloop + test ecx, ecx + jnz near .columnloop_r24 + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm new file mode 100644 index 0000000000..2c223eebe8 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm @@ -0,0 +1,324 @@ +; +; jcsample.asm - downsampling (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. 
+; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx) + +EXTN(jsimd_h2v1_downsample_mmx): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + movd mm7, edx + pcmpeqw mm6, mm6 + punpckldq mm7, mm7 ; mm7={0, 1, 0, 1} + psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov 
esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] + movq mm2, mm0 + movq mm3, mm1 + + pand mm0, mm6 + psrlw mm2, BYTE_BIT + pand mm1, mm6 + psrlw mm3, BYTE_BIT + + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm7 + paddw mm1, mm7 + psrlw mm0, 1 + psrlw mm1, 1 + + packuswb mm0, mm1 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz short .columnloop + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. 
+; +; GLOBAL(void) +; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx) + +EXTN(jsimd_h2v2_downsample_mmx): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + movd mm7, edx + pcmpeqw mm6, mm6 + punpckldq mm7, mm7 ; mm7={1, 2, 1, 2} + psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 
+ mov edi, JSAMPROW [edi] ; outptr + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] + movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm4, mm0 + movq mm5, mm1 + pand mm0, mm6 + psrlw mm4, BYTE_BIT + pand mm1, mm6 + psrlw mm5, BYTE_BIT + paddw mm0, mm4 + paddw mm1, mm5 + + movq mm4, mm2 + movq mm5, mm3 + pand mm2, mm6 + psrlw mm4, BYTE_BIT + pand mm3, mm6 + psrlw mm5, BYTE_BIT + paddw mm2, mm4 + paddw mm3, mm5 + + paddw mm0, mm1 + paddw mm2, mm3 + paddw mm0, mm7 + paddw mm2, mm7 + psrlw mm0, 2 + psrlw mm2, 2 + + packuswb mm0, mm2 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + + add edx, byte 2*SIZEOF_MMWORD ; inptr0 + add esi, byte 2*SIZEOF_MMWORD ; inptr1 + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz near .columnloop + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm new file mode 100644 index 0000000000..4fea60d2e2 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm @@ -0,0 +1,351 @@ +; +; jcsample.asm - downsampling (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2) + +EXTN(jsimd_h2v1_downsample_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + 
pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16, 7 + +.columnloop_r8: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm1, xmm1 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16, 7 + +.columnloop: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + pand xmm0, xmm6 + psrlw xmm2, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm3, BYTE_BIT + + paddw xmm0, xmm2 + paddw xmm1, xmm3 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + psrlw xmm0, 1 + psrlw xmm1, 1 + + packuswb xmm0, xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + test ecx, ecx + jnz short .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. 
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2) + +EXTN(jsimd_h2v2_downsample_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + 
push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16, 7 + +.columnloop_r8: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm2, xmm2 + pxor xmm3, xmm3 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16, 7 + +.columnloop: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + pand xmm0, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + pand xmm2, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm3, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + paddw xmm0, xmm1 + paddw xmm2, xmm3 + paddw xmm0, xmm7 + paddw xmm2, xmm7 + psrlw xmm0, 2 + psrlw xmm2, 2 + + packuswb xmm0, xmm2 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add edx, byte 2*SIZEOF_XMMWORD ; inptr0 + add esi, byte 2*SIZEOF_XMMWORD ; inptr1 + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx, ecx + jnz near .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm new file mode 100644 index 0000000000..015be0416c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm @@ -0,0 +1,515 @@ +; +; jdcolext.asm - colorspace conversion (AVX2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2012, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +%define out_width(b) (b) + 8 ; JDIMENSION out_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define input_row(b) (b) + 16 ; JDIMENSION input_row +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2) + +EXTN(jsimd_ycc_rgb_convert_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16, 7 
+.columnloop: + + vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) + vmovdqu ymm1, YMMWORD [edx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm0, ymm0, ymm0 + vpcmpeqw ymm7, ymm7, ymm7 + vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..} + vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE + vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO + vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE + vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO + + vpaddw ymm2, ymm4, ymm7 + vpaddw ymm3, ymm5, ymm7 + vpaddw ymm6, ymm0, ymm7 + vpaddw ymm7, ymm1, ymm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE + vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO + vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE + vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO + + vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbE * -FIX(0.22800)) + vpmulhw ymm5, ymm5, [GOTOFF(eax,PW_MF0228)] ; ymm5=(2*CbO * -FIX(0.22800)) + vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrE * FIX(0.40200)) + vpmulhw ymm1, ymm1, [GOTOFF(eax,PW_F0402)] ; ymm1=(2*CrO * FIX(0.40200)) + + vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)] + vpaddw ymm5, ymm5, [GOTOFF(eax,PW_ONE)] + vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800)) + vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800)) + vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)] + vpaddw ymm1, ymm1, [GOTOFF(eax,PW_ONE)] + vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200)) + vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200)) + + vpaddw ymm4, ymm4, ymm2 + vpaddw ymm5, ymm5, ymm3 + vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E + vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O + vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * 
FIX(1.40200))=(R-Y)E + vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O + + vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E + vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O + + vpunpckhwd ymm4, ymm2, ymm6 + vpunpcklwd ymm2, ymm2, ymm6 + vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)] + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)] + vpunpckhwd ymm5, ymm3, ymm7 + vpunpcklwd ymm3, ymm3, ymm7 + vpmaddwd ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)] + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)] + + vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)] + vpaddd ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)] + vpsrad ymm2, ymm2, SCALEBITS + vpsrad ymm4, ymm4, SCALEBITS + vpaddd ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)] + vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)] + vpsrad ymm3, ymm3, SCALEBITS + vpsrad ymm5, ymm5, SCALEBITS + + vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + vmovdqu ymm5, YMMWORD [esi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm4, ymm4, ymm4 + vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..} + vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE + vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO + + vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU) + vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV) + vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********) + vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********) + + vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU) + vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV) + vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********) + vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********) + + vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; 
ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU) + vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV) + vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********) + vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F + ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V) + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F + ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V) + + vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G + ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --) + vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F + ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07 + ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N) + + vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H + ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --) + + vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H + ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --) + vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 
0E 1E 1F 2F 0G 1G + ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --) + vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18 + ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O) + + vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H + ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --) + vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29 + ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P) + + vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03 + ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J) + vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14 + ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K) + vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29 + ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P) + vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07 + ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N) + + vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B + ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R) + vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C + ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S) + vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H + ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --) + vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V) + + vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 
2T 0U 1U 2U 0V 1V 2V) + + vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test edi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD + jz near .nextrow + + add esi, byte SIZEOF_YMMWORD ; inptr0 + add ebx, byte SIZEOF_YMMWORD ; inptr1 + add edx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st64: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_YMMWORD + jb short .column_st32 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + add edi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmF + sub ecx, byte 2*SIZEOF_YMMWORD + jmp short .column_st31 +.column_st32: + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st31 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + add edi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub ecx, byte SIZEOF_YMMWORD + jmp short .column_st31 +.column_st31: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub ecx, byte 
SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + vmovq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + vpsrldq xmmA, xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + vmovd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + vpsrldq xmmA, xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + vmovd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + mov byte [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%else + vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%endif + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **) + ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + 
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U) + vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V) + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V) + + vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E + ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36 + ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M) + vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F + ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V) + vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37 + ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N) + + vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + + vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vperm2i128 ymmH, ymmG, ymmF, 
0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test edi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC + vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC + vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH +.out0: + add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD + jz near .nextrow + + add esi, byte SIZEOF_YMMWORD ; inptr0 + add ebx, byte SIZEOF_YMMWORD ; inptr1 + add edx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st64: + cmp ecx, byte SIZEOF_YMMWORD/2 + jb short .column_st32 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + add edi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmC + vmovdqa ymmD, ymmH + sub ecx, byte SIZEOF_YMMWORD/2 +.column_st32: + cmp ecx, byte SIZEOF_YMMWORD/4 + jb short .column_st16 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + add edi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub ecx, byte SIZEOF_YMMWORD/4 +.column_st16: + cmp ecx, byte SIZEOF_YMMWORD/8 + jb short .column_st15 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + vperm2i128 ymmA, ymmA, ymmA, 1 + add edi, byte SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD/8 +.column_st15: + ; Store two pixels (8 bytes) of ymmA to the output when it has enough + ; space. 
+ cmp ecx, byte SIZEOF_YMMWORD/16 + jb short .column_st7 + vmovq MMWORD [edi], xmmA + add edi, byte SIZEOF_YMMWORD/16*4 + sub ecx, byte SIZEOF_YMMWORD/16 + vpsrldq xmmA, SIZEOF_YMMWORD/16*4 +.column_st7: + ; Store one pixel (4 bytes) of ymmA to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + vmovd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16, 7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm new file mode 100644 index 0000000000..5813cfcb66 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm @@ -0,0 +1,404 @@ +; +; jdcolext.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +%define out_width(b) (b) + 8 ; JDIMENSION out_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define input_row(b) (b) + 16 ; JDIMENSION input_row +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx) + +EXTN(jsimd_ycc_rgb_convert_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + push eax + 
push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16, 7 +.columnloop: + + movq mm5, MMWORD [ebx] ; mm5=Cb(01234567) + movq mm1, MMWORD [edx] ; mm1=Cr(01234567) + + pcmpeqw mm4, mm4 + pcmpeqw mm7, mm7 + psrlw mm4, BYTE_BIT + psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + movq mm0, mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..} + + pand mm4, mm5 ; mm4=Cb(0246)=CbE + psrlw mm5, BYTE_BIT ; mm5=Cb(1357)=CbO + pand mm0, mm1 ; mm0=Cr(0246)=CrE + psrlw mm1, BYTE_BIT ; mm1=Cr(1357)=CrO + + paddw mm4, mm7 + paddw mm5, mm7 + paddw mm0, mm7 + paddw mm1, mm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movq mm2, mm4 ; mm2=CbE + movq mm3, mm5 ; mm3=CbO + paddw mm4, mm4 ; mm4=2*CbE + paddw mm5, mm5 ; mm5=2*CbO + movq mm6, mm0 ; mm6=CrE + movq mm7, mm1 ; mm7=CrO + paddw mm0, mm0 ; mm0=2*CrE + paddw mm1, mm1 ; mm1=2*CrO + + pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800)) + pmulhw mm5, [GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800)) + pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200)) + pmulhw mm1, [GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200)) + + paddw mm4, [GOTOFF(eax,PW_ONE)] + paddw mm5, [GOTOFF(eax,PW_ONE)] + psraw mm4, 1 ; mm4=(CbE * -FIX(0.22800)) + psraw mm5, 1 ; mm5=(CbO * -FIX(0.22800)) + paddw mm0, [GOTOFF(eax,PW_ONE)] + paddw mm1, [GOTOFF(eax,PW_ONE)] + psraw mm0, 1 ; mm0=(CrE * FIX(0.40200)) + psraw mm1, 1 ; mm1=(CrO * FIX(0.40200)) + + paddw mm4, mm2 + paddw mm5, mm3 + paddw mm4, mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E + paddw mm5, mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O + paddw mm0, mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E 
+ paddw mm1, mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O + + movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E + movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O + + movq mm4, mm2 + movq mm5, mm3 + punpcklwd mm2, mm6 + punpckhwd mm4, mm6 + pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm4, [GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd mm3, mm7 + punpckhwd mm5, mm7 + pmaddwd mm3, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)] + + paddd mm2, [GOTOFF(eax,PD_ONEHALF)] + paddd mm4, [GOTOFF(eax,PD_ONEHALF)] + psrad mm2, SCALEBITS + psrad mm4, SCALEBITS + paddd mm3, [GOTOFF(eax,PD_ONEHALF)] + paddd mm5, [GOTOFF(eax,PD_ONEHALF)] + psrad mm3, SCALEBITS + psrad mm5, SCALEBITS + + packssdw mm2, mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw mm3, mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw mm2, mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw mm3, mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movq mm5, MMWORD [esi] ; mm5=Y(01234567) + + pcmpeqw mm4, mm4 + psrlw mm4, BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..} + pand mm4, mm5 ; mm4=Y(0246)=YE + psrlw mm5, BYTE_BIT ; mm5=Y(1357)=YO + + paddw mm0, mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) + paddw mm1, mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) + packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) + packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) + + paddw mm2, mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) + paddw mm3, mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) + packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) + packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) + + paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) + paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) + packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) + packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** 
** **) + ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) + + punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07) + punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27) + + movq mmG, mmA + movq mmH, mmA + punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03) + punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07) + + psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) + psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) + + movq mmC, mmD + movq mmB, mmD + punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14) + punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --) + + psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) + + movq mmF, mmE + punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25) + punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --) + + punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12) + punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05) + punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow + + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16, 7 + +.column_st16: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_MMWORD + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq mmA, mmC + sub ecx, byte 2*SIZEOF_MMWORD + add edi, byte 2*SIZEOF_MMWORD + jmp short .column_st4 +.column_st8: + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA, mmE + sub ecx, byte SIZEOF_MMWORD + add edi, byte SIZEOF_MMWORD +.column_st4: + movd eax, mmA + cmp ecx, byte SIZEOF_DWORD + jb short .column_st2 + mov dword 
[edi+0*SIZEOF_DWORD], eax + psrlq mmA, DWORD_BIT + movd eax, mmA + sub ecx, byte SIZEOF_DWORD + add edi, byte SIZEOF_DWORD +.column_st2: + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi+0*SIZEOF_WORD], ax + shr eax, WORD_BIT + sub ecx, byte SIZEOF_WORD + add edi, byte SIZEOF_WORD +.column_st1: + cmp ecx, byte SIZEOF_BYTE + jb short .nextrow + mov byte [edi+0*SIZEOF_BYTE], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%else + pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%endif + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) + + punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36) + punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17) + punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37) + + movq mmC, mmA + punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32) + punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36) + movq mmG, mmB + punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33) + punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37) + + movq mmD, mmA + punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31) + punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33) + movq mmH, mmC + punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35) + punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+3*SIZEOF_MMWORD], mmH + + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow + + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 
+ add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16, 7 + +.column_st16: + cmp ecx, byte SIZEOF_MMWORD/2 + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq mmA, mmC + movq mmD, mmH + sub ecx, byte SIZEOF_MMWORD/2 + add edi, byte 2*SIZEOF_MMWORD +.column_st8: + cmp ecx, byte SIZEOF_MMWORD/4 + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA, mmD + sub ecx, byte SIZEOF_MMWORD/4 + add edi, byte 1*SIZEOF_MMWORD +.column_st4: + cmp ecx, byte SIZEOF_MMWORD/8 + jb short .nextrow + movd dword [edi+0*SIZEOF_DWORD], mmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16, 7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm new file mode 100644 index 0000000000..d5572b3294 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm @@ -0,0 +1,458 @@ +; +; jdcolext.asm - colorspace conversion (SSE2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2012, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +%define out_width(b) (b) + 8 ; JDIMENSION out_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define input_row(b) (b) + 16 ; JDIMENSION input_row +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2) + +EXTN(jsimd_ycc_rgb_convert_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY 
[edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16, 7 +.columnloop: + + movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + pcmpeqw xmm7, xmm7 + psrlw xmm4, BYTE_BIT + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4, xmm7 + paddw xmm5, xmm7 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2, xmm4 ; xmm2=CbE + movdqa xmm3, xmm5 ; xmm3=CbO + paddw xmm4, xmm4 ; xmm4=2*CbE + paddw xmm5, xmm5 ; xmm5=2*CbO + movdqa xmm6, xmm0 ; xmm6=CrE + movdqa xmm7, xmm1 ; xmm7=CrO + paddw xmm0, xmm0 ; xmm0=2*CrE + paddw xmm1, xmm1 ; xmm1=2*CrO + + pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4, [GOTOFF(eax,PW_ONE)] + paddw xmm5, [GOTOFF(eax,PW_ONE)] + psraw 
xmm4, 1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0, [GOTOFF(eax,PW_ONE)] + paddw xmm1, [GOTOFF(eax,PW_ONE)] + psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4, xmm2 + paddw xmm5, xmm3 + paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + punpcklwd xmm2, xmm6 + punpckhwd xmm4, xmm6 + pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm3, xmm7 + punpckhwd xmm5, xmm7 + pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm2, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm4, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm2, SCALEBITS + psrad xmm4, SCALEBITS + paddd xmm3, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm5, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm3, SCALEBITS + psrad xmm5, SCALEBITS + + packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + 
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 
28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st32: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. 
+ cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + mov byte [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 
35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st32: + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub ecx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. 
+ cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + movd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16, 7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm new file mode 100644 index 0000000000..e05b60d001 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm @@ -0,0 +1,118 @@ +; +; jdcolor.asm - colorspace conversion (AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point constants.  FIX(x) is x scaled by 2^SCALEBITS and rounded,
+; e.g. FIX(0.34414) = round(0.34414 * 65536) = 22554.
+%define SCALEBITS 16
+
+F_0_344  equ  22554              ; FIX(0.34414)
+F_0_714  equ  46802              ; FIX(0.71414)
+F_1_402  equ  91881              ; FIX(1.40200)
+F_1_772  equ  116130             ; FIX(1.77200)
+; The next three values are offset by a whole multiple of FIX(1) = 65536.
+; Unlike 46802, 91881 and 116130 they fit in a signed 16-bit word, which is
+; the element size (dw) used for them in the table below.
+F_0_402  equ  (F_1_402 - 65536)  ; FIX(1.40200) - FIX(1)
+F_0_285  equ  ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228  equ  (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION SEG_CONST
+
+; Constant table labelled jconst_ycc_rgb_convert_avx2 (declared through
+; GLOBAL_DATA).  Every row is 32 bytes: 16 words (PW_*) or 8 dwords (PD_*).
+; PW_MF0344_F0285 interleaves the pair (-F_0_344, F_0_285) eight times;
+; PD_ONEHALF holds 1 << 15, i.e. one half in SCALEBITS fixed point.
+    alignz 32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402         times 16 dw  F_0_402
+PW_MF0228        times 16 dw -F_0_228
+PW_MF0344_F0285  times 8 dw -F_0_344, F_0_285
+PW_ONE           times 16 dw  1
+PD_ONEHALF       times 8 dd  1 << (SCALEBITS - 1)
+
+    alignz 32
+
+; --------------------------------------------------------------------------
+    SECTION SEG_TEXT
+    BITS 32
+
+; jdcolext-avx2.asm is included seven times.  The first inclusion uses
+; whatever RGB_* values and function name are in effect at this point
+; (NOTE(review): presumably defaults supplied via jsimdext.inc -- confirm).
+; Each later inclusion first rebinds RGB_RED/RGB_GREEN/RGB_BLUE/
+; RGB_PIXELSIZE to one EXT_* pixel layout and redefines
+; jsimd_ycc_rgb_convert_avx2 as the layout-specific symbol name.
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm
new file mode 100644
index 0000000000..fb7e7bcce4
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point constants.  FIX(x) is x scaled by 2^SCALEBITS and rounded,
+; e.g. FIX(0.34414) = round(0.34414 * 65536) = 22554.
+%define SCALEBITS 16
+
+F_0_344  equ  22554              ; FIX(0.34414)
+F_0_714  equ  46802              ; FIX(0.71414)
+F_1_402  equ  91881              ; FIX(1.40200)
+F_1_772  equ  116130             ; FIX(1.77200)
+; The next three values are offset by a whole multiple of FIX(1) = 65536.
+; Unlike 46802, 91881 and 116130 they fit in a signed 16-bit word, which is
+; the element size (dw) used for them in the table below.
+F_0_402  equ  (F_1_402 - 65536)  ; FIX(1.40200) - FIX(1)
+F_0_285  equ  ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228  equ  (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION SEG_CONST
+
+; Constant table labelled jconst_ycc_rgb_convert_mmx (declared through
+; GLOBAL_DATA).  Every row is 8 bytes: 4 words (PW_*) or 2 dwords (PD_*).
+; PW_MF0344_F0285 interleaves the pair (-F_0_344, F_0_285) twice;
+; PD_ONEHALF holds 1 << 15, i.e. one half in SCALEBITS fixed point.
+    alignz 32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402         times 4 dw  F_0_402
+PW_MF0228        times 4 dw -F_0_228
+PW_MF0344_F0285  times 2 dw -F_0_344, F_0_285
+PW_ONE           times 4 dw  1
+PD_ONEHALF       times 2 dd  1 << (SCALEBITS - 1)
+
+    alignz 32
+
+; --------------------------------------------------------------------------
+    SECTION SEG_TEXT
+    BITS 32
+
+; jdcolext-mmx.asm is included seven times.  The first inclusion uses
+; whatever RGB_* values and function name are in effect at this point
+; (NOTE(review): presumably defaults supplied via jsimdext.inc -- confirm).
+; Each later inclusion first rebinds RGB_RED/RGB_GREEN/RGB_BLUE/
+; RGB_PIXELSIZE to one EXT_* pixel layout and redefines
+; jsimd_ycc_rgb_convert_mmx as the layout-specific symbol name.
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
+%include "jdcolext-mmx.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b736255317
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point constants.  FIX(x) is x scaled by 2^SCALEBITS and rounded,
+; e.g. FIX(0.34414) = round(0.34414 * 65536) = 22554.
+%define SCALEBITS 16
+
+F_0_344  equ  22554              ; FIX(0.34414)
+F_0_714  equ  46802              ; FIX(0.71414)
+F_1_402  equ  91881              ; FIX(1.40200)
+F_1_772  equ  116130             ; FIX(1.77200)
+; The next three values are offset by a whole multiple of FIX(1) = 65536.
+; Unlike 46802, 91881 and 116130 they fit in a signed 16-bit word, which is
+; the element size (dw) used for them in the table below.
+F_0_402  equ  (F_1_402 - 65536)  ; FIX(1.40200) - FIX(1)
+F_0_285  equ  ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228  equ  (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION SEG_CONST
+
+; Constant table labelled jconst_ycc_rgb_convert_sse2 (declared through
+; GLOBAL_DATA).  Every row is 16 bytes: 8 words (PW_*) or 4 dwords (PD_*).
+; PW_MF0344_F0285 interleaves the pair (-F_0_344, F_0_285) four times;
+; PD_ONEHALF holds 1 << 15, i.e. one half in SCALEBITS fixed point.
+    alignz 32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402         times 8 dw  F_0_402
+PW_MF0228        times 8 dw -F_0_228
+PW_MF0344_F0285  times 4 dw -F_0_344, F_0_285
+PW_ONE           times 8 dw  1
+PD_ONEHALF       times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz 32
+
+; --------------------------------------------------------------------------
+    SECTION SEG_TEXT
+    BITS 32
+
+; jdcolext-sse2.asm is included seven times.  The first inclusion uses
+; whatever RGB_* values and function name are in effect at this point
+; (NOTE(review): presumably defaults supplied via jsimdext.inc -- confirm).
+; Each later inclusion first rebinds RGB_RED/RGB_GREEN/RGB_BLUE/
+; RGB_PIXELSIZE to one EXT_* pixel layout and redefines
+; jsimd_ycc_rgb_convert_sse2 as the layout-specific symbol name.
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm
new file mode 100644
index 0000000000..711e6792d0
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point constants.  FIX(x) is x scaled by 2^SCALEBITS and rounded,
+; e.g. FIX(0.34414) = round(0.34414 * 65536) = 22554.
+%define SCALEBITS 16
+
+F_0_344  equ  22554              ; FIX(0.34414)
+F_0_714  equ  46802              ; FIX(0.71414)
+F_1_402  equ  91881              ; FIX(1.40200)
+F_1_772  equ  116130             ; FIX(1.77200)
+; The next three values are offset by a whole multiple of FIX(1) = 65536.
+; Unlike 46802, 91881 and 116130 they fit in a signed 16-bit word, which is
+; the element size (dw) used for them in the table below.
+F_0_402  equ  (F_1_402 - 65536)  ; FIX(1.40200) - FIX(1)
+F_0_285  equ  ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228  equ  (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION SEG_CONST
+
+; Constant table labelled jconst_merged_upsample_avx2 (declared through
+; GLOBAL_DATA).  Every row is 32 bytes: 16 words (PW_*) or 8 dwords (PD_*).
+; PW_MF0344_F0285 interleaves the pair (-F_0_344, F_0_285) eight times.
+; jdmrgext-avx2.asm reads the rows as GOTOFF(eax,PW_*) operands of
+; vpmulhw/vpmaddwd/vpaddw, and adds PD_ONEHALF (1 << 15) immediately before
+; its arithmetic right shift by SCALEBITS.
+    alignz 32
+    GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402         times 16 dw  F_0_402
+PW_MF0228        times 16 dw -F_0_228
+PW_MF0344_F0285  times 8 dw -F_0_344, F_0_285
+PW_ONE           times 16 dw  1
+PD_ONEHALF       times 8 dd  1 << (SCALEBITS - 1)
+
+    alignz 32
+
+; --------------------------------------------------------------------------
+    SECTION SEG_TEXT
+    BITS 32
+
+; jdmrgext-avx2.asm is included seven times.  The first inclusion uses
+; whatever RGB_* values and function names are in effect at this point
+; (NOTE(review): presumably defaults supplied via jsimdext.inc -- confirm).
+; Each later inclusion first rebinds RGB_RED/RGB_GREEN/RGB_BLUE/
+; RGB_PIXELSIZE to one EXT_* pixel layout and redefines both
+; jsimd_h2v1_merged_upsample_avx2 and jsimd_h2v2_merged_upsample_avx2 as
+; the layout-specific symbol names.
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm
new file mode 100644
index 0000000000..6e8311d408
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm
@@ -0,0 +1,123 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point constants.  FIX(x) is x scaled by 2^SCALEBITS and rounded,
+; e.g. FIX(0.34414) = round(0.34414 * 65536) = 22554.
+%define SCALEBITS 16
+
+F_0_344  equ  22554              ; FIX(0.34414)
+F_0_714  equ  46802              ; FIX(0.71414)
+F_1_402  equ  91881              ; FIX(1.40200)
+F_1_772  equ  116130             ; FIX(1.77200)
+; The next three values are offset by a whole multiple of FIX(1) = 65536.
+; Unlike 46802, 91881 and 116130 they fit in a signed 16-bit word, which is
+; the element size (dw) used for them in the table below.
+F_0_402  equ  (F_1_402 - 65536)  ; FIX(1.40200) - FIX(1)
+F_0_285  equ  ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228  equ  (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION SEG_CONST
+
+; Constant table labelled jconst_merged_upsample_mmx (declared through
+; GLOBAL_DATA).  Every row is 8 bytes: 4 words (PW_*) or 2 dwords (PD_*).
+; PW_MF0344_F0285 interleaves the pair (-F_0_344, F_0_285) twice;
+; PD_ONEHALF holds 1 << 15, i.e. one half in SCALEBITS fixed point.
+    alignz 32
+    GLOBAL_DATA(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402         times 4 dw  F_0_402
+PW_MF0228        times 4 dw -F_0_228
+PW_MF0344_F0285  times 2 dw -F_0_344, F_0_285
+PW_ONE           times 4 dw  1
+PD_ONEHALF       times 2 dd  1 << (SCALEBITS - 1)
+
+    alignz 32
+
+; --------------------------------------------------------------------------
+    SECTION SEG_TEXT
+    BITS 32
+
+; jdmrgext-mmx.asm is included seven times.  The first inclusion uses
+; whatever RGB_* values and function names are in effect at this point
+; (NOTE(review): presumably defaults supplied via jsimdext.inc -- confirm).
+; Each later inclusion first rebinds RGB_RED/RGB_GREEN/RGB_BLUE/
+; RGB_PIXELSIZE to one EXT_* pixel layout and redefines both
+; jsimd_h2v1_merged_upsample_mmx and jsimd_h2v2_merged_upsample_mmx as
+; the layout-specific symbol names.
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm
new file mode 100644
index
0000000000..e32f90aa17
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point constants.  FIX(x) is x scaled by 2^SCALEBITS and rounded,
+; e.g. FIX(0.34414) = round(0.34414 * 65536) = 22554.
+%define SCALEBITS 16
+
+F_0_344  equ  22554              ; FIX(0.34414)
+F_0_714  equ  46802              ; FIX(0.71414)
+F_1_402  equ  91881              ; FIX(1.40200)
+F_1_772  equ  116130             ; FIX(1.77200)
+; The next three values are offset by a whole multiple of FIX(1) = 65536.
+; Unlike 46802, 91881 and 116130 they fit in a signed 16-bit word, which is
+; the element size (dw) used for them in the table below.
+F_0_402  equ  (F_1_402 - 65536)  ; FIX(1.40200) - FIX(1)
+F_0_285  equ  ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228  equ  (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION SEG_CONST
+
+; Constant table labelled jconst_merged_upsample_sse2 (declared through
+; GLOBAL_DATA).  Every row is 16 bytes: 8 words (PW_*) or 4 dwords (PD_*).
+; PW_MF0344_F0285 interleaves the pair (-F_0_344, F_0_285) four times;
+; PD_ONEHALF holds 1 << 15, i.e. one half in SCALEBITS fixed point.
+    alignz 32
+    GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402         times 8 dw  F_0_402
+PW_MF0228        times 8 dw -F_0_228
+PW_MF0344_F0285  times 4 dw -F_0_344, F_0_285
+PW_ONE           times 8 dw  1
+PD_ONEHALF       times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz 32
+
+; --------------------------------------------------------------------------
+    SECTION SEG_TEXT
+    BITS 32
+
+; jdmrgext-sse2.asm is included seven times.  The first inclusion uses
+; whatever RGB_* values and function names are in effect at this point
+; (NOTE(review): presumably defaults supplied via jsimdext.inc -- confirm).
+; Each later inclusion first rebinds RGB_RED/RGB_GREEN/RGB_BLUE/
+; RGB_PIXELSIZE to one EXT_* pixel layout and redefines both
+; jsimd_h2v1_merged_upsample_sse2 and jsimd_h2v2_merged_upsample_sse2 as
+; the layout-specific symbol names.
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..e35f7282bc
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm
@@ -0,0 +1,575 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b) + 8 ; JDIMENSION output_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2) + +EXTN(jsimd_h2v1_merged_upsample_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16, 7 +.columnloop: + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) + vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + vpcmpeqw ymm3, 
ymm3, ymm3 + vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV) + vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV) + vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL + vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH + vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL + vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH + + vpaddw ymm5, ymm6, ymm3 + vpaddw ymm2, ymm4, ymm3 + vpaddw ymm1, ymm7, ymm3 + vpaddw ymm3, ymm0, ymm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH + vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL + vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH + vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL + + vpmulhw ymm6, ymm6, [GOTOFF(eax,PW_MF0228)] ; ymm6=(2*CbH * -FIX(0.22800)) + vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbL * -FIX(0.22800)) + vpmulhw ymm7, ymm7, [GOTOFF(eax,PW_F0402)] ; ymm7=(2*CrH * FIX(0.40200)) + vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrL * FIX(0.40200)) + + vpaddw ymm6, ymm6, [GOTOFF(eax,PW_ONE)] + vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)] + vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800)) + vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800)) + vpaddw ymm7, ymm7, [GOTOFF(eax,PW_ONE)] + vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)] + vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200)) + vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200)) + + vpaddw ymm6, ymm6, ymm5 + vpaddw ymm4, ymm4, ymm2 + vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H + vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L + vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H + vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L + + vmovdqa YMMWORD [wk(0)], ymm6 ; 
wk(0)=(B-Y)H + vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H + + vpunpckhwd ymm6, ymm5, ymm1 + vpunpcklwd ymm5, ymm5, ymm1 + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)] + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)] + vpunpckhwd ymm7, ymm2, ymm3 + vpunpcklwd ymm2, ymm2, ymm3 + vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)] + vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)] + + vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)] + vpaddd ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)] + vpsrad ymm5, ymm5, SCALEBITS + vpsrad ymm6, ymm6, SCALEBITS + vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)] + vpaddd ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)] + vpsrad ymm2, ymm2, SCALEBITS + vpsrad ymm7, ymm7, SCALEBITS + + vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr + jmp short .Yloop_1st + alignx 16, 7 + +.Yloop_2nd: + vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H + vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H + vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H + alignx 16, 7 + +.Yloop_1st: + vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm6, ymm6, ymm6 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE + vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO + + vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H) + vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H) + vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H) + + vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU) + vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV) + vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********) + vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********) + + vpaddw ymm2, ymm2, ymm6 ; 
ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU) + vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV) + vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********) + vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********) + + vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU) + vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV) + vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********) + vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F + ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V) + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F + ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V) + + vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G + ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --) + vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F + ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07 + ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N) + + vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 
2E 0F 2G 0H + ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --) + + vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H + ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --) + vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G + ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --) + vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18 + ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O) + + vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H + ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --) + vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29 + ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P) + + vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03 + ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J) + vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14 + ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K) + vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29 + ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P) + vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07 + ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N) + + vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B + ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R) + vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C + ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S) + vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H + ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --) + vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V) + + vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 
1K 2K 0L) + vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test edi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_YMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_YMMWORD ; inptr1 + add edx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st64: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_YMMWORD + jb short .column_st32 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + add edi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmF + sub ecx, byte 2*SIZEOF_YMMWORD + jmp short .column_st31 +.column_st32: + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st31 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + add edi, byte 
SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub ecx, byte SIZEOF_YMMWORD + jmp short .column_st31 +.column_st31: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + vmovq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + vpsrldq xmmA, xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + vmovd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + vpsrldq xmmA, xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + vmovd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. 
+ test ecx, ecx + jz short .endcolumn + mov byte [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%else + vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%endif + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **) + ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U) + vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V) + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V) + + vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E + ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36 + ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M) + vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F + ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V) + vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 
25 35 07 17 27 37 + ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N) + + vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + + vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test edi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC + vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC + vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH +.out0: + add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_YMMWORD ; inptr0 + dec al + jnz near .Yloop_2nd + + add ebx, byte 
SIZEOF_YMMWORD ; inptr1 + add edx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st64: + cmp ecx, byte SIZEOF_YMMWORD/2 + jb short .column_st32 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + add edi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmC + vmovdqa ymmD, ymmH + sub ecx, byte SIZEOF_YMMWORD/2 +.column_st32: + cmp ecx, byte SIZEOF_YMMWORD/4 + jb short .column_st16 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + add edi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub ecx, byte SIZEOF_YMMWORD/4 +.column_st16: + cmp ecx, byte SIZEOF_YMMWORD/8 + jb short .column_st15 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub ecx, byte SIZEOF_YMMWORD/8 +.column_st15: + ; Store two pixels (8 bytes) of ymmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_YMMWORD/16 + jb short .column_st7 + vmovq MMWORD [edi], xmmA + add edi, byte SIZEOF_YMMWORD/16*4 + sub ecx, byte SIZEOF_YMMWORD/16 + vpsrldq xmmA, SIZEOF_YMMWORD/16*4 +.column_st7: + ; Store one pixel (4 bytes) of ymmA to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + vmovd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. 
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+; Implemented as two calls to jsimd_h2v1_merged_upsample_avx2 (one per output row).
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, JDIMENSION [output_width(ebp)]  ; JDIMENSION, as declared above
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]  ; input_buf[0]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]  ; input_buf[1]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]  ; input_buf[2]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][in_row_group_ctr]; callee indexes by ecx again
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp                ; ebx -> temporary 3-entry array passed as input_buf
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)  ; first output row
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi  ; entry 0 of the temporary array
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi  ; the pushed output_buf argument slot
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)  ; second output row, same 4 argument slots
+
+    add         esp, byte 7*SIZEOF_DWORD  ; drop 4 arguments + 3-entry temporary array
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm new file mode 100644 index 0000000000..eb3e36b475 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm @@ -0,0 +1,460 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+; Each .columnloop pass loads 8 Cb/Cr samples and emits up to 16 pixels (two 8-sample Y passes).
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4                 ; room for the saved stack pointer
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; reloaded by "pop esp" at .return
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]                ; reserve wk[0..WK_NUM-1] below ebp
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return            ; nothing to do when output_width == 0
+
+    push        ecx                     ; save col; ecx is reused as the row index
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    movq        mm6, MMWORD [ebx]       ; mm6=Cb(01234567)
+    movq        mm7, MMWORD [edx]       ; mm7=Cr(01234567)
+
+    pxor        mm1, mm1                ; mm1=(all 0's)
+    pcmpeqw     mm3, mm3                ; mm3=(all 1's)
+    psllw       mm3, 7                  ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+    movq        mm4, mm6
+    punpckhbw   mm6, mm1                ; mm6=Cb(4567)=CbH
+    punpcklbw   mm4, mm1                ; mm4=Cb(0123)=CbL
+    movq        mm0, mm7
+    punpckhbw   mm7, mm1                ; mm7=Cr(4567)=CrH
+    punpcklbw   mm0, mm1                ; mm0=Cr(0123)=CrL
+
+    paddw       mm6, mm3                ; 0xFF80 is -128 as a signed word: CbH -= 128
+    paddw       mm4, mm3                ; CbL -= 128
+    paddw       mm7, mm3                ; CrH -= 128
+    paddw       mm0, mm3                ; CrL -= 128
+
+    ; (Original)
+    ; R = Y + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movq        mm5, mm6                ; mm5=CbH
+    movq        mm2, mm4                ; mm2=CbL
+    paddw       mm6, mm6                ; mm6=2*CbH
+    paddw       mm4, mm4                ; mm4=2*CbL
+    movq        mm1, mm7                ; mm1=CrH
+    movq        mm3, mm0                ; mm3=CrL
+    paddw       mm7, mm7                ; mm7=2*CrH
+    paddw       mm0, mm0                ; mm0=2*CrL
+
+    pmulhw      mm6, [GOTOFF(eax,PW_MF0228)]  ; mm6=(2*CbH * -FIX(0.22800))
+    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbL * -FIX(0.22800))
+    pmulhw      mm7, [GOTOFF(eax,PW_F0402)]   ; mm7=(2*CrH * FIX(0.40200))
+    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrL * FIX(0.40200))
+
+    paddw       mm6, [GOTOFF(eax,PW_ONE)]  ; +1 before the shift by 1 below
+    paddw       mm4, [GOTOFF(eax,PW_ONE)]
+    psraw       mm6, 1                  ; mm6=(CbH * -FIX(0.22800))
+    psraw       mm4, 1                  ; mm4=(CbL * -FIX(0.22800))
+    paddw       mm7, [GOTOFF(eax,PW_ONE)]
+    paddw       mm0, [GOTOFF(eax,PW_ONE)]
+    psraw       mm7, 1                  ; mm7=(CrH * FIX(0.40200))
+    psraw       mm0, 1                  ; mm0=(CrL * FIX(0.40200))
+
+    paddw       mm6, mm5
+    paddw       mm4, mm2
+    paddw       mm6, mm5                ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       mm4, mm2                ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       mm7, mm1                ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       mm0, mm3                ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movq        MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
+    movq        MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
+
+    movq        mm6, mm5
+    movq        mm7, mm2
+    punpcklwd   mm5, mm1                ; interleave CbH with CrH words (low half)
+    punpckhwd   mm6, mm1                ; interleave CbH with CrH words (high half)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   mm2, mm3                ; interleave CbL with CrL words (low half)
+    punpckhwd   mm7, mm3                ; interleave CbL with CrL words (high half)
+    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm6, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm5, SCALEBITS
+    psrad       mm6, SCALEBITS
+    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm7, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm2, SCALEBITS
+    psrad       mm7, SCALEBITS
+
+    packssdw    mm5, mm6                ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    mm2, mm7                ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       mm5, mm1                ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       mm2, mm3                ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movq        MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st        ; first pass uses the L halves left in mm0/mm2/mm4
+    alignx      16, 7
+
+.Yloop_2nd:
+    movq        mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
+    movq        mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
+    movq        mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    movq        mm7, MMWORD [esi]       ; mm7=Y(01234567)
+
+    pcmpeqw     mm6, mm6
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        mm6, mm7                ; mm6=Y(0246)=YE
+    psrlw       mm7, BYTE_BIT           ; mm7=Y(1357)=YO
+
+    movq        mm1, mm0                ; mm1=mm0=(R-Y)(L/H)
+    movq        mm3, mm2                ; mm3=mm2=(G-Y)(L/H)
+    movq        mm5, mm4                ; mm5=mm4=(B-Y)(L/H)
+
+    paddw       mm0, mm6                ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+    paddw       mm1, mm7                ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
+    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+    paddw       mm2, mm6                ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+    paddw       mm3, mm7                ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
+    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+    paddw       mm4, mm6                ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+    paddw       mm5, mm7                ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
+    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
+    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
+
+    movq        mmG, mmA
+    movq        mmH, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
+    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
+
+    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
+    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
+
+    movq        mmC, mmD
+    movq        mmB, mmD
+    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
+    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
+
+    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
+
+    movq        mmF, mmE
+    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
+    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
+
+    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
+    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
+    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16      ; fewer than 8 columns left: partial store
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          near .endcolumn
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_MMWORD
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        mmA, mmC                ; remaining bytes now come from mmC
+    sub         ecx, byte 2*SIZEOF_MMWORD
+    add         edi, byte 2*SIZEOF_MMWORD
+    jmp         short .column_st4
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmE                ; remaining bytes now come from mmE
+    sub         ecx, byte SIZEOF_MMWORD
+    add         edi, byte SIZEOF_MMWORD
+.column_st4:
+    movd        eax, mmA                ; eax = low 4 bytes still to be stored
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st2
+    mov         dword [edi+0*SIZEOF_DWORD], eax
+    psrlq       mmA, DWORD_BIT
+    movd        eax, mmA                ; eax = next 4 bytes
+    sub         ecx, byte SIZEOF_DWORD
+    add         edi, byte SIZEOF_DWORD
+.column_st2:
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi+0*SIZEOF_WORD], ax
+    shr         eax, WORD_BIT
+    sub         ecx, byte SIZEOF_WORD
+    add         edi, byte SIZEOF_WORD
+.column_st1:
+    cmp         ecx, byte SIZEOF_BYTE
+    jb          short .endcolumn
+    mov         byte [edi+0*SIZEOF_BYTE], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
+    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
+    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
+
+    movq        mmC, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
+    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
+    movq        mmG, mmB
+    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
+    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
+
+    movq        mmD, mmA
+    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
+    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
+    movq        mmH, mmC
+    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
+    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16      ; fewer than 8 columns left: partial store
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .endcolumn
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    cmp         ecx, byte SIZEOF_MMWORD/2  ; ecx still counts pixels here: at least 4 left?
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        mmA, mmC                ; remaining pixels now come from mmC/mmH
+    movq        mmD, mmH
+    sub         ecx, byte SIZEOF_MMWORD/2
+    add         edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD/4  ; at least 2 pixels left?
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmD                ; remaining pixel now comes from mmD
+    sub         ecx, byte SIZEOF_MMWORD/4
+    add         edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+    cmp         ecx, byte SIZEOF_MMWORD/8  ; one pixel left?
+    jb          short .endcolumn
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+; Implemented as two calls to jsimd_h2v1_merged_upsample_mmx (one per output row).
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, JDIMENSION [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]  ; input_buf[0]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]  ; input_buf[1]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]  ; input_buf[2]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][in_row_group_ctr]; callee indexes by ecx again
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp                ; ebx -> temporary 3-entry array passed as input_buf
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)  ; first output row
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi  ; entry 0 of the temporary array
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi  ; the pushed output_buf argument slot
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)  ; second output row, same 4 argument slots
+
+    add         esp, byte 7*SIZEOF_DWORD  ; drop 4 arguments + 3-entry temporary array
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm new file mode 100644 index 0000000000..c113dc4d27 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm @@ -0,0 +1,517 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (SSE2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2012, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 
+; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b) + 8 ; JDIMENSION output_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2) + +EXTN(jsimd_h2v1_merged_upsample_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16, 7 +.columnloop: + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1, xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3, xmm3 + psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 
0xFF80 0xFF80 ..} + + movdqa xmm4, xmm6 + punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0, xmm7 + punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6, xmm3 + paddw xmm4, xmm3 + paddw xmm7, xmm3 + paddw xmm0, xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5, xmm6 ; xmm5=CbH + movdqa xmm2, xmm4 ; xmm2=CbL + paddw xmm6, xmm6 ; xmm6=2*CbH + paddw xmm4, xmm4 ; xmm4=2*CbL + movdqa xmm1, xmm7 ; xmm1=CrH + movdqa xmm3, xmm0 ; xmm3=CrL + paddw xmm7, xmm7 ; xmm7=2*CrH + paddw xmm0, xmm0 ; xmm0=2*CrL + + pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6, [GOTOFF(eax,PW_ONE)] + paddw xmm4, [GOTOFF(eax,PW_ONE)] + psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7, [GOTOFF(eax,PW_ONE)] + paddw xmm0, [GOTOFF(eax,PW_ONE)] + psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6, xmm5 + paddw xmm4, xmm2 + paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6, xmm5 + movdqa xmm7, xmm2 + punpcklwd xmm5, xmm1 + punpckhwd xmm6, xmm1 + pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm2, xmm3 + punpckhwd xmm7, 
xmm3 + pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm5, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm6, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm5, SCALEBITS + psrad xmm6, SCALEBITS + paddd xmm2, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm7, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm2, SCALEBITS + psrad xmm7, SCALEBITS + + packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr + jmp short .Yloop_1st + alignx 16, 7 + +.Yloop_2nd: + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + alignx 16, 7 + +.Yloop_1st: + movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) + + pcmpeqw xmm6, xmm6 + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO + + movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H) + + paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 
1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 
13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st32: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. 
+ cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + mov byte [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 
07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st32: + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub ecx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. 
+ test ecx, ecx + jz short .endcolumn + movd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b) + 8 ; JDIMENSION output_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2) + +EXTN(jsimd_h2v2_merged_upsample_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, POINTER [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx, esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER [ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esp, byte 
7*SIZEOF_DWORD + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm new file mode 100644 index 0000000000..a800c35e08 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm @@ -0,0 +1,760 @@ +; +; jdsample.asm - upsampling (AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_avx2) + +EXTN(jconst_fancy_upsample_avx2): + +PW_ONE times 16 dw 1 +PW_TWO times 16 dw 2 +PW_THREE times 16 dw 3 +PW_SEVEN times 16 dw 7 +PW_EIGHT times 16 dw 8 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. 
+; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2) + +EXTN(jsimd_h2v1_fancy_upsample_avx2): + push ebp + mov ebp, esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_YMMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's) + vpcmpeqb xmm7, xmm7, xmm7 + vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff + vpand ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD] + + add eax, byte SIZEOF_YMMWORD-1 + and eax, byte -SIZEOF_YMMWORD + cmp eax, byte SIZEOF_YMMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + vpcmpeqb xmm6, xmm6, xmm6 + vpslldq xmm6, xmm6, (SIZEOF_XMMWORD-1) + vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... 
---- ---- ff) MSB is ff + vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD] + jmp short .upsample + alignx 16, 7 + +.columnloop: + vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD] + vperm2i128 ymm6, ymm0, ymm6, 0x20 + vpslldq ymm6, ymm6, 15 + +.upsample: + vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31) + + vperm2i128 ymm2, ymm0, ymm1, 0x20 + vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30) + vperm2i128 ymm4, ymm0, ymm1, 0x03 + vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --) + + vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30) + vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32) + + vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --) + + vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30) + vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22) + vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24) + vpunpcklbw ymm0, ymm3, ymm0 ; ymm0=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32) + vperm2i128 ymm3, ymm0, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vperm2i128 ymm6, ymm0, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's) + + vpmullw ymm1, ymm1, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] + vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_ONE)] + vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_ONE)] + vpaddw ymm3, ymm3, [GOTOFF(ebx,PW_TWO)] 
+ vpaddw ymm6, ymm6, [GOTOFF(ebx,PW_TWO)] + + vpaddw ymm2, ymm2, ymm1 + vpaddw ymm5, ymm5, ymm4 + vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm3, ymm3, ymm1 + vpaddw ymm6, ymm6, ymm4 + vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm3, ymm3, BYTE_BIT + vpsllw ymm6, ymm6, BYTE_BIT + vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31) + vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5 + + sub eax, byte SIZEOF_YMMWORD + add esi, byte 1*SIZEOF_YMMWORD ; inptr + add edi, byte 2*SIZEOF_YMMWORD ; outptr + cmp eax, byte SIZEOF_YMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. 
+; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2) + +EXTN(jsimd_h2v2_fancy_upsample_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx, eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_YMMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE 
[ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + vmovdqu ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0] + vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0] + vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's) + + vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm3, ymm2, ymm3 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] + + vpcmpeqb xmm7, xmm7, xmm7 + vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... 
---- ----) LSB is ffff + + vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save + vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6 + + vpand ymm1, ymm1, ymm7 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vpand ymm2, ymm2, ymm7 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + + vmovdqa YMMWORD [wk(0)], ymm1 + vmovdqa YMMWORD [wk(1)], ymm2 + + poppic ebx + + add eax, byte SIZEOF_YMMWORD-1 + and eax, byte -SIZEOF_YMMWORD + cmp eax, byte SIZEOF_YMMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + vpcmpeqb xmm1, xmm1, xmm1 + vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2) + vperm2i128 ymm1, ymm1, ymm1, 1 ; (---- ---- ... 
---- ---- ffff) MSB is ffff + + vpand ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD] + vpand ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD] + + vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) + vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) + + jmp near .upsample + alignx 16, 7 + +.columnloop: + ; -- process the next column block + + vmovdqu ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1] + vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1] + vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's) + + vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm7, ymm2, ymm3 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] + + vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
15) + vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vmovdqu YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save + vmovdqu YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6 + + vperm2i128 ymm1, ymm3, ymm1, 0x20 + vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0) + vperm2i128 ymm2, ymm3, ymm2, 0x20 + vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0) + + vmovdqa YMMWORD [wk(2)], ymm1 + vmovdqa YMMWORD [wk(3)], ymm2 + +.upsample: + ; -- process the upper row + + vmovdqu ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vmovdqu ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + + vperm2i128 ymm0, ymm1, ymm7, 0x03 + vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --) + vperm2i128 ymm4, ymm1, ymm3, 0x20 + vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16) + + vperm2i128 ymm5, ymm1, ymm7, 0x03 + vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm6, ymm1, ymm3, 0x20 + vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vperm2i128 ymm2, ymm1, ymm3, 0x03 + vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --) + vperm2i128 ymm4, ymm1, ymm3, 0x03 + vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm1, 
ymm1, ymm7, 0x20 + vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + + vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vmovdqa YMMWORD [wk(0)], ymm4 + + vpmullw ymm7, ymm7, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm3, ymm3, [GOTOFF(ebx,PW_THREE)] + vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)] + vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)] + vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)] + vpaddw ymm2, [GOTOFF(ebx,PW_SEVEN)] + + vpaddw ymm1, ymm1, ymm7 + vpaddw ymm5, ymm5, ymm3 + vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm0, ymm0, BYTE_BIT + vpsllw ymm2, ymm2, BYTE_BIT + vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31) + vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 
61 62 63) + + vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 + + ; -- process the lower row + + vmovdqu ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vmovdqu ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + + vperm2i128 ymm7, ymm1, ymm6, 0x03 + vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --) + vperm2i128 ymm3, ymm1, ymm4, 0x20 + vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16) + + vperm2i128 ymm0, ymm1, ymm6, 0x03 + vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm2, ymm1, ymm4, 0x20 + vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vperm2i128 ymm5, ymm1, ymm4, 0x03 + vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --) + vperm2i128 ymm3, ymm1, ymm4, 0x03 + vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm1, ymm1, ymm6, 0x20 + vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + + vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vmovdqa YMMWORD [wk(1)], ymm3 + + vpmullw ymm6, ymm6, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] + vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)] + vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)] + vpaddw ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)] + vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)] + + vpaddw ymm1, ymm1, ymm6 + vpaddw ymm0, ymm0, ymm4 + vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 
28 30) + vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm7, ymm7, ymm6 + vpaddw ymm5, ymm5, ymm4 + vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm7, ymm7, BYTE_BIT + vpsllw ymm5, ymm5, BYTE_BIT + vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31) + vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0 + + poppic ebx + + sub eax, byte SIZEOF_YMMWORD + add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_YMMWORD ; inptr0 + add esi, byte 1*SIZEOF_YMMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_YMMWORD ; outptr0 + add edi, byte 2*SIZEOF_YMMWORD ; outptr1 + cmp eax, byte SIZEOF_YMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. 
+; +; GLOBAL(void) +; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2) + +EXTN(jsimd_h2v1_upsample_avx2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (SIZEOF_YMMWORD-1) + and edx, -SIZEOF_YMMWORD ; edx = output_width rounded up to a multiple of SIZEOF_YMMWORD + jz short .return ; rounded width is 0: nothing to do + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz short .return ; rowctr is 0: nothing to do + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + cmp eax, byte SIZEOF_YMMWORD + ja near .above_16 ; more than SIZEOF_YMMWORD output columns left: use the 256-bit path + + vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD] ; last block of the row: load one XMMWORD of input + vpunpckhbw xmm1, xmm0, xmm0 ; xmm1 = upper 8 input bytes, each duplicated + vpunpcklbw xmm0, xmm0, xmm0 ; xmm0 = lower 8 input bytes, each duplicated + + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + jmp short .nextrow + +.above_16: + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + + vpermq ymm0, ymm0, 0xd8 ; reorder quadwords to (0 2 1 3) so the per-lane unpacks below emit bytes in order + vpunpckhbw ymm1, ymm0, ymm0 ; ymm1 = input bytes 16-31, each duplicated + vpunpcklbw ymm0, ymm0, ymm0 ; ymm0 = input bytes 0-15, each duplicated + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1 + + sub eax, byte 2*SIZEOF_YMMWORD ; two YMMWORDs of output were just written + jz short .nextrow + + add esi, byte SIZEOF_YMMWORD ; inptr + add edi, byte 2*SIZEOF_YMMWORD ; outptr + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg 
short .rowloop + +.return: + vzeroupper ; zero the upper halves of the YMM registers before returning + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2) + +EXTN(jsimd_h2v2_upsample_avx2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (SIZEOF_YMMWORD-1) + and edx, -SIZEOF_YMMWORD ; edx = output_width rounded up to a multiple of SIZEOF_YMMWORD + jz near .return ; rounded width is 0: nothing to do + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return ; rowctr is 0: nothing to do + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + cmp eax, byte SIZEOF_YMMWORD + ja short .above_16 ; more than SIZEOF_YMMWORD output columns left: use the 256-bit path + + vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; last block of the row: load one XMMWORD of input + vpunpckhbw xmm1, xmm0, xmm0 ; xmm1 = upper 8 input bytes, each duplicated + vpunpcklbw xmm0, xmm0, xmm0 ; xmm0 = lower 8 input bytes, each duplicated + + vmovdqu XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 ; the same expanded data goes to both output rows + vmovdqu XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + jmp near .nextrow + +.above_16: + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + + 
vpermq ymm0, ymm0, 0xd8 ; reorder quadwords to (0 2 1 3) so the per-lane unpacks below emit bytes in order + vpunpckhbw ymm1, ymm0, ymm0 ; ymm1 = input bytes 16-31, each duplicated + vpunpcklbw ymm0, ymm0, ymm0 ; ymm0 = input bytes 0-15, each duplicated + + vmovdqu YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0 ; the same expanded data goes to both output rows + vmovdqu YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1 + + sub eax, byte 2*SIZEOF_YMMWORD ; two YMMWORDs of output were just written per row + jz short .nextrow + + add esi, byte SIZEOF_YMMWORD ; inptr + add ebx, 2*SIZEOF_YMMWORD ; outptr0 + add edi, 2*SIZEOF_YMMWORD ; outptr1 + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + +.return: + vzeroupper ; zero the upper halves of the YMM registers before returning + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm new file mode 100644 index 0000000000..12c49f0eab --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm @@ -0,0 +1,731 @@ +; +; jdsample.asm - upsampling (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_mmx) + +EXTN(jconst_fancy_upsample_mmx): + +PW_ONE times 4 dw 1 ; h2v1: added to the even outputs before the >>2 +PW_TWO times 4 dw 2 ; h2v1: added to the odd outputs before the >>2 +PW_THREE times 4 dw 3 ; x3 multiplier used by both fancy filters +PW_SEVEN times 4 dw 7 ; h2v2: added to the odd outputs before the >>4 +PW_EIGHT times 4 dw 8 ; h2v2: added to the even outputs before the >>4 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx) + +EXTN(jsimd_h2v1_fancy_upsample_mmx): + push ebp + mov ebp, esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov esi, 
JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_MMWORD-1 + jz short .skip ; width is already a multiple of SIZEOF_MMWORD + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor mm0, mm0 ; mm0=(all 0's) + pcmpeqb mm7, mm7 + psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT + pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] ; mm7=( 0 - - - - - - -), OR'ed in below as the (-1) neighbor of the first block + + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD ; colctr rounded up to a multiple of SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + pcmpeqb mm6, mm6 + psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT + pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] ; mm6=( - - - - - - - 7), OR'ed in below as the (8) neighbor of the last block + jmp short .upsample + alignx 16, 7 + +.columnloop: + movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] + psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=( - - - - - - - 8), first sample of the next block + +.upsample: + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2, mm1 + movq mm3, mm1 ; mm1=( 0 1 2 3 4 5 6 7) + psllq mm2, BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6) + psrlq mm3, BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -) + + por mm2, mm7 ; mm2=(-1 0 1 2 3 4 5 6) + por mm3, mm6 ; mm3=( 1 2 3 4 5 6 7 8) + + movq mm7, mm1 + psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -) + + movq mm4, mm1 + punpcklbw mm1, mm0 ; mm1=( 0 1 2 3) + punpckhbw mm4, mm0 ; mm4=( 4 5 6 7) + movq mm5, mm2 + punpcklbw mm2, mm0 ; mm2=(-1 0 1 2) + punpckhbw mm5, mm0 ; mm5=( 3 4 5 6) + movq mm6, mm3 + punpcklbw mm3, mm0 ; mm3=( 1 2 3 4) + punpckhbw mm6, mm0 ; mm6=( 5 6 7 8) + + pmullw mm1, [GOTOFF(ebx,PW_THREE)] + pmullw mm4, [GOTOFF(ebx,PW_THREE)] + paddw mm2, [GOTOFF(ebx,PW_ONE)] + paddw mm5, [GOTOFF(ebx,PW_ONE)] + paddw mm3, [GOTOFF(ebx,PW_TWO)] + paddw mm6, [GOTOFF(ebx,PW_TWO)] + + paddw mm2, mm1 + paddw mm5, mm4 + psrlw mm2, 2 ; mm2=OutLE=( 0 2 4 6) + psrlw mm5, 2 ; mm5=OutHE=( 8 10 12 14) + paddw mm3, mm1 + paddw mm6, mm4 + psrlw mm3, 2 ; mm3=OutLO=( 1 3 5 7) + psrlw mm6, 2 ; mm6=OutHO=( 9 11 13 15) + + psllw mm3, BYTE_BIT + psllw mm6, BYTE_BIT + por mm2, mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7) + por mm5, mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15) + + movq MMWORD 
[edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm5 + + sub eax, byte SIZEOF_MMWORD + add esi, byte 1*SIZEOF_MMWORD ; inptr + add edi, byte 2*SIZEOF_MMWORD ; outptr + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop ; more than one block left + test eax, eax + jnz near .columnloop_last ; exactly one block left + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx) + +EXTN(jsimd_h2v2_fancy_upsample_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax ; stored at the aligned frame base; reloaded by "pop esp" in the epilogue + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] ; move esp below the wk[] scratch area + pushpic eax ; make room for the GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx, eax ; edx = original ebp + mov eax, JDIMENSION 
[downsamp_width(edx)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_MMWORD-1 + jz short .skip ; width is already a multiple of SIZEOF_MMWORD + push edx ; edx holds outptr0; dl is used as scratch below + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0] + movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] + movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor mm3, mm3 ; mm3=(all 0's) + movq mm4, mm0 + punpcklbw mm0, mm3 ; mm0=row[ 0][0]( 0 1 2 3) + punpckhbw mm4, mm3 ; mm4=row[ 0][0]( 4 5 6 7) + movq mm5, mm1 + punpcklbw mm1, mm3 ; mm1=row[-1][0]( 0 1 2 3) + punpckhbw mm5, mm3 ; mm5=row[-1][0]( 4 5 6 7) + movq mm6, mm2 + punpcklbw mm2, mm3 ; mm2=row[+1][0]( 0 1 2 3) + punpckhbw mm6, mm3 ; mm6=row[+1][0]( 4 5 6 7) + + pmullw mm0, [GOTOFF(ebx,PW_THREE)] + pmullw mm4, [GOTOFF(ebx,PW_THREE)] + + pcmpeqb mm7, mm7 + psrlq mm7, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm7 = mask that keeps only word 0 + + paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7) + + movq MMWORD 
[edx+0*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 + + pand mm1, mm7 ; mm1=( 0 - - -) + pand mm2, mm7 ; mm2=( 0 - - -) + + movq MMWORD [wk(0)], mm1 ; wk(0) = (-1) neighbor for the upper row + movq MMWORD [wk(1)], mm2 ; wk(1) = (-1) neighbor for the lower row + + poppic ebx + + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD ; colctr rounded up to a multiple of SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pcmpeqb mm1, mm1 + psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT + movq mm2, mm1 + + pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7) + pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7) + + movq MMWORD [wk(2)], mm1 ; wk(2) = (8) neighbor for the upper row + movq MMWORD [wk(3)], mm2 ; wk(3) = (8) neighbor for the lower row + + jmp short .upsample + alignx 16, 7 + +.columnloop: + ; -- process the next column block + + movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1] + movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor mm3, mm3 ; mm3=(all 0's) + movq mm4, mm0 + punpcklbw mm0, mm3 ; mm0=row[ 0][1]( 0 1 2 3) + punpckhbw mm4, mm3 ; mm4=row[ 0][1]( 4 5 6 7) + movq mm5, mm1 + punpcklbw mm1, mm3 ; mm1=row[-1][1]( 0 1 2 3) + punpckhbw mm5, mm3 ; mm5=row[-1][1]( 4 5 6 7) + movq mm6, mm2 + punpcklbw mm2, mm3 ; mm2=row[+1][1]( 0 1 2 3) + punpckhbw mm6, mm3 ; mm6=row[+1][1]( 4 5 6 7) + + pmullw mm0, [GOTOFF(ebx,PW_THREE)] + pmullw mm4, [GOTOFF(ebx,PW_THREE)] + + paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7) + + movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 + + 
psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0) + psllq mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0) + + movq MMWORD [wk(2)], mm1 ; wk(2) = (8) neighbor for the upper row + movq MMWORD [wk(3)], mm2 ; wk(3) = (8) neighbor for the lower row + +.upsample: + ; -- process the upper row + + movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3) + movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7) + + movq mm0, mm7 + movq mm4, mm3 + psrlq mm0, 2*BYTE_BIT ; mm0=( 1 2 3 -) + psllq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4) + movq mm5, mm7 + movq mm6, mm3 + psrlq mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -) + psllq mm6, 2*BYTE_BIT ; mm6=( - 4 5 6) + + por mm0, mm4 ; mm0=( 1 2 3 4) + por mm5, mm6 ; mm5=( 3 4 5 6) + + movq mm1, mm7 + movq mm2, mm3 + psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm2, 2*BYTE_BIT ; mm2=( 5 6 7 -) + movq mm4, mm3 + psrlq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -) + + por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2) + por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8) + + movq MMWORD [wk(0)], mm4 ; word 7 becomes the (-1) neighbor of the next block + + pmullw mm7, [GOTOFF(ebx,PW_THREE)] + pmullw mm3, [GOTOFF(ebx,PW_THREE)] + paddw mm1, [GOTOFF(ebx,PW_EIGHT)] + paddw mm5, [GOTOFF(ebx,PW_EIGHT)] + paddw mm0, [GOTOFF(ebx,PW_SEVEN)] + paddw mm2, [GOTOFF(ebx,PW_SEVEN)] + + paddw mm1, mm7 + paddw mm5, mm3 + psrlw mm1, 4 ; mm1=Out0LE=( 0 2 4 6) + psrlw mm5, 4 ; mm5=Out0HE=( 8 10 12 14) + paddw mm0, mm7 + paddw mm2, mm3 + psrlw mm0, 4 ; mm0=Out0LO=( 1 3 5 7) + psrlw mm2, 4 ; mm2=Out0HO=( 9 11 13 15) + + psllw mm0, BYTE_BIT + psllw mm2, BYTE_BIT + por mm1, mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7) + por mm5, mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; final samples overwrite the saved intermediate data + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 + + ; -- process the lower row + + movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3) + movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7) + + movq mm7, mm6 + movq mm3, mm4 + psrlq mm7, 2*BYTE_BIT ; mm7=( 1 2 3 -) + psllq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4) + movq mm0, mm6 + movq mm2, mm4 + psrlq mm0, 
(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -) + psllq mm2, 2*BYTE_BIT ; mm2=( - 4 5 6) + + por mm7, mm3 ; mm7=( 1 2 3 4) + por mm0, mm2 ; mm0=( 3 4 5 6) + + movq mm1, mm6 + movq mm5, mm4 + psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm5, 2*BYTE_BIT ; mm5=( 5 6 7 -) + movq mm3, mm4 + psrlq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -) + + por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2) + por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8) + + movq MMWORD [wk(1)], mm3 ; word 7 becomes the (-1) neighbor of the next block + + pmullw mm6, [GOTOFF(ebx,PW_THREE)] + pmullw mm4, [GOTOFF(ebx,PW_THREE)] + paddw mm1, [GOTOFF(ebx,PW_EIGHT)] + paddw mm0, [GOTOFF(ebx,PW_EIGHT)] + paddw mm7, [GOTOFF(ebx,PW_SEVEN)] + paddw mm5, [GOTOFF(ebx,PW_SEVEN)] + + paddw mm1, mm6 + paddw mm0, mm4 + psrlw mm1, 4 ; mm1=Out1LE=( 0 2 4 6) + psrlw mm0, 4 ; mm0=Out1HE=( 8 10 12 14) + paddw mm7, mm6 + paddw mm5, mm4 + psrlw mm7, 4 ; mm7=Out1LO=( 1 3 5 7) + psrlw mm5, 4 ; mm5=Out1HO=( 9 11 13 15) + + psllw mm7, BYTE_BIT + psllw mm5, BYTE_BIT + por mm1, mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7) + por mm0, mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 ; final samples overwrite the saved intermediate data + movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 + + poppic ebx + + sub eax, byte SIZEOF_MMWORD + add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_MMWORD ; inptr0 + add esi, byte 1*SIZEOF_MMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_MMWORD ; outptr0 + add edi, byte 2*SIZEOF_MMWORD ; outptr1 + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop ; more than one block left + test eax, eax + jnz near .columnloop_last ; exactly one block left + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast 
processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx) + +EXTN(jsimd_h2v1_upsample_mmx): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD + jz short .return ; rounded width is 0: nothing to do + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz short .return ; rowctr is 0: nothing to do + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + + movq mm1, mm0 + punpcklbw mm0, mm0 ; mm0 = input bytes 0-3, each duplicated + punpckhbw mm1, mm1 ; mm1 = input bytes 4-7, each duplicated + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + + sub eax, byte 2*SIZEOF_MMWORD ; two MMWORDs of output were just written + jz short .nextrow + + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; second half of the unrolled loop body + + movq mm3, mm2 + punpcklbw mm2, mm2 ; mm2 = input bytes 8-11, each duplicated + punpckhbw mm3, mm3 ; mm3 = input bytes 12-15, each duplicated + + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 4*SIZEOF_MMWORD ; outptr + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg 
short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx) + +EXTN(jsimd_h2v2_upsample_mmx): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD + jz near .return ; rounded width is 0: nothing to do + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz short .return ; rowctr is 0: nothing to do + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + + movq mm1, mm0 + punpcklbw mm0, mm0 ; mm0 = input bytes 0-3, each duplicated + punpckhbw mm1, mm1 ; mm1 = input bytes 4-7, each duplicated + + movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 ; the same expanded data goes to both output rows + movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + + sub eax, byte 2*SIZEOF_MMWORD ; two MMWORDs of output were just written per row + jz short .nextrow + + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; second half of the unrolled loop body + + movq mm3, mm2 + punpcklbw mm2, mm2 ; mm2 = input bytes 8-11, each duplicated + punpckhbw mm3, 
mm3 + + movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 ; the same expanded data goes to both output rows + movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3 + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add ebx, byte 4*SIZEOF_MMWORD ; outptr0 + add edi, byte 4*SIZEOF_MMWORD ; outptr1 + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm new file mode 100644 index 0000000000..4e28d2f4b8 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm @@ -0,0 +1,724 @@ +; +; jdsample.asm - upsampling (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_sse2) + +EXTN(jconst_fancy_upsample_sse2): + +PW_ONE times 8 dw 1 ; h2v1: added to the even outputs before the >>2 +PW_TWO times 8 dw 2 ; h2v1: added to the odd outputs before the >>2 +PW_THREE times 8 dw 3 ; x3 multiplier applied to the centre samples +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2) + +EXTN(jsimd_h2v1_fancy_upsample_sse2): + push ebp + mov ebp, esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov 
esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_XMMWORD-1 + jz short .skip ; width is already a multiple of SIZEOF_XMMWORD + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor xmm0, xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm7=( 0 -- -- ... -- -- --), OR'ed in below as the (-1) neighbor of the first block + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD ; colctr rounded up to a multiple of SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + pcmpeqb xmm6, xmm6 + pslldq xmm6, (SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm6=(-- -- -- ... -- -- 15), OR'ed in below as the (16) neighbor of the last block + jmp short .upsample + alignx 16, 7 + +.columnloop: + movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] + pslldq xmm6, (SIZEOF_XMMWORD-1) ; xmm6=(-- -- -- ... -- -- 16), first sample of the next block + +.upsample: + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] ; movdqa: inptr must be 16-byte aligned + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7, xmm1 + psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... 
-- -- --) + + movdqa xmm4, xmm1 + punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm2 + punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6, xmm3 + punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + paddw xmm2, [GOTOFF(ebx,PW_ONE)] + paddw xmm5, [GOTOFF(ebx,PW_ONE)] + paddw xmm3, [GOTOFF(ebx,PW_TWO)] + paddw xmm6, [GOTOFF(ebx,PW_TWO)] + + paddw xmm2, xmm1 + paddw xmm5, xmm4 + psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3, xmm1 + paddw xmm6, xmm4 + psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3, BYTE_BIT + psllw xmm6, BYTE_BIT + por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 ; movdqa: outptr must be 16-byte aligned + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 + + sub eax, byte SIZEOF_XMMWORD + add esi, byte 1*SIZEOF_XMMWORD ; inptr + add edi, byte 2*SIZEOF_XMMWORD ; outptr + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop ; more than one block left + test eax, eax + jnz near .columnloop_last ; exactly one block left + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. 
+; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2) + +EXTN(jsimd_h2v2_fancy_upsample_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx, eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE 
[ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-2) + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + poppic ebx + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pcmpeqb xmm1, 
xmm1 + pslldq xmm1, (SIZEOF_XMMWORD-2) + movdqa xmm2, xmm1 + + pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + + jmp near .upsample + alignx 16, 7 + +.columnloop: + ; -- process the next column block + + movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 + +.upsample: + ; -- process the upper row + + movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] + + movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + 
movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5, xmm7 + movdqa xmm6, xmm3 + psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm7 + movdqa xmm2, xmm3 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4, xmm3 + psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7, [GOTOFF(ebx,PW_THREE)] + pmullw xmm3, [GOTOFF(ebx,PW_THREE)] + paddw xmm1, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm5, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm0, [GOTOFF(ebx,PW_SEVEN)] + paddw xmm2, [GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1, xmm7 + paddw xmm5, xmm3 + psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0, xmm7 + paddw xmm2, xmm3 + psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0, BYTE_BIT + psllw xmm2, BYTE_BIT + por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0, xmm6 + movdqa xmm2, xmm4 + psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3, xmm4 + psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + paddw xmm1, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm0, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm7, [GOTOFF(ebx,PW_SEVEN)] + paddw xmm5, [GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1, xmm6 + paddw xmm0, xmm4 + psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7, xmm6 + paddw xmm5, xmm4 + psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7, BYTE_BIT + psllw xmm5, BYTE_BIT + por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 + + poppic ebx + + sub eax, byte SIZEOF_XMMWORD + add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 + add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_XMMWORD ; outptr0 + add edi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. 
+; +; GLOBAL(void) +; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2) + +EXTN(jsimd_h2v1_upsample_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 + + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop + +.return: + pop edi + pop 
esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2) + +EXTN(jsimd_h2v2_upsample_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + + movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 + + movdqa 
XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 + add edi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm new file mode 100644 index 0000000000..322ab16325 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm @@ -0,0 +1,318 @@ +; +; jfdctflt.asm - floating-point FDCT (3DNow!) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. 
+ +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_float_3dnow) + +EXTN(jconst_fdct_float_3dnow): + +PD_0_382 times 2 dd 0.382683432365089771728460 +PD_0_707 times 2 dd 0.707106781186547524400844 +PD_0_541 times 2 dd 0.541196100146196984399723 +PD_1_306 times 2 dd 1.306562964876376527856643 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_3dnow(FAST_FLOAT *data) +; + +%define data(b) (b) + 8 ; FAST_FLOAT *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_float_3dnow) + +EXTN(jsimd_fdct_float_3dnow): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17) + + movq mm4, mm0 ; transpose coefficients + punpckldq mm0, mm1 ; mm0=(00 10)=data0 + punpckhdq mm4, mm1 ; mm4=(01 11)=data1 + movq mm5, mm2 ; transpose coefficients + punpckldq mm2, mm3 ; mm2=(06 16)=data6 + punpckhdq mm5, mm3 ; mm5=(07 17)=data7 + + movq mm6, mm4 + movq mm7, mm0 + pfsub mm4, mm2 ; mm4=data1-data6=tmp6 + pfsub mm0, mm5 ; mm0=data0-data7=tmp7 + pfadd mm6, mm2 ; mm6=data1+data6=tmp1 + pfadd mm7, mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4, mm1 ; transpose coefficients + punpckldq mm1, mm3 ; mm1=(02 12)=data2 + punpckhdq mm4, mm3 ; mm4=(03 13)=data3 + movq mm0, mm2 ; transpose coefficients + punpckldq mm2, mm5 ; mm2=(04 14)=data4 + punpckhdq mm0, mm5 ; mm0=(05 15)=data5 + + movq mm3, mm4 + movq mm5, mm1 + pfadd mm4, mm2 ; mm4=data3+data4=tmp3 + pfadd mm1, mm0 ; mm1=data2+data5=tmp2 + pfsub mm3, mm2 ; mm3=data3-data4=tmp4 + pfsub mm5, mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2, mm7 + movq mm0, mm6 + pfsub mm7, mm4 ; mm7=tmp13 + pfsub mm6, mm1 ; mm6=tmp12 + pfadd mm2, mm4 ; mm2=tmp10 + pfadd mm0, mm1 ; mm0=tmp11 + + pfadd mm6, mm7 + pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4, mm2 + movq mm1, mm7 + pfsub mm2, mm0 ; mm2=data4 + pfsub mm7, mm6 ; mm7=data6 + pfadd mm4, mm0 ; mm4=data0 + pfadd mm1, mm6 
; mm1=data2 + + movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3, mm5 ; mm3=tmp10 + pfadd mm5, mm0 ; mm5=tmp11 + pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2, mm3 ; mm2=tmp10 + pfsub mm3, mm0 + pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2, mm3 ; mm2=z2 + pfadd mm0, mm3 ; mm0=z4 + + movq mm7, mm6 + pfsub mm6, mm5 ; mm6=z13 + pfadd mm7, mm5 ; mm7=z11 + + movq mm4, mm6 + movq mm1, mm7 + pfsub mm6, mm2 ; mm6=data3 + pfsub mm7, mm0 ; mm7=data7 + pfadd mm4, mm2 ; mm4=data5 + pfadd mm1, mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. 
+ + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71) + + movq mm4, mm0 ; transpose coefficients + punpckldq mm0, mm1 ; mm0=(00 01)=data0 + punpckhdq mm4, mm1 ; mm4=(10 11)=data1 + movq mm5, mm2 ; transpose coefficients + punpckldq mm2, mm3 ; mm2=(60 61)=data6 + punpckhdq mm5, mm3 ; mm5=(70 71)=data7 + + movq mm6, mm4 + movq mm7, mm0 + pfsub mm4, mm2 ; mm4=data1-data6=tmp6 + pfsub mm0, mm5 ; mm0=data0-data7=tmp7 + pfadd mm6, mm2 ; mm6=data1+data6=tmp1 + pfadd mm7, mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4, mm1 ; transpose coefficients + punpckldq mm1, mm3 ; mm1=(20 21)=data2 + punpckhdq mm4, mm3 ; mm4=(30 31)=data3 + movq mm0, mm2 ; transpose coefficients + punpckldq mm2, mm5 ; mm2=(40 41)=data4 + punpckhdq mm0, mm5 ; mm0=(50 51)=data5 + + movq mm3, mm4 + movq mm5, mm1 + pfadd mm4, mm2 ; mm4=data3+data4=tmp3 + pfadd mm1, mm0 ; mm1=data2+data5=tmp2 + pfsub mm3, mm2 ; mm3=data3-data4=tmp4 + pfsub mm5, mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2, mm7 + movq mm0, mm6 + pfsub mm7, mm4 ; mm7=tmp13 + pfsub mm6, mm1 ; mm6=tmp12 + pfadd mm2, mm4 ; mm2=tmp10 + pfadd mm0, mm1 ; mm0=tmp11 + + pfadd mm6, mm7 + pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4, mm2 + movq mm1, mm7 + pfsub mm2, mm0 ; mm2=data4 + pfsub mm7, mm6 ; mm7=data6 + pfadd mm4, mm0 ; mm4=data0 + pfadd mm1, 
mm6 ; mm1=data2 + + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3, mm5 ; mm3=tmp10 + pfadd mm5, mm0 ; mm5=tmp11 + pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2, mm3 ; mm2=tmp10 + pfsub mm3, mm0 + pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2, mm3 ; mm2=z2 + pfadd mm0, mm3 ; mm0=z4 + + movq mm7, mm6 + pfsub mm6, mm5 ; mm6=z13 + pfadd mm7, mm5 ; mm7=z11 + + movq mm4, mm6 + movq mm1, mm7 + pfsub mm6, mm2 ; mm6=data3 + pfsub mm7, mm0 ; mm7=data7 + pfadd mm4, mm2 ; mm4=data5 + pfadd mm1, mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*SIZEOF_FAST_FLOAT + dec ecx + jnz near .columnloop + + femms ; empty MMX/3DNow! state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm new file mode 100644 index 0000000000..86952c6499 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm @@ -0,0 +1,369 @@ +; +; jfdctflt.asm - floating-point FDCT (SSE) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. 
+ +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_float_sse) + +EXTN(jconst_fdct_float_sse): + +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_sse(FAST_FLOAT *data) +; + +%define data(b) (b) + 8 ; FAST_FLOAT *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_float_sse) + +EXTN(jsimd_fdct_float_sse): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.rowloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; 
xmm2=(22 32 23 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; 
xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.columnloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13) + unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7, xmm6 ; 
transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + 
+ addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, byte 4*SIZEOF_FAST_FLOAT + dec ecx + jnz near .columnloop + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm new file mode 100644 index 0000000000..80645a50d7 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm @@ -0,0 +1,395 @@ +; +; jfdctfst.asm - fast integer FDCT (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_fdct_ifast_mmx) + +EXTN(jconst_fdct_ifast_mmx): + +PW_F0707 times 4 dw F_0_707 << CONST_SHIFT +PW_F0382 times 4 dw F_0_382 << CONST_SHIFT +PW_F0541 times 4 dw F_0_541 << CONST_SHIFT +PW_F1306 times 4 dw F_1_306 << CONST_SHIFT + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_ifast_mmx(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx) + +EXTN(jsimd_fdct_ifast_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] + + ; mm0=(20 21 22 23), mm2=(24 25 26 27) + ; mm1=(30 31 32 33), mm3=(34 35 36 37) + + movq mm4, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm1 ; mm0=(20 30 21 31) + punpckhwd mm4, mm1 ; mm4=(22 32 23 33) + movq mm5, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm3 ; mm2=(24 34 25 35) + punpckhwd mm5, mm3 ; mm5=(26 36 27 37) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 01 02 03), mm1=(04 05 06 07) + ; mm7=(10 11 12 13), mm3=(14 15 16 17) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) + + movq mm4, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 10 01 11) + punpckhwd mm4, mm7 ; mm4=(02 12 03 13) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm3 ; mm1=(04 14 05 15) + punpckhwd mm2, mm3 ; mm2=(06 16 07 17) + + movq mm7, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0 + punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1 + movq mm3, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6 + punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7 + + movq mm0, mm7 + movq mm5, mm6 + psubw mm7, mm2 ; mm7=data1-data6=tmp6 + psubw mm6, mm3 ; mm6=data0-data7=tmp7 + paddw mm0, mm2 ; mm0=data1+data6=tmp1 + paddw mm5, mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) + movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7, mm4 
; transpose coefficients(phase 2) + punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2 + punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3 + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4 + punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5 + + movq mm2, mm7 + movq mm3, mm4 + paddw mm7, mm1 ; mm7=data3+data4=tmp3 + paddw mm4, mm6 ; mm4=data2+data5=tmp2 + psubw mm2, mm1 ; mm2=data3-data4=tmp4 + psubw mm3, mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1, mm5 + movq mm6, mm0 + psubw mm5, mm7 ; mm5=tmp13 + psubw mm0, mm4 ; mm0=tmp12 + paddw mm1, mm7 ; mm1=tmp10 + paddw mm6, mm4 ; mm6=tmp11 + + paddw mm0, mm5 + psllw mm0, PRE_MULTIPLY_SCALE_BITS + pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1 + + movq mm7, mm1 + movq mm4, mm5 + psubw mm1, mm6 ; mm1=data4 + psubw mm5, mm0 ; mm5=data6 + paddw mm7, mm6 ; mm7=data0 + paddw mm4, mm0 ; mm4=data2 + + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + + ; -- Odd part + + movq mm6, MMWORD [wk(0)] ; mm6=tmp6 + movq mm0, MMWORD [wk(1)] ; mm0=tmp7 + + paddw mm2, mm3 ; mm2=tmp10 + paddw mm3, mm6 ; mm3=tmp11 + paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7 + + psllw mm2, PRE_MULTIPLY_SCALE_BITS + psllw mm6, PRE_MULTIPLY_SCALE_BITS + + psllw mm3, PRE_MULTIPLY_SCALE_BITS + pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3 + + movq mm1, mm2 ; mm1=tmp10 + psubw mm2, mm6 + pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5 + pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) + pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) + paddw mm1, mm2 ; mm1=z2 + paddw mm6, mm2 ; mm6=z4 + + movq mm5, mm0 + psubw mm0, mm3 ; mm0=z13 + paddw mm5, mm3 ; mm5=z11 + + movq mm7, mm0 + movq mm4, mm5 + psubw mm0, mm1 ; mm0=data3 + psubw mm5, mm6 ; mm5=data7 + paddw mm7, mm1 ; mm7=data5 + paddw mm4, mm6 ; mm4=data1 + + movq MMWORD 
[MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 + + add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; mm0=(02 12 22 32), mm2=(42 52 62 72) + ; mm1=(03 13 23 33), mm3=(43 53 63 73) + + movq mm4, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm1 ; mm0=(02 03 12 13) + punpckhwd mm4, mm1 ; mm4=(22 23 32 33) + movq mm5, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm3 ; mm2=(42 43 52 53) + punpckhwd mm5, mm3 ; mm5=(62 63 72 73) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 10 20 30), mm1=(40 50 60 70) + ; mm7=(01 11 21 31), mm3=(41 51 61 71) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) + + movq mm4, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 01 10 11) + punpckhwd mm4, mm7 ; mm4=(20 21 30 31) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm3 ; mm1=(40 41 50 51) + punpckhwd mm2, mm3 ; mm2=(60 61 70 71) + + movq mm7, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0 + punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1 + movq mm3, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6 + punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7 + + movq mm0, mm7 + movq mm5, mm6 + psubw mm7, mm2 ; mm7=data1-data6=tmp6 + 
psubw mm6, mm3 ; mm6=data0-data7=tmp7 + paddw mm0, mm2 ; mm0=data1+data6=tmp1 + paddw mm5, mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) + movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7, mm4 ; transpose coefficients(phase 2) + punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2 + punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3 + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4 + punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5 + + movq mm2, mm7 + movq mm3, mm4 + paddw mm7, mm1 ; mm7=data3+data4=tmp3 + paddw mm4, mm6 ; mm4=data2+data5=tmp2 + psubw mm2, mm1 ; mm2=data3-data4=tmp4 + psubw mm3, mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1, mm5 + movq mm6, mm0 + psubw mm5, mm7 ; mm5=tmp13 + psubw mm0, mm4 ; mm0=tmp12 + paddw mm1, mm7 ; mm1=tmp10 + paddw mm6, mm4 ; mm6=tmp11 + + paddw mm0, mm5 + psllw mm0, PRE_MULTIPLY_SCALE_BITS + pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1 + + movq mm7, mm1 + movq mm4, mm5 + psubw mm1, mm6 ; mm1=data4 + psubw mm5, mm0 ; mm5=data6 + paddw mm7, mm6 ; mm7=data0 + paddw mm4, mm0 ; mm4=data2 + + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + + ; -- Odd part + + movq mm6, MMWORD [wk(0)] ; mm6=tmp6 + movq mm0, MMWORD [wk(1)] ; mm0=tmp7 + + paddw mm2, mm3 ; mm2=tmp10 + paddw mm3, mm6 ; mm3=tmp11 + paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7 + + psllw mm2, PRE_MULTIPLY_SCALE_BITS + psllw mm6, PRE_MULTIPLY_SCALE_BITS + + psllw mm3, PRE_MULTIPLY_SCALE_BITS + pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3 + + movq mm1, mm2 ; mm1=tmp10 + psubw mm2, mm6 + pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5 + pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) + pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; 
mm6=MULTIPLY(tmp12,FIX_1_30656296) + paddw mm1, mm2 ; mm1=z2 + paddw mm6, mm2 ; mm6=z4 + + movq mm5, mm0 + psubw mm0, mm3 ; mm0=z13 + paddw mm5, mm3 ; mm5=z11 + + movq mm7, mm0 + movq mm4, mm5 + psubw mm0, mm1 ; mm0=data3 + psubw mm5, mm6 ; mm5=data7 + paddw mm7, mm1 ; mm7=data5 + paddw mm4, mm6 ; mm4=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 + + add edx, byte 4*SIZEOF_DCTELEM + dec ecx + jnz near .columnloop + + emms ; empty MMX state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm new file mode 100644 index 0000000000..446fa7a68f --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm @@ -0,0 +1,403 @@ +; +; jfdctfst.asm - fast integer FDCT (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). 
The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_fdct_ifast_sse2) + +EXTN(jconst_fdct_ifast_sse2): + +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. 
+; +; GLOBAL(void) +; jsimd_fdct_ifast_sse2(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2) + +EXTN(jsimd_fdct_ifast_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + 
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + 
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + psubw xmm3, xmm1 ; xmm3=tmp13 + psubw xmm6, xmm7 ; xmm6=tmp12 + paddw xmm4, xmm1 ; xmm4=tmp10 + paddw xmm0, xmm7 ; xmm0=tmp11 + + paddw xmm6, xmm3 + psllw xmm6, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1 + + movdqa xmm1, xmm4 + movdqa xmm7, xmm3 + psubw xmm4, xmm0 ; xmm4=data4 + psubw xmm3, xmm6 ; xmm3=data6 + paddw xmm1, xmm0 ; xmm1=data0 + paddw xmm7, xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm5, xmm0 ; xmm5=tmp11 + paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3 + + movdqa xmm4, xmm2 ; xmm4=tmp10 + psubw xmm2, xmm0 + pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5 + pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm2 ; xmm4=z2 + paddw xmm0, xmm2 ; xmm0=z4 + + movdqa xmm3, xmm6 + psubw xmm6, xmm5 ; xmm6=z13 + paddw xmm3, xmm5 ; xmm3=z11 + + movdqa xmm2, xmm6 + movdqa xmm5, xmm3 + psubw xmm6, xmm4 ; xmm6=data3 + psubw xmm3, xmm0 ; 
xmm3=data7 + paddw xmm2, xmm4 ; xmm2=data5 + paddw xmm5, xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. + +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2, xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm6 ; 
xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5, xmm6 + movdqa xmm3, xmm1 + psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7, xmm6 + movdqa xmm0, xmm2 + paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm1, xmm5 + psubw xmm3, xmm6 ; xmm3=tmp13 + psubw xmm5, xmm2 ; xmm5=tmp12 + paddw xmm4, xmm6 ; xmm4=tmp10 + paddw xmm1, xmm2 ; xmm1=tmp11 + + paddw xmm5, xmm3 + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1 + + movdqa xmm6, xmm4 + movdqa xmm2, xmm3 + psubw xmm4, xmm1 ; xmm4=data4 + 
psubw xmm3, xmm5 ; xmm3=data6 + paddw xmm6, xmm1 ; xmm6=data0 + paddw xmm2, xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7, xmm0 ; xmm7=tmp10 + paddw xmm0, xmm1 ; xmm0=tmp11 + paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7, PRE_MULTIPLY_SCALE_BITS + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3 + + movdqa xmm4, xmm7 ; xmm4=tmp10 + psubw xmm7, xmm1 + pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5 + pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm7 ; xmm4=z2 + paddw xmm1, xmm7 ; xmm1=z4 + + movdqa xmm3, xmm5 + psubw xmm5, xmm0 ; xmm5=z13 + paddw xmm3, xmm0 ; xmm3=z11 + + movdqa xmm6, xmm5 + movdqa xmm2, xmm3 + psubw xmm5, xmm4 ; xmm5=data3 + psubw xmm3, xmm1 ; xmm3=data7 + paddw xmm6, xmm4 ; xmm6=data5 + paddw xmm2, xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm new file mode 100644 index 0000000000..23cf733135 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm @@ -0,0 +1,331 @@ +; +; jfdctint.asm - accurate integer FDCT (AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit matrix transpose using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers + +%macro dotranspose 8 + ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) + + vpunpcklwd %5, %1, %2 + vpunpckhwd %6, %1, %2 + vpunpcklwd %7, %3, %4 + vpunpckhwd %8, %3, %4 + ; transpose coefficients(phase 1) + ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53) + ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57) + ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73) + ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77) + + vpunpckldq %1, %5, %7 + vpunpckhdq %2, %5, %7 + vpunpckldq %3, %6, %8 + vpunpckhdq %4, %6, %8 + ; transpose coefficients(phase 2) + ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71) + ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73) + ; %3=(04 14 24 34 05 15 25 35 44 54 64 
74 45 55 65 75) + ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77) + + vpermq %1, %1, 0x8D + vpermq %2, %2, 0x8D + vpermq %3, %3, 0xD8 + vpermq %4, %4, 0xD8 + ; transpose coefficients(phase 3) + ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70) + ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) + ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) + ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77) +%endmacro + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers +; %9: Pass (1 or 2) + +%macro dodct 9 + vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7 + vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0 + vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2 + vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5 + + ; -- Even part + + vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1 + vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11 + vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12 + + vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10 + vpsignw %1, %1, [GOTOFF(ebx, PW_1_NEG1)] ; %1=tmp10_neg11 + vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11) +%if %9 == 1 + vpsllw %1, %7, PASS1_BITS ; %1=data0_4 +%else + vpaddw %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)] + vpsraw %1, %7, PASS1_BITS ; %1=data0_4 +%endif + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13 + vpunpcklwd %2, %6, %7 + vpunpckhwd %6, %6, %7 + vpmaddwd %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %2=data2_6L + vpmaddwd %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %6=data2_6H + + vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpaddd %6, %6, [GOTOFF(ebx, PD_DESCALE_P 
%+ %9)] + vpsrad %2, %2, DESCALE_P %+ %9 + vpsrad %6, %6, DESCALE_P %+ %9 + + vpackssdw %3, %2, %6 ; %3=data2_6 + + ; -- Odd part + + vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3 + vpunpcklwd %6, %7, %2 + vpunpckhwd %7, %7, %2 + vpmaddwd %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %6=z3_4L + vpmaddwd %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %7=z3_4H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6 + vpunpcklwd %2, %8, %4 + vpunpckhwd %4, %8, %4 + vpmaddwd %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %2=tmp4_5L + vpmaddwd %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %4=tmp4_5H + + vpaddd %2, %2, %6 ; %2=data7_5L + vpaddd %4, %4, %7 ; %4=data7_5H + + vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpaddd %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpsrad %2, %2, DESCALE_P %+ %9 + vpsrad %4, %4, DESCALE_P %+ %9 + + vpackssdw %4, %2, %4 ; %4=data7_5 + + vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4 + vpunpcklwd %8, %5, %2 +
vpunpckhwd %5, %5, %2 + vpmaddwd %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %8=tmp6_7L + vpmaddwd %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %5=tmp6_7H + + vpaddd %8, %8, %6 ; %8=data3_1L + vpaddd %5, %5, %7 ; %5=data3_1H + + vpaddd %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpaddd %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpsrad %8, %8, DESCALE_P %+ %9 + vpsrad %5, %5, DESCALE_P %+ %9 + + vpackssdw %2, %8, %5 ; %2=data3_1 +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_avx2) + +EXTN(jconst_fdct_islow_avx2): + +PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 + times 4 dw (F_0_541 - F_1_847), F_0_541 +PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 + times 4 dw (F_1_175 - F_0_390), F_1_175 +PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 + times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562 + times 4 dw (F_1_501 - F_0_899), -F_0_899 +PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1) +PW_1_NEG1 times 8 dw 1 + times 8 dw -1 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_avx2(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_avx2) + +EXTN(jsimd_fdct_islow_avx2): + push ebp + mov ebp, esp + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(ebp)] ; (DCTELEM *) + + vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + vperm2i128 ymm0, ymm4, ymm6, 0x20 + vperm2i128 ymm1, ymm4, ymm6, 0x31 + vperm2i128 ymm2, ymm5, ymm7, 0x20 + vperm2i128 ymm3, ymm5, ymm7, 0x31 + ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) + + dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + + dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 + ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5 + + ; ---- Pass 2: process columns. 
+ + vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7 + vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5 + + dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + + dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 + ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5 + + vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1 + vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3 + vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5 + vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7 + + vmovdqu YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3 + vmovdqu YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5 + vmovdqu YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6 + vmovdqu YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7 + + vzeroupper +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm new file mode 100644 index 0000000000..34a43b9e5e --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm @@ -0,0 +1,620 @@ +; +; jfdctint.asm - accurate integer FDCT (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_mmx) + +EXTN(jconst_fdct_islow_mmx): + +PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847) +PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. 
+; +; GLOBAL(void) +; jsimd_fdct_islow_mmx(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_mmx) + +EXTN(jsimd_fdct_islow_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] + + ; mm0=(20 21 22 23), mm2=(24 25 26 27) + ; mm1=(30 31 32 33), mm3=(34 35 36 37) + + movq mm4, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm1 ; mm0=(20 30 21 31) + punpckhwd mm4, mm1 ; mm4=(22 32 23 33) + movq mm5, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm3 ; mm2=(24 34 25 35) + punpckhwd mm5, mm3 ; mm5=(26 36 27 37) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 01 02 03), mm1=(04 05 06 07) + ; mm7=(10 11 12 13), mm3=(14 15 16 17) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) + + movq mm4, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 10 01 11) + punpckhwd mm4, mm7 ; mm4=(02 12 03 13) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm3 ; mm1=(04 14 05 15) + punpckhwd mm2, mm3 ; 
mm2=(06 16 07 17) + + movq mm7, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0 + punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1 + movq mm3, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6 + punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7 + + movq mm0, mm7 + movq mm5, mm6 + psubw mm7, mm2 ; mm7=data1-data6=tmp6 + psubw mm6, mm3 ; mm6=data0-data7=tmp7 + paddw mm0, mm2 ; mm0=data1+data6=tmp1 + paddw mm5, mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) + movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7, mm4 ; transpose coefficients(phase 2) + punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2 + punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3 + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4 + punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5 + + movq mm2, mm7 + movq mm3, mm4 + paddw mm7, mm1 ; mm7=data3+data4=tmp3 + paddw mm4, mm6 ; mm4=data2+data5=tmp2 + psubw mm2, mm1 ; mm2=data3-data4=tmp4 + psubw mm3, mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1, mm5 + movq mm6, mm0 + paddw mm5, mm7 ; mm5=tmp10 + paddw mm0, mm4 ; mm0=tmp11 + psubw mm1, mm7 ; mm1=tmp13 + psubw mm6, mm4 ; mm6=tmp12 + + movq mm7, mm5 + paddw mm5, mm0 ; mm5=tmp10+tmp11 + psubw mm7, mm0 ; mm7=tmp10-tmp11 + + psllw mm5, PASS1_BITS ; mm5=data0 + psllw mm7, PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4, mm1 ; mm1=tmp13 + movq mm0, mm1 + punpcklwd mm4, mm6 ; mm6=tmp12 + punpckhwd 
mm0, mm6 + movq mm1, mm4 + movq mm6, mm0 + pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm0, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4, DESCALE_P1 + psrad mm0, DESCALE_P1 + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm6, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1, DESCALE_P1 + psrad mm6, DESCALE_P1 + + packssdw mm4, mm0 ; mm4=data2 + packssdw mm1, mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ; mm5=tmp6 + movq mm7, MMWORD [wk(1)] ; mm7=tmp7 + + movq mm0, mm2 ; mm2=tmp4 + movq mm6, mm3 ; mm3=tmp5 + paddw mm0, mm5 ; mm0=z3 + paddw mm6, mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4, mm0 + movq mm1, mm0 + punpcklwd mm4, mm6 + punpckhwd mm1, mm6 + movq mm0, mm4 + movq mm6, mm1 + pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * 
-0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4, mm2 + movq mm1, mm2 + punpcklwd mm4, mm7 + punpckhwd mm1, mm7 + movq mm2, mm4 + movq mm7, mm1 + pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2, mm0 ; mm2=data1L + paddd mm7, mm6 ; mm7=data1H + + paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4, DESCALE_P1 + psrad mm1, DESCALE_P1 + paddd mm2, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm2, DESCALE_P1 + psrad mm7, DESCALE_P1 + + packssdw mm4, mm1 ; mm4=data7 + packssdw mm2, mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1, mm3 + movq mm7, mm3 + punpcklwd mm1, mm5 + punpckhwd mm7, mm5 + movq mm3, mm1 + movq mm5, mm7 + pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1, mm0 ; mm1=data5L + paddd mm7, mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1, DESCALE_P1 + psrad mm7, DESCALE_P1 + paddd mm3, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm5, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm3, DESCALE_P1 + psrad mm5, DESCALE_P1 + + packssdw mm1, mm7 ; mm1=data5 + packssdw 
mm3, mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; mm0=(02 12 22 32), mm2=(42 52 62 72) + ; mm1=(03 13 23 33), mm3=(43 53 63 73) + + movq mm4, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm1 ; mm0=(02 03 12 13) + punpckhwd mm4, mm1 ; mm4=(22 23 32 33) + movq mm5, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm3 ; mm2=(42 43 52 53) + punpckhwd mm5, mm3 ; mm5=(62 63 72 73) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 10 20 30), mm1=(40 50 60 70) + ; mm7=(01 11 21 31), mm3=(41 51 61 71) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) + + movq mm4, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 01 10 11) + punpckhwd mm4, mm7 ; mm4=(20 21 30 31) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm3 ; mm1=(40 41 50 51) + punpckhwd mm2, mm3 ; mm2=(60 61 70 71) + + movq mm7, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0 + punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1 + movq mm3, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6 + punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7 + + movq mm0, mm7 + movq mm5, mm6 + psubw mm7, mm2 ; mm7=data1-data6=tmp6 + psubw mm6, mm3 ; mm6=data0-data7=tmp7 + paddw mm0, mm2 ; 
mm0=data1+data6=tmp1 + paddw mm5, mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) + movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7, mm4 ; transpose coefficients(phase 2) + punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2 + punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3 + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4 + punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5 + + movq mm2, mm7 + movq mm3, mm4 + paddw mm7, mm1 ; mm7=data3+data4=tmp3 + paddw mm4, mm6 ; mm4=data2+data5=tmp2 + psubw mm2, mm1 ; mm2=data3-data4=tmp4 + psubw mm3, mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1, mm5 + movq mm6, mm0 + paddw mm5, mm7 ; mm5=tmp10 + paddw mm0, mm4 ; mm0=tmp11 + psubw mm1, mm7 ; mm1=tmp13 + psubw mm6, mm4 ; mm6=tmp12 + + movq mm7, mm5 + paddw mm5, mm0 ; mm5=tmp10+tmp11 + psubw mm7, mm0 ; mm7=tmp10-tmp11 + + paddw mm5, [GOTOFF(ebx,PW_DESCALE_P2X)] + paddw mm7, [GOTOFF(ebx,PW_DESCALE_P2X)] + psraw mm5, PASS1_BITS ; mm5=data0 + psraw mm7, PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4, mm1 ; mm1=tmp13 + movq mm0, mm1 + punpcklwd mm4, mm6 ; mm6=tmp12 + punpckhwd mm0, mm6 + movq mm1, mm4 + movq mm6, mm0 + pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm0, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad 
mm4, DESCALE_P2 + psrad mm0, DESCALE_P2 + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm6, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1, DESCALE_P2 + psrad mm6, DESCALE_P2 + + packssdw mm4, mm0 ; mm4=data2 + packssdw mm1, mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ; mm5=tmp6 + movq mm7, MMWORD [wk(1)] ; mm7=tmp7 + + movq mm0, mm2 ; mm2=tmp4 + movq mm6, mm3 ; mm3=tmp5 + paddw mm0, mm5 ; mm0=z3 + paddw mm6, mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4, mm0 + movq mm1, mm0 + punpcklwd mm4, mm6 + punpckhwd mm1, mm6 + movq mm0, mm4 + movq mm6, mm1 + pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4, mm2 + movq mm1, mm2 + punpcklwd mm4, 
mm7 + punpckhwd mm1, mm7 + movq mm2, mm4 + movq mm7, mm1 + pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2, mm0 ; mm2=data1L + paddd mm7, mm6 ; mm7=data1H + + paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm4, DESCALE_P2 + psrad mm1, DESCALE_P2 + paddd mm2, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm2, DESCALE_P2 + psrad mm7, DESCALE_P2 + + packssdw mm4, mm1 ; mm4=data7 + packssdw mm2, mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1, mm3 + movq mm7, mm3 + punpcklwd mm1, mm5 + punpckhwd mm7, mm5 + movq mm3, mm1 + movq mm5, mm7 + pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1, mm0 ; mm1=data5L + paddd mm7, mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1, DESCALE_P2 + psrad mm7, DESCALE_P2 + paddd mm3, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm5, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm3, DESCALE_P2 + psrad mm5, DESCALE_P2 + + packssdw mm1, mm7 ; mm1=data5 + packssdw mm3, mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*SIZEOF_DCTELEM + dec ecx + jnz near .columnloop + + emms ; empty MMX state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, 
ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm new file mode 100644 index 0000000000..6f8e18cb9d --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm @@ -0,0 +1,633 @@ +; +; jfdctint.asm - accurate integer FDCT (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. 
+ +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_sse2) + +EXTN(jconst_fdct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541 
- F_1_847) +PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_sse2(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 6 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_sse2) + +EXTN(jsimd_fdct_islow_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa 
xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + paddw xmm3, xmm1 ; xmm3=tmp10 + paddw xmm6, 
xmm7 ; xmm6=tmp11 + psubw xmm4, xmm1 ; xmm4=tmp13 + psubw xmm0, xmm7 ; xmm0=tmp12 + + movdqa xmm1, xmm3 + paddw xmm3, xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1, xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3, PASS1_BITS ; xmm3=data0 + psllw xmm1, PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7, xmm4 ; xmm4=tmp13 + movdqa xmm6, xmm4 + punpcklwd xmm7, xmm0 ; xmm0=tmp12 + punpckhwd xmm6, xmm0 + movdqa xmm4, xmm7 + movdqa xmm0, xmm6 + pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L + pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H + pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L + pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H + + paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7, DESCALE_P1 + psrad xmm6, DESCALE_P1 + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm7, xmm6 ; xmm7=data2 + packssdw xmm4, xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6, xmm2 ; xmm2=tmp4 + movdqa xmm0, xmm5 ; xmm5=tmp5 + paddw xmm6, xmm3 ; xmm6=z3 + paddw xmm0, xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa 
xmm7, xmm6 + movdqa xmm4, xmm6 + punpcklwd xmm7, xmm0 + punpckhwd xmm4, xmm0 + movdqa xmm6, xmm7 + movdqa xmm0, xmm4 + pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L + pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H + pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L + pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + punpcklwd xmm7, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm2, xmm7 + movdqa xmm1, xmm4 + pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L + pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H + pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2, xmm6 ; xmm2=data1L + paddd xmm1, xmm0 ; xmm1=data1H + + paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm2, DESCALE_P1 + psrad xmm1, DESCALE_P1 + + packssdw xmm7, xmm4 ; xmm7=data7 + packssdw xmm2, xmm1 ; xmm2=data1 + 
+ movdqa xmm4, xmm5 + movdqa xmm1, xmm5 + punpcklwd xmm4, xmm3 + punpckhwd xmm1, xmm3 + movdqa xmm5, xmm4 + movdqa xmm3, xmm1 + pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H + pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L + pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H + + paddd xmm4, xmm6 ; xmm4=data5L + paddd xmm1, xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4, DESCALE_P1 + psrad xmm1, DESCALE_P1 + paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm5, DESCALE_P1 + psrad xmm3, DESCALE_P1 + + packssdw xmm4, xmm1 ; xmm4=data5 + packssdw xmm5, xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0, xmm4 ; xmm0=(44 
45 54 55 64 65 74 75) + movdqa xmm3, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2, xmm5 + movdqa xmm7, xmm6 + psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; 
xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0, xmm5 + movdqa xmm3, xmm4 + paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1, xmm7 + movdqa xmm6, xmm2 + paddw xmm7, xmm5 ; xmm7=tmp10 + paddw xmm2, xmm4 ; xmm2=tmp11 + psubw xmm1, xmm5 ; xmm1=tmp13 + psubw xmm6, xmm4 ; xmm6=tmp12 + + movdqa xmm5, xmm7 + paddw xmm7, xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5, xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)] + paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)] + psraw xmm7, PASS1_BITS ; xmm7=data0 + psraw xmm5, PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4, xmm1 ; xmm1=tmp13 + movdqa xmm2, xmm1 + punpcklwd xmm4, xmm6 ; xmm6=tmp12 + punpckhwd xmm2, xmm6 + movdqa xmm1, xmm4 + movdqa xmm6, xmm2 + pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L + pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H + pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L + pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H + + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1, DESCALE_P2 + 
psrad xmm6, DESCALE_P2 + + packssdw xmm4, xmm2 ; xmm4=data2 + packssdw xmm1, xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2, xmm0 ; xmm0=tmp4 + movdqa xmm6, xmm3 ; xmm3=tmp5 + paddw xmm2, xmm7 ; xmm2=z3 + paddw xmm6, xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4, xmm2 + movdqa xmm1, xmm2 + punpcklwd xmm4, xmm6 + punpckhwd xmm1, xmm6 + movdqa xmm2, xmm4 + movdqa xmm6, xmm1 + pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H + pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L + pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm4, xmm5 + punpckhwd xmm1, xmm5 + movdqa xmm0, xmm4 + movdqa xmm5, 
xmm1 + pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H + pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L + pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0, xmm2 ; xmm0=data1L + paddd xmm5, xmm6 ; xmm5=data1H + + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm0, DESCALE_P2 + psrad xmm5, DESCALE_P2 + + packssdw xmm4, xmm1 ; xmm4=data7 + packssdw xmm0, xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1, xmm3 + movdqa xmm5, xmm3 + punpcklwd xmm1, xmm7 + punpckhwd xmm5, xmm7 + movdqa xmm3, xmm1 + movdqa xmm7, xmm5 + pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L + pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H + pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H + + paddd xmm1, xmm2 ; xmm1=data5L + paddd xmm5, xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1, DESCALE_P2 + psrad xmm5, DESCALE_P2 + paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm3, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm1, xmm5 ; xmm1=data5 + packssdw xmm3, xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original 
ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm new file mode 100644 index 0000000000..87951910d8 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm @@ -0,0 +1,451 @@ +; +; jidctflt.asm - floating-point IDCT (3DNow! & MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_float_3dnow) + +EXTN(jconst_idct_float_3dnow): + +PD_1_414 times 2 dd 1.414213562373095048801689 +PD_1_847 times 2 dd 1.847759065022573512256366 +PD_1_082 times 2 dd 1.082392200292393968799446 +PD_2_613 times 2 dd 2.613125929752753055713286 +PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. 
+; +; GLOBAL(void) +; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_float_3dnow) + +EXTN(jsimd_idct_float_3dnow): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/2 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + pushpic ebx ; save GOT address + mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + or eax, ebx + poppic ebx ; restore GOT address + jnz short .columnDCT + + ; -- AC terms all zero + + movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0, mm0 + psrad mm0, (DWORD_BIT-WORD_BIT) + pi2fd mm0, mm0 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm1, mm0 + punpckldq mm0, mm0 + punpckhdq mm1, mm1 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0, mm0 + punpcklwd mm1, mm1 + psrad mm0, (DWORD_BIT-WORD_BIT) + psrad mm1, (DWORD_BIT-WORD_BIT) + pi2fd mm0, mm0 + pi2fd mm1, mm1 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD 
[MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + punpcklwd mm2, mm2 + punpcklwd mm3, mm3 + psrad mm2, (DWORD_BIT-WORD_BIT) + psrad mm3, (DWORD_BIT-WORD_BIT) + pi2fd mm2, mm2 + pi2fd mm3, mm3 + + pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4, mm0 + movq mm5, mm1 + pfsub mm0, mm2 ; mm0=tmp11 + pfsub mm1, mm3 + pfadd mm4, mm2 ; mm4=tmp10 + pfadd mm5, mm3 ; mm5=tmp13 + + pfmul mm1, [GOTOFF(ebx,PD_1_414)] + pfsub mm1, mm5 ; mm1=tmp12 + + movq mm6, mm4 + movq mm7, mm0 + pfsub mm4, mm5 ; mm4=tmp3 + pfsub mm0, mm1 ; mm0=tmp2 + pfadd mm6, mm5 ; mm6=tmp0 + pfadd mm7, mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm2, mm2 + punpcklwd mm3, mm3 + psrad mm2, (DWORD_BIT-WORD_BIT) + psrad mm3, (DWORD_BIT-WORD_BIT) + pi2fd mm2, mm2 + pi2fd mm3, mm3 + + pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + punpcklwd mm5, mm5 + punpcklwd mm1, mm1 + psrad mm5, (DWORD_BIT-WORD_BIT) + psrad mm1, (DWORD_BIT-WORD_BIT) + pi2fd mm5, mm5 + pi2fd mm1, mm1 + + pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4, mm2 + movq mm0, mm5 + pfadd mm2, mm1 ; mm2=z11 + pfadd mm5, mm3 ; mm5=z13 + pfsub mm4, mm1 ; mm4=z12 + pfsub mm0, mm3 ; mm0=z10 + + movq mm1, mm2 + pfsub mm2, mm5 + pfadd mm1, mm5 ; mm1=tmp7 + + pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3, mm0 + pfadd mm0, mm4 + pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3, mm0 ; mm3=tmp12 + pfsub 
mm4, mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3, mm1 ; mm3=tmp6 + movq mm5, mm6 + movq mm0, mm7 + pfadd mm6, mm1 ; mm6=data0=(00 01) + pfadd mm7, mm3 ; mm7=data1=(10 11) + pfsub mm5, mm1 ; mm5=data7=(70 71) + pfsub mm0, mm3 ; mm0=data6=(60 61) + pfsub mm2, mm3 ; mm2=tmp5 + + movq mm1, mm6 ; transpose coefficients + punpckldq mm6, mm7 ; mm6=(00 10) + punpckhdq mm1, mm7 ; mm1=(01 11) + movq mm3, mm0 ; transpose coefficients + punpckldq mm0, mm5 ; mm0=(60 70) + punpckhdq mm3, mm5 ; mm3=(61 71) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3 + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm5, MMWORD [wk(1)] ; mm5=tmp3 + + pfadd mm4, mm2 ; mm4=tmp4 + movq mm6, mm7 + movq mm1, mm5 + pfadd mm7, mm2 ; mm7=data2=(20 21) + pfadd mm5, mm4 ; mm5=data4=(40 41) + pfsub mm6, mm2 ; mm6=data5=(50 51) + pfsub mm1, mm4 ; mm1=data3=(30 31) + + movq mm0, mm7 ; transpose coefficients + punpckldq mm7, mm1 ; mm7=(20 30) + punpckhdq mm0, mm1 ; mm0=(21 31) + movq mm3, mm5 ; transpose coefficients + punpckldq mm5, mm6 ; mm5=(40 50) + punpckhdq mm3, mm6 ; mm3=(41 51) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3 + +.nextcolumn: + add esi, byte 2*SIZEOF_JCOEF ; coef_block + add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output 
array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/2 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4, mm0 + movq mm5, mm1 + pfsub mm0, mm2 ; mm0=tmp11 + pfsub mm1, mm3 + pfadd mm4, mm2 ; mm4=tmp10 + pfadd mm5, mm3 ; mm5=tmp13 + + pfmul mm1, [GOTOFF(ebx,PD_1_414)] + pfsub mm1, mm5 ; mm1=tmp12 + + movq mm6, mm4 + movq mm7, mm0 + pfsub mm4, mm5 ; mm4=tmp3 + pfsub mm0, mm1 ; mm0=tmp2 + pfadd mm6, mm5 ; mm6=tmp0 + pfadd mm7, mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4, mm2 + movq mm0, mm5 + pfadd mm2, mm1 ; mm2=z11 + pfadd mm5, mm3 ; mm5=z13 + pfsub mm4, mm1 ; mm4=z12 + pfsub mm0, mm3 ; mm0=z10 + + movq mm1, mm2 + pfsub mm2, mm5 + pfadd mm1, mm5 ; mm1=tmp7 + + pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3, mm0 + pfadd mm0, mm4 + pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3, mm0 ; mm3=tmp12 + pfsub mm4, mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3, mm1 ; mm3=tmp6 + movq mm5, mm6 + movq mm0, mm7 + pfadd mm6, mm1 ; mm6=data0=(00 10) + pfadd mm7, mm3 ; mm7=data1=(01 11) + pfsub mm5, mm1 ; mm5=data7=(07 17) + pfsub mm0, mm3 ; mm0=data6=(06 16) + pfsub mm2, mm3 ; mm2=tmp5 + + movq mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC] + pcmpeqd 
mm3, mm3 + psrld mm3, WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000} + + pfadd mm6, mm1 ; mm6=roundint(data0/8)=(00 ** 10 **) + pfadd mm7, mm1 ; mm7=roundint(data1/8)=(01 ** 11 **) + pfadd mm0, mm1 ; mm0=roundint(data6/8)=(06 ** 16 **) + pfadd mm5, mm1 ; mm5=roundint(data7/8)=(07 ** 17 **) + + pand mm6, mm3 ; mm6=(00 -- 10 --) + pslld mm7, WORD_BIT ; mm7=(-- 01 -- 11) + pand mm0, mm3 ; mm0=(06 -- 16 --) + pslld mm5, WORD_BIT ; mm5=(-- 07 -- 17) + por mm6, mm7 ; mm6=(00 01 10 11) + por mm0, mm5 ; mm0=(06 07 16 17) + + movq mm1, MMWORD [wk(0)] ; mm1=tmp2 + movq mm3, MMWORD [wk(1)] ; mm3=tmp3 + + pfadd mm4, mm2 ; mm4=tmp4 + movq mm7, mm1 + movq mm5, mm3 + pfadd mm1, mm2 ; mm1=data2=(02 12) + pfadd mm3, mm4 ; mm3=data4=(04 14) + pfsub mm7, mm2 ; mm7=data5=(05 15) + pfsub mm5, mm4 ; mm5=data3=(03 13) + + movq mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC] + pcmpeqd mm4, mm4 + psrld mm4, WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000} + + pfadd mm3, mm2 ; mm3=roundint(data4/8)=(04 ** 14 **) + pfadd mm7, mm2 ; mm7=roundint(data5/8)=(05 ** 15 **) + pfadd mm1, mm2 ; mm1=roundint(data2/8)=(02 ** 12 **) + pfadd mm5, mm2 ; mm5=roundint(data3/8)=(03 ** 13 **) + + pand mm3, mm4 ; mm3=(04 -- 14 --) + pslld mm7, WORD_BIT ; mm7=(-- 05 -- 15) + pand mm1, mm4 ; mm1=(02 -- 12 --) + pslld mm5, WORD_BIT ; mm5=(-- 03 -- 13) + por mm3, mm7 ; mm3=(04 05 14 15) + por mm1, mm5 ; mm1=(02 03 12 13) + + movq mm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP] + + packsswb mm6, mm3 ; mm6=(00 01 10 11 04 05 14 15) + packsswb mm1, mm0 ; mm1=(02 03 12 13 06 07 16 17) + paddb mm6, mm2 + paddb mm1, mm2 + + movq mm4, mm6 ; transpose coefficients(phase 2) + punpcklwd mm6, mm1 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm4, mm1 ; mm4=(04 05 06 07 14 15 16 17) + + movq mm7, mm6 ; transpose coefficients(phase 3) + punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW 
[edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + + poppic ebx ; restore GOT address + + add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 2*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm new file mode 100644 index 0000000000..b27ecfdf46 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm @@ -0,0 +1,571 @@ +; +; jidctflt.asm - floating-point IDCT (SSE & MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. 
+ +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_float_sse) + +EXTN(jconst_idct_float_sse): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_0_125 times 4 dd 0.125 ; 1/8 +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_float_sse) + +EXTN(jsimd_idct_float_sse): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns 
from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1, mm0 + packsswb mm1, mm1 + movd eax, mm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm1, mm0 ; mm1=(** 02 ** 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) + psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm3, mm1 ; xmm3=(02 03 ** **) + cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **) + movlhps xmm0, xmm3 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1, xmm0 + movaps xmm2, xmm0 + movaps xmm3, xmm0 + + shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) + shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) + shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) + shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD 
[XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm4, mm0 ; mm4=(** 02 ** 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + punpckhwd mm5, mm1 ; mm5=(** 22 ** 23) + punpcklwd mm1, mm1 ; mm1=(20 20 21 21) + + psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) + psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm4, mm4 ; xmm4=(02 03 ** **) + cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **) + psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) + psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) + cvtpi2ps xmm5, mm5 ; xmm5=(22 23 ** **) + cvtpi2ps xmm1, mm1 ; xmm1=(20 21 ** **) + + punpckhwd mm6, mm2 ; mm6=(** 42 ** 43) + punpcklwd mm2, mm2 ; mm2=(40 40 41 41) + punpckhwd mm7, mm3 ; mm7=(** 62 ** 63) + punpcklwd mm3, mm3 ; mm3=(60 60 61 61) + + psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) + psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) + cvtpi2ps xmm6, mm6 ; xmm6=(42 43 ** **) + cvtpi2ps xmm2, mm2 ; xmm2=(40 41 ** **) + psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) + psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) + cvtpi2ps xmm7, mm7 ; xmm7=(62 63 ** **) + cvtpi2ps xmm3, mm3 ; xmm3=(60 61 ** **) + + movlhps xmm0, xmm4 ; xmm0=in0=(00 01 02 03) + movlhps xmm1, xmm5 ; xmm1=in2=(20 21 22 23) + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm2, xmm6 ; xmm2=in4=(40 41 42 43) + movlhps xmm3, xmm7 ; xmm3=in6=(60 61 62 63) + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; 
xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm6, mm4 ; mm6=(** 12 ** 13) + punpcklwd mm4, mm4 ; mm4=(10 10 11 11) + punpckhwd mm2, mm0 ; mm2=(** 32 ** 33) + punpcklwd mm0, mm0 ; mm0=(30 30 31 31) + + psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) + psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) + cvtpi2ps xmm4, mm6 ; xmm4=(12 13 ** **) + cvtpi2ps xmm2, mm4 ; xmm2=(10 11 ** **) + psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) + psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) + cvtpi2ps xmm0, mm2 ; xmm0=(32 33 ** **) + cvtpi2ps xmm3, mm0 ; xmm3=(30 31 ** **) + + punpckhwd mm7, mm5 ; mm7=(** 52 ** 53) + punpcklwd mm5, mm5 ; mm5=(50 50 51 51) + punpckhwd mm3, mm1 ; mm3=(** 72 ** 73) + punpcklwd mm1, mm1 ; mm1=(70 70 71 71) + + movlhps xmm2, xmm4 ; xmm2=in1=(10 11 12 13) + movlhps xmm3, xmm0 ; xmm3=in3=(30 31 32 33) + + psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) + psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) + cvtpi2ps xmm4, mm7 ; xmm4=(52 53 ** **) + cvtpi2ps xmm5, mm5 ; xmm5=(50 51 ** **) + psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) + psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) + cvtpi2ps xmm0, mm3 ; xmm0=(72 73 ** **) + cvtpi2ps xmm1, mm1 ; xmm1=(70 71 ** **) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm5, xmm4 ; xmm5=in5=(50 51 52 
53) + movlhps xmm1, xmm0 ; xmm1=in7=(70 71 72 73) + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) + movaps xmm3, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm0, xmm7 + movaps xmm3, xmm5 + addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2, xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) + movaps xmm4, xmm5 ; transpose coefficients(phase 1) + 
unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + movaps xmm0, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6, xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + movaps xmm3, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5, xmm1 
; xmm5=data7=(07 17 27 37) + subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, [GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] + + mulps xmm6, xmm1 ; descale(1/8) + mulps xmm7, xmm1 ; descale(1/8) + mulps xmm5, xmm1 ; descale(1/8) + mulps xmm0, xmm1 ; descale(1/8) + + movhlps xmm3, xmm6 + movhlps xmm1, xmm7 + cvtps2pi mm0, xmm6 ; round to int32, mm0=data0L=(00 10) + cvtps2pi mm1, xmm7 ; round to int32, mm1=data1L=(01 11) + cvtps2pi mm2, xmm3 ; round to int32, mm2=data0H=(20 30) + cvtps2pi mm3, xmm1 ; round to int32, mm3=data1H=(21 31) + packssdw mm0, mm2 ; mm0=data0=(00 10 20 30) + packssdw mm1, mm3 ; mm1=data1=(01 11 21 31) + + movhlps xmm6, xmm5 + movhlps xmm7, xmm0 + cvtps2pi mm4, xmm5 ; round to int32, mm4=data7L=(07 17) + cvtps2pi mm5, xmm0 ; round to int32, mm5=data6L=(06 16) + cvtps2pi mm6, xmm6 ; round to int32, mm6=data7H=(27 37) + cvtps2pi mm7, xmm7 ; round to int32, mm7=data6H=(26 36) + packssdw mm4, mm6 ; mm4=data7=(07 17 27 37) + packssdw mm5, mm7 ; mm5=data6=(06 16 26 36) + + packsswb mm0, mm5 ; mm0=(00 10 20 30 06 16 26 36) + packsswb mm1, mm4 ; mm1=(01 11 21 31 07 17 27 37) + + movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2 + movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movaps xmm6, [GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm5, xmm3 + movaps xmm0, xmm1 + addps xmm3, xmm2 ; xmm3=data2=(02 12 22 32) + addps xmm1, xmm4 ; xmm1=data4=(04 14 24 34) + subps xmm5, xmm2 ; xmm5=data5=(05 15 25 35) + subps xmm0, xmm4 ; xmm0=data3=(03 13 23 33) + + mulps xmm3, xmm6 ; descale(1/8) + mulps xmm1, xmm6 ; descale(1/8) + mulps xmm5, xmm6 ; descale(1/8) + mulps xmm0, xmm6 ; descale(1/8) + + movhlps xmm7, xmm3 + movhlps xmm2, xmm1 + cvtps2pi mm2, xmm3 ; round to int32, mm2=data2L=(02 12) + cvtps2pi mm3, xmm1 ; round to int32, mm3=data4L=(04 14) + cvtps2pi mm6, xmm7 ; round to int32, mm6=data2H=(22 32) + cvtps2pi mm7, xmm2 ; round to int32, mm7=data4H=(24 34) + packssdw mm2, mm6 ; mm2=data2=(02 12 
22 32) + packssdw mm3, mm7 ; mm3=data4=(04 14 24 34) + + movhlps xmm4, xmm5 + movhlps xmm6, xmm0 + cvtps2pi mm5, xmm5 ; round to int32, mm5=data5L=(05 15) + cvtps2pi mm4, xmm0 ; round to int32, mm4=data3L=(03 13) + cvtps2pi mm6, xmm4 ; round to int32, mm6=data5H=(25 35) + cvtps2pi mm7, xmm6 ; round to int32, mm7=data3H=(23 33) + packssdw mm5, mm6 ; mm5=data5=(05 15 25 35) + packssdw mm4, mm7 ; mm4=data3=(03 13 23 33) + + movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm2, mm3 ; mm2=(02 12 22 32 04 14 24 34) + packsswb mm4, mm5 ; mm4=(03 13 23 33 05 15 25 35) + + paddb mm0, mm6 + paddb mm1, mm6 + paddb mm2, mm6 + paddb mm4, mm6 + + movq mm7, mm0 ; transpose coefficients(phase 1) + punpcklbw mm0, mm1 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm7, mm1 ; mm7=(06 07 16 17 26 27 36 37) + movq mm3, mm2 ; transpose coefficients(phase 1) + punpcklbw mm2, mm4 ; mm2=(02 03 12 13 22 23 32 33) + punpckhbw mm3, mm4 ; mm3=(04 05 14 15 24 25 34 35) + + movq mm5, mm0 ; transpose coefficients(phase 2) + punpcklwd mm0, mm2 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm5, mm2 ; mm5=(20 21 22 23 30 31 32 33) + movq mm6, mm3 ; transpose coefficients(phase 2) + punpcklwd mm3, mm7 ; mm3=(04 05 06 07 14 15 16 17) + punpckhwd mm6, mm7 ; mm6=(24 25 26 27 34 35 36 37) + + movq mm1, mm0 ; transpose coefficients(phase 3) + punpckldq mm0, mm3 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm1, mm3 ; mm1=(10 11 12 13 14 15 16 17) + movq mm4, mm5 ; transpose coefficients(phase 3) + punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + 
poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm new file mode 100644 index 0000000000..c646eaef76 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm @@ -0,0 +1,497 @@ +; +; jidctflt.asm - floating-point IDCT (SSE & SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. 
+ +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_float_sse2) + +EXTN(jconst_idct_float_sse2): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. 
+; +; GLOBAL(void) +; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_float_sse2) + +EXTN(jsimd_idct_float_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1, xmm2 + por xmm3, xmm4 + por xmm5, xmm6 + por xmm1, xmm3 + por xmm5, xmm7 + por xmm1, xmm5 + packsswb xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1, xmm0 + movaps xmm2, xmm0 + movaps xmm3, xmm0 + + shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) + shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) + shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) + shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD 
[XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd 
xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) + movaps xmm3, xmm0 ; transpose coefficients(phase 1) + unpcklps 
xmm0, xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm0, xmm7 + movaps xmm3, xmm5 + addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2, xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) + movaps xmm4, xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + movaps xmm0, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6, xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + movaps xmm3, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + 
add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, 
[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] + pcmpeqd xmm3, xmm3 + psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm7, xmm1 + movaps xmm5, xmm3 + addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] + pcmpeqd xmm4, xmm4 + psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7, xmm2 ; 
xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6, xmm2 + paddb xmm1, xmm2 + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop 
ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm new file mode 100644 index 0000000000..24622d4369 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm @@ -0,0 +1,499 @@ +; +; jidctfst.asm - fast integer IDCT (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_idct_ifast_mmx) + +EXTN(jconst_idct_ifast_mmx): + +PW_F1414 times 4 dw F_1_414 << CONST_SHIFT +PW_F1847 times 4 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 4 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. 
+; +; GLOBAL(void) +; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_ifast_mmx) + +EXTN(jsimd_idct_ifast_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array.
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1, mm0 + packsswb mm1, mm1 + movd eax, mm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm2, mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + punpckhwd mm2, mm2 ; mm2=(02 02 03 03) + + movq mm1, mm0 + punpckldq mm0, mm0 ; mm0=(00 00 00 00) + punpckhdq mm1, mm1 ; mm1=(01 01 01 01) + movq mm3, mm2 + punpckldq mm2, mm2 ; mm2=(02 02 02 02) + punpckhdq mm3, mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm1, MMWORD 
[MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4, mm0 + movq mm5, mm1 + psubw mm0, mm2 ; mm0=tmp11 + psubw mm1, mm3 + paddw mm4, mm2 ; mm4=tmp10 + paddw mm5, mm3 ; mm5=tmp13 + + psllw mm1, PRE_MULTIPLY_SCALE_BITS + pmulhw mm1, [GOTOFF(ebx,PW_F1414)] + psubw mm1, mm5 ; mm1=tmp12 + + movq mm6, mm4 + movq mm7, mm0 + psubw mm4, mm5 ; mm4=tmp3 + psubw mm0, mm1 ; mm0=tmp2 + paddw mm6, mm5 ; mm6=tmp0 + paddw mm7, mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4, mm2 + movq mm0, mm5 + psubw mm2, mm1 ; mm2=z12 + psubw mm5, mm3 ; mm5=z10 + paddw mm4, mm1 ; mm4=z11 + paddw mm0, mm3 ; mm0=z13 + + movq mm1, mm5 ; mm1=z10(unscaled) + psllw mm2, PRE_MULTIPLY_SCALE_BITS + psllw mm5, PRE_MULTIPLY_SCALE_BITS + + movq mm3, mm4 + psubw mm4, mm0 + paddw mm3, mm0 ; mm3=tmp7 + + psllw mm4, PRE_MULTIPLY_SCALE_BITS + pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0, mm5 + paddw mm5, mm2 + pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0, [GOTOFF(ebx,PW_MF1613)] + pmulhw mm2, [GOTOFF(ebx,PW_F1082)] + psubw mm0, mm1 + psubw mm2, mm5 ; mm2=tmp10 + paddw mm0, mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0, mm3 ; mm0=tmp6 + movq mm1, mm6 + movq mm5, mm7 + paddw mm6, mm3 ; mm6=data0=(00 01 02 03) + paddw mm7, mm0 ; mm7=data1=(10 11 12 13) + psubw mm1, mm3 ; mm1=data7=(70 71 72 73) + psubw mm5, mm0 ; mm5=data6=(60 61 62 63) + psubw mm4, mm0 ; mm4=tmp5 + + movq mm3, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 10 01 11) + punpckhwd mm3, mm7 ; mm3=(02 12 03 13) + movq mm0, mm5 ; transpose coefficients(phase 1) + punpcklwd mm5, mm1 ; mm5=(60 70 61 71) + punpckhwd mm0, mm1 ; mm0=(62 72 63 73) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm1, MMWORD [wk(1)] ; mm1=tmp3 + + movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71) + movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73) + + paddw mm2, mm4 ; mm2=tmp4 + movq mm5, mm7 + movq mm0, mm1 + paddw mm7, mm4 ; mm7=data2=(20 21 22 23) + paddw mm1, mm2 ; mm1=data4=(40 41 42 43) + psubw mm5, mm4 ; mm5=data5=(50 51 52 53) + psubw mm0, mm2 ; mm0=data3=(30 31 32 33) + + movq mm4, mm7 ; transpose coefficients(phase 1) + punpcklwd mm7, mm0 ; mm7=(20 30 21 31) + punpckhwd mm4, mm0 ; mm4=(22 32 23 33) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm5 ; mm1=(40 50 41 51) + punpckhwd mm2, mm5 ; mm2=(42 52 43 53) + + movq mm0, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm7 ; mm6=(00 10 20 30) + punpckhdq mm0, mm7 ; mm0=(01 11 21 31) + movq mm5, mm3 ; transpose coefficients(phase 2) + punpckldq mm3, mm4 ; mm3=(02 12 22 32) + punpckhdq mm5, mm4 ; mm5=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71) + movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73) + + 
movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm7 ; mm1=(40 50 60 70) + punpckhdq mm6, mm7 ; mm6=(41 51 61 71) + movq mm0, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm4 ; mm2=(42 52 62 72) + punpckhdq mm0, mm4 ; mm0=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + movq mm4, mm0 + movq mm5, mm1 + psubw mm0, mm2 ; mm0=tmp11 + psubw mm1, mm3 + paddw mm4, mm2 ; mm4=tmp10 + paddw mm5, mm3 ; mm5=tmp13 + + psllw mm1, PRE_MULTIPLY_SCALE_BITS + pmulhw mm1, [GOTOFF(ebx,PW_F1414)] + psubw mm1, mm5 ; mm1=tmp12 + + movq mm6, mm4 + movq mm7, mm0 + psubw mm4, mm5 ; mm4=tmp3 + psubw mm0, mm1 ; mm0=tmp2 + paddw mm6, mm5 ; mm6=tmp0 + paddw mm7, mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD 
[MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4, mm2 + movq mm0, mm5 + psubw mm2, mm1 ; mm2=z12 + psubw mm5, mm3 ; mm5=z10 + paddw mm4, mm1 ; mm4=z11 + paddw mm0, mm3 ; mm0=z13 + + movq mm1, mm5 ; mm1=z10(unscaled) + psllw mm2, PRE_MULTIPLY_SCALE_BITS + psllw mm5, PRE_MULTIPLY_SCALE_BITS + + movq mm3, mm4 + psubw mm4, mm0 + paddw mm3, mm0 ; mm3=tmp7 + + psllw mm4, PRE_MULTIPLY_SCALE_BITS + pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0, mm5 + paddw mm5, mm2 + pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0, [GOTOFF(ebx,PW_MF1613)] + pmulhw mm2, [GOTOFF(ebx,PW_F1082)] + psubw mm0, mm1 + psubw mm2, mm5 ; mm2=tmp10 + paddw mm0, mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0, mm3 ; mm0=tmp6 + movq mm1, mm6 + movq mm5, mm7 + paddw mm6, mm3 ; mm6=data0=(00 10 20 30) + paddw mm7, mm0 ; mm7=data1=(01 11 21 31) + psraw mm6, (PASS1_BITS+3) ; descale + psraw mm7, (PASS1_BITS+3) ; descale + psubw mm1, mm3 ; mm1=data7=(07 17 27 37) + psubw mm5, mm0 ; mm5=data6=(06 16 26 36) + psraw mm1, (PASS1_BITS+3) ; descale + psraw mm5, (PASS1_BITS+3) ; descale + psubw mm4, mm0 ; mm4=tmp5 + + packsswb mm6, mm5 ; mm6=(00 10 20 30 06 16 26 36) + packsswb mm7, mm1 ; mm7=(01 11 21 31 07 17 27 37) + + movq mm3, MMWORD [wk(0)] ; mm3=tmp2 + movq mm0, MMWORD [wk(1)] ; mm0=tmp3 + + paddw mm2, mm4 ; mm2=tmp4 + movq mm5, mm3 + movq mm1, mm0 + paddw mm3, mm4 ; mm3=data2=(02 12 22 32) + paddw mm0, mm2 ; mm0=data4=(04 14 24 34) + psraw mm3, (PASS1_BITS+3) ; descale + psraw mm0, (PASS1_BITS+3) ; descale + psubw mm5, mm4 ; mm5=data5=(05 15 25 35) + psubw mm1, mm2 ; mm1=data3=(03 13 23 33) + psraw mm5, (PASS1_BITS+3) ; descale + psraw mm1, (PASS1_BITS+3) ; descale + + movq mm4, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP] + + 
packsswb mm3, mm0 ; mm3=(02 12 22 32 04 14 24 34) + packsswb mm1, mm5 ; mm1=(03 13 23 33 05 15 25 35) + + paddb mm6, mm4 + paddb mm7, mm4 + paddb mm3, mm4 + paddb mm1, mm4 + + movq mm2, mm6 ; transpose coefficients(phase 1) + punpcklbw mm6, mm7 ; mm6=(00 01 10 11 20 21 30 31) + punpckhbw mm2, mm7 ; mm2=(06 07 16 17 26 27 36 37) + movq mm0, mm3 ; transpose coefficients(phase 1) + punpcklbw mm3, mm1 ; mm3=(02 03 12 13 22 23 32 33) + punpckhbw mm0, mm1 ; mm0=(04 05 14 15 24 25 34 35) + + movq mm5, mm6 ; transpose coefficients(phase 2) + punpcklwd mm6, mm3 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm5, mm3 ; mm5=(20 21 22 23 30 31 32 33) + movq mm4, mm0 ; transpose coefficients(phase 2) + punpcklwd mm0, mm2 ; mm0=(04 05 06 07 14 15 16 17) + punpckhwd mm4, mm2 ; mm4=(24 25 26 27 34 35 36 37) + + movq mm7, mm6 ; transpose coefficients(phase 3) + punpckldq mm6, mm0 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7, mm0 ; mm7=(10 11 12 13 14 15 16 17) + movq mm1, mm5 ; transpose coefficients(phase 3) + punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm new file mode 100644 index 0000000000..19704ffa48 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm @@ -0,0 +1,501 @@ +; +; jidctfst.asm - fast integer IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_idct_ifast_sse2) + +EXTN(jconst_idct_ifast_sse2): + +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. 
+; +; GLOBAL(void) +; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_ifast_sse2) + +EXTN(jsimd_idct_ifast_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; row 0 * quantptr row 0 (IFAST table) + + movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd
xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + psubw xmm0, xmm2 ; xmm0=tmp11 + psubw xmm1, xmm3 + paddw xmm4, xmm2 ; xmm4=tmp10 + paddw xmm5, xmm3 ; xmm5=tmp13 + + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1, [GOTOFF(ebx,PW_F1414)] + psubw xmm1, xmm5 ; xmm1=tmp12 + + movdqa xmm6, xmm4 + movdqa xmm7, xmm0 + psubw xmm4, xmm5 ; xmm4=tmp3 + psubw xmm0, xmm1 ; xmm0=tmp2 + paddw xmm6, xmm5 ; xmm6=tmp0 + paddw xmm7, xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD 
[XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm2 + movdqa xmm0, xmm5 + psubw xmm2, xmm1 ; xmm2=z12 + psubw xmm5, xmm3 ; xmm5=z10 + paddw xmm4, xmm1 ; xmm4=z11 + paddw xmm0, xmm3 ; xmm0=z13 + + movdqa xmm1, xmm5 ; xmm1=z10(unscaled) + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3, xmm4 + psubw xmm4, xmm0 + paddw xmm3, xmm0 ; xmm3=tmp7 + + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0, xmm5 + paddw xmm5, xmm2 + pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5 + pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)] + pmulhw xmm2, [GOTOFF(ebx,PW_F1082)] + psubw xmm0, xmm1 + psubw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm0, xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0, xmm3 ; xmm0=tmp6 + movdqa xmm1, xmm6 + movdqa xmm5, xmm7 + paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4, xmm0 ; xmm4=tmp5 + + movdqa xmm3, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; 
wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2, xmm4 ; xmm2=tmp4 + movdqa xmm5, xmm7 + movdqa xmm0, xmm1 + paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0, xmm3 ; transpose coefficients(phase 2) + punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7, xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7, xmm3 ; 
xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7, xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2, xmm6 + movdqa xmm0, xmm5 + psubw xmm6, xmm1 ; xmm6=tmp11 + psubw xmm5, xmm3 + paddw xmm2, xmm1 ; xmm2=tmp10 + paddw xmm0, xmm3 ; xmm0=tmp13 + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [GOTOFF(ebx,PW_F1414)] + psubw xmm5, xmm0 ; xmm5=tmp12 + + movdqa xmm1, xmm2 + movdqa xmm3, xmm6 + psubw xmm2, xmm0 ; xmm2=tmp3 + psubw xmm6, xmm5 ; xmm6=tmp2 + paddw xmm1, xmm0 ; xmm1=tmp0 + paddw xmm3, xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2, xmm0 + movdqa xmm6, xmm4 + psubw xmm0, xmm7 ; xmm0=z12 + psubw xmm4, xmm5 ; xmm4=z10 + paddw xmm2, xmm7 ; xmm2=z11 + paddw xmm6, xmm5 ; xmm6=z13 + + movdqa 
xmm7, xmm4 ; xmm7=z10(unscaled) + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5, xmm2 + psubw xmm2, xmm6 + paddw xmm5, xmm6 ; xmm5=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6, xmm4 + paddw xmm4, xmm0 + pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5 + pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)] + pmulhw xmm0, [GOTOFF(ebx,PW_F1082)] + psubw xmm6, xmm7 + psubw xmm0, xmm4 ; xmm0=tmp10 + paddw xmm6, xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6, xmm5 ; xmm6=tmp6 + movdqa xmm7, xmm1 + movdqa xmm4, xmm3 + paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1, (PASS1_BITS+3) ; descale + psraw xmm3, (PASS1_BITS+3) ; descale + psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7, (PASS1_BITS+3) ; descale + psraw xmm4, (PASS1_BITS+3) ; descale + psubw xmm2, xmm6 ; xmm2=tmp5 + + packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0, xmm2 ; xmm0=tmp4 + movdqa xmm4, xmm5 + movdqa xmm7, xmm6 + paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5, (PASS1_BITS+3) ; descale + psraw xmm6, (PASS1_BITS+3) ; descale + psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4, (PASS1_BITS+3) ; descale + psraw xmm7, (PASS1_BITS+3) ; descale + + movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; 
xmm2=[PB_CENTERJSAMP] + + packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1, xmm2 + paddb xmm3, xmm2 + paddb xmm5, xmm2 + paddb xmm7, xmm2 + + movdqa xmm0, xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD 
[edx+eax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm new file mode 100644 index 0000000000..199c7df3b6 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm @@ -0,0 +1,453 @@ +; +; jidctint.asm - accurate integer IDCT (AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). 
The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions +; %1-%4: 
Input/output registers +; %5-%8: Temp registers + +%macro dotranspose 8 + ; %1=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71) + ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) + ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) + ; %4=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76) + + vpermq %5, %1, 0xD8 + vpermq %6, %2, 0x72 + vpermq %7, %3, 0xD8 + vpermq %8, %4, 0x72 + ; transpose coefficients(phase 1) + ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71) + ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73) + ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75) + ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77) + + vpunpcklwd %1, %5, %6 + vpunpckhwd %2, %5, %6 + vpunpcklwd %3, %7, %8 + vpunpckhwd %4, %7, %8 + ; transpose coefficients(phase 2) + ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72) + ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73) + ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76) + ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77) + + vpunpcklwd %5, %1, %2 + vpunpcklwd %6, %3, %4 + vpunpckhwd %7, %1, %2 + vpunpckhwd %8, %3, %4 + ; transpose coefficients(phase 3) + ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53) + ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57) + ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73) + ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77) + + vpunpcklqdq %1, %5, %6 + vpunpckhqdq %2, %5, %6 + vpunpcklqdq %3, %7, %8 + vpunpckhqdq %4, %7, %8 + ; transpose coefficients(phase 4) + ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) +%endmacro + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions +; %1-%4: Input/output registers +; %5-%12: Temp registers (callers pass memory operands for %9-%12) +; %13: Pass (1 or 2) +
+%macro dodct 13 + ; -- Even part + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2 + vpunpcklwd %5, %3, %6 ; %5=in26_62L + vpunpckhwd %6, %3, %6 ; %6=in26_62H + vpmaddwd %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %5=tmp3_2L + vpmaddwd %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %6=tmp3_2H + + vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0 + vpsignw %1, %1, [GOTOFF(ebx,PW_1_NEG1)] + vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4) + + vpxor %1, %1, %1 + vpunpcklwd %8, %1, %7 ; %8=tmp0_1L + vpunpckhwd %1, %1, %7 ; %1=tmp0_1H + vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS + vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS + + vpsubd %3, %8, %5 + vmovdqu %11, %3 ; %11=tmp0_1L-tmp3_2L=tmp13_12L + vpaddd %3, %8, %5 + vmovdqu %9, %3 ; %9=tmp0_1L+tmp3_2L=tmp10_11L + vpsubd %3, %1, %6 + vmovdqu %12, %3 ; %12=tmp0_1H-tmp3_2H=tmp13_12H + vpaddd %3, %1, %6 + vmovdqu %10, %3 ; %10=tmp0_1H+tmp3_2H=tmp10_11H + + ; -- Odd part + + vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3 + vpunpcklwd %7, %1, %8 ; %7=z34_43L + vpunpckhwd %8, %1, %8 ; %8=z34_43H + vpmaddwd %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %7=z3_4L + vpmaddwd %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %8=z3_4H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 *
-0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3 + vpunpcklwd %3, %4, %2 ; %3=in71_53L + vpunpckhwd %4, %4, %2 ; %4=in71_53H + + vpmaddwd %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %5=tmp0_1L + vpmaddwd %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %6=tmp0_1H + vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L + vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H + + vpmaddwd %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %3=tmp3_2L + vpmaddwd %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %4=tmp3_2H + vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L + vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H + vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L + vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H + + ; -- Final output stage + + vmovdqu %3, %9 + vmovdqu %4, %10 + + vpaddd %1, %3, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L + vpaddd %2, %4, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H + vpaddd %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpsrad %1, %1, DESCALE_P %+ %13 + vpsrad %2, %2, DESCALE_P %+ %13 + vpackssdw %1, %1, %2 ; %1=data0_1 + + vpsubd %3, %3, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L + vpsubd %4, %4, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H + vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpaddd %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpsrad %3, %3, DESCALE_P %+ %13 + vpsrad %4, %4, DESCALE_P %+ %13 + vpackssdw %4, %3, %4 ; %4=data7_6 + + vmovdqu %7, %11 + vmovdqu %8, %12 + + vpaddd %2, %7, %5 ; %2=tmp13_12L+tmp0_1L=data3_2L + vpaddd %3, %8, %6 ; %3=tmp13_12H+tmp0_1H=data3_2H + vpaddd
%2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpsrad %2, %2, DESCALE_P %+ %13 + vpsrad %3, %3, DESCALE_P %+ %13 + vpackssdw %2, %2, %3 ; %2=data3_2 + + vpsubd %3, %7, %5 ; %3=tmp13_12L-tmp0_1L=data4_5L + vpsubd %6, %8, %6 ; %6=tmp13_12H-tmp0_1H=data4_5H + vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpaddd %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpsrad %3, %3, DESCALE_P %+ %13 + vpsrad %6, %6, DESCALE_P %+ %13 + vpackssdw %3, %3, %6 ; %3=data4_5 +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_islow_avx2) + +EXTN(jconst_idct_islow_avx2): + +PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 + times 4 dw (F_0_541 - F_1_847), F_0_541 +PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 + times 4 dw (F_1_175 - F_0_390), F_1_175 +PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 + times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899) + times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) +PB_CENTERJSAMP times 32 db CENTERJSAMPLE +PW_1_NEG1 times 8 dw 1 + times 8 dw -1 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients.
+; +; GLOBAL(void) +; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 4 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_avx2) + +EXTN(jsimd_idct_islow_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2 + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + vpor xmm1, xmm1, xmm0 + vpacksswb xmm1, xmm1, xmm1 + vpacksswb xmm1, xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + vpsllw xmm5, xmm5, PASS1_BITS + + vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00
00 01 01 02 02 03 03) + vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07) + vinserti128 ymm4, ymm4, xmm5, 1 + + vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04) + vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05) + vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06) + vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07) + + jmp near .column_end + alignx 16, 7 +%endif +.columnDCT: + + vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; ymm4=in0_1 + vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)] ; ymm5=in2_3 + vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)] ; ymm6=in4_5 + vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)] ; ymm7=in6_7 + vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4 + vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1 + vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6 + vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5 + + dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1 + ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6 + + dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7 + +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows.
+ + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5 + vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1 + + dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2 + ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6 + + dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7 + + vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45 + vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67 + vpaddb ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)] + vpaddb ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)] + + vextracti128 xmm6, ymm1, 1 ; xmm6=data67 + vextracti128 xmm4, ymm0, 1 ; xmm4=data45 + vextracti128 xmm2, ymm1, 0 ; xmm2=data23 + vextracti128 xmm0, ymm0, 0 ; xmm0=data01 + + vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + vzeroupper + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 + + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 + + mov edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov esi, JSAMPROW
[edi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm new file mode 100644 index 0000000000..f15c8d34bc --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm @@ -0,0 +1,851 @@ +; +; jidctint.asm - accurate integer IDCT (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details.
+ +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_islow_mmx) + +EXTN(jconst_idct_islow_mmx): + +PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, 
(F_0_541 - F_1_847) +PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 12 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_mmx) + +EXTN(jsimd_idct_islow_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1, mm0 + packsswb mm1, mm1 + movd eax, mm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0, PASS1_BITS + + movq mm2, mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + punpckhwd mm2, mm2 ; mm2=(02 02 03 03) + + movq mm1, mm0 + punpckldq mm0, mm0 ; mm0=(00 00 00 00) + punpckhdq mm1, mm1 ; mm1=(01 01 01 01) + movq mm3, mm2 + punpckldq mm2, mm2 ; mm2=(02 02 02 02) + punpckhdq mm3, mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD 
[MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4, mm1 ; mm1=in2=z2 + movq mm5, mm1 + punpcklwd mm4, mm3 ; mm3=in6=z3 + punpckhwd mm5, mm3 + movq mm1, mm4 + movq mm3, mm5 + pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6, mm0 + paddw mm0, mm2 ; mm0=in0+in4 + psubw mm6, mm2 ; mm6=in0-in4 + + pxor mm7, mm7 + pxor mm2, mm2 + punpcklwd mm7, mm0 ; mm7=tmp0L + punpckhwd mm2, mm0 ; mm2=tmp0H + psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0, mm7 + paddd mm7, mm4 ; mm7=tmp10L + psubd mm0, mm4 ; mm0=tmp13L + movq mm4, mm2 + paddd mm2, mm5 ; mm2=tmp10H + psubd mm4, mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5, mm5 + pxor mm7, mm7 + punpcklwd mm5, mm6 ; mm5=tmp1L + punpckhwd mm7, mm6 ; mm7=tmp1H + psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2, mm5 + paddd mm5, mm1 ; mm5=tmp11L + psubd mm2, mm1 ; mm2=tmp12L + movq mm0, mm7 + paddd mm7, mm3 ; mm7=tmp11H + psubd mm0, mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; 
wk(5)=tmp11H + movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movq mm5, mm6 + movq mm7, mm4 + paddw mm5, mm3 ; mm5=z3 + paddw mm7, mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2, mm5 + movq mm0, mm5 + punpcklwd mm2, mm7 + punpckhwd mm0, mm7 + movq mm5, mm2 + movq mm7, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 
+= z4; + + movq mm2, mm3 + movq mm0, mm3 + punpcklwd mm2, mm4 + punpckhwd mm0, mm4 + movq mm3, mm2 + movq mm4, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3, mm5 ; mm3=tmp3L + paddd mm4, mm7 ; mm4=tmp3H + + movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2, mm1 + movq mm0, mm1 + punpcklwd mm2, mm6 + punpckhwd mm0, mm6 + movq mm1, mm2 + movq mm6, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2, mm5 ; mm2=tmp1L + paddd mm0, mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2, mm5 + movq mm0, mm7 + paddd mm5, mm3 ; mm5=data0L + paddd mm7, mm4 ; mm7=data0H + psubd mm2, mm3 ; mm2=data7L + psubd mm0, mm4 ; mm0=data7H + + movq mm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1] + + paddd mm5, mm3 + paddd mm7, mm3 + psrad mm5, DESCALE_P1 + psrad mm7, DESCALE_P1 + paddd mm2, mm3 + paddd mm0, mm3 + psrad mm2, DESCALE_P1 + psrad mm0, DESCALE_P1 + + packssdw mm5, mm7 ; mm5=data0=(00 01 02 03) + packssdw mm2, mm0 ; mm2=data7=(70 71 72 73) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7, mm4 + movq mm0, mm3 + paddd mm4, mm1 ; mm4=data1L + paddd mm3, mm6 ; mm3=data1H + psubd mm7, mm1 ; mm7=data6L + psubd mm0, mm6 ; mm0=data6H + + movq mm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; 
mm1=[PD_DESCALE_P1] + + paddd mm4, mm1 + paddd mm3, mm1 + psrad mm4, DESCALE_P1 + psrad mm3, DESCALE_P1 + paddd mm7, mm1 + paddd mm0, mm1 + psrad mm7, DESCALE_P1 + psrad mm0, DESCALE_P1 + + packssdw mm4, mm3 ; mm4=data1=(10 11 12 13) + packssdw mm7, mm0 ; mm7=data6=(60 61 62 63) + + movq mm6, mm5 ; transpose coefficients(phase 1) + punpcklwd mm5, mm4 ; mm5=(00 10 01 11) + punpckhwd mm6, mm4 ; mm6=(02 12 03 13) + movq mm1, mm7 ; transpose coefficients(phase 1) + punpcklwd mm7, mm2 ; mm7=(60 70 61 71) + punpckhwd mm1, mm2 ; mm1=(62 72 63 73) + + movq mm3, MMWORD [wk(6)] ; mm3=tmp12L + movq mm0, MMWORD [wk(7)] ; mm0=tmp12H + movq mm4, MMWORD [wk(10)] ; mm4=tmp1L + movq mm2, MMWORD [wk(11)] ; mm2=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11) + movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13) + movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71) + movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73) + + movq mm5, mm3 + movq mm6, mm0 + paddd mm3, mm4 ; mm3=data2L + paddd mm0, mm2 ; mm0=data2H + psubd mm5, mm4 ; mm5=data5L + psubd mm6, mm2 ; mm6=data5H + + movq mm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1] + + paddd mm3, mm7 + paddd mm0, mm7 + psrad mm3, DESCALE_P1 + psrad mm0, DESCALE_P1 + paddd mm5, mm7 + paddd mm6, mm7 + psrad mm5, DESCALE_P1 + psrad mm6, DESCALE_P1 + + packssdw mm3, mm0 ; mm3=data2=(20 21 22 23) + packssdw mm5, mm6 ; mm5=data5=(50 51 52 53) + + movq mm1, MMWORD [wk(2)] ; mm1=tmp13L + movq mm4, MMWORD [wk(3)] ; mm4=tmp13H + movq mm2, MMWORD [wk(8)] ; mm2=tmp0L + movq mm7, MMWORD [wk(9)] ; mm7=tmp0H + + movq mm0, mm1 + movq mm6, mm4 + paddd mm1, mm2 ; mm1=data3L + paddd mm4, mm7 ; mm4=data3H + psubd mm0, mm2 ; mm0=data4L + psubd mm6, mm7 ; mm6=data4H + + movq mm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1] + + paddd mm1, mm2 + paddd mm4, mm2 + psrad mm1, DESCALE_P1 + psrad mm4, DESCALE_P1 + paddd mm0, mm2 + paddd mm6, mm2 + psrad mm0, DESCALE_P1 + psrad mm6, DESCALE_P1 + + packssdw mm1, mm4 ; mm1=data3=(30 31 32 33) + packssdw mm0, mm6 ; 
mm0=data4=(40 41 42 43) + + movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11) + movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13) + + movq mm4, mm3 ; transpose coefficients(phase 1) + punpcklwd mm3, mm1 ; mm3=(20 30 21 31) + punpckhwd mm4, mm1 ; mm4=(22 32 23 33) + movq mm6, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm5 ; mm0=(40 50 41 51) + punpckhwd mm6, mm5 ; mm6=(42 52 43 53) + + movq mm1, mm7 ; transpose coefficients(phase 2) + punpckldq mm7, mm3 ; mm7=(00 10 20 30) + punpckhdq mm1, mm3 ; mm1=(01 11 21 31) + movq mm5, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm4 ; mm2=(02 12 22 32) + punpckhdq mm5, mm4 ; mm5=(03 13 23 33) + + movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71) + movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm7, mm0 ; transpose coefficients(phase 2) + punpckldq mm0, mm3 ; mm0=(40 50 60 70) + punpckhdq mm7, mm3 ; mm7=(41 51 61 71) + movq mm1, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm4 ; mm6=(42 52 62 72) + punpckhdq mm1, mm4 ; mm1=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4, mm1 ; mm1=in2=z2 + movq mm5, mm1 + punpcklwd mm4, mm3 ; mm3=in6=z3 + punpckhwd mm5, mm3 + movq mm1, mm4 + movq mm3, mm5 + pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6, mm0 + paddw mm0, mm2 ; mm0=in0+in4 + psubw mm6, mm2 ; mm6=in0-in4 + + pxor mm7, mm7 + pxor mm2, mm2 + punpcklwd mm7, mm0 ; mm7=tmp0L + punpckhwd mm2, mm0 ; mm2=tmp0H + psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0, mm7 + paddd mm7, mm4 ; mm7=tmp10L + psubd mm0, mm4 ; mm0=tmp13L + movq mm4, mm2 + paddd mm2, mm5 ; mm2=tmp10H + psubd mm4, mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5, mm5 + pxor mm7, mm7 + punpcklwd mm5, mm6 ; mm5=tmp1L + punpckhwd mm7, mm6 ; mm7=tmp1H + psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2, mm5 + paddd mm5, mm1 ; mm5=tmp11L + psubd 
mm2, mm1 ; mm2=tmp12L + movq mm0, mm7 + paddd mm7, mm3 ; mm7=tmp11H + psubd mm0, mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H + movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm5, mm6 + movq mm7, mm4 + paddw mm5, mm3 ; mm5=z3 + paddw mm7, mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2, mm5 + movq mm0, mm5 + punpcklwd mm2, mm7 + punpckhwd mm0, mm7 + movq mm5, mm2 + movq mm7, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movq mm2, mm3 + movq mm0, mm3 + punpcklwd mm2, mm4 + punpckhwd mm0, mm4 
+ movq mm3, mm2 + movq mm4, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3, mm5 ; mm3=tmp3L + paddd mm4, mm7 ; mm4=tmp3H + + movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2, mm1 + movq mm0, mm1 + punpcklwd mm2, mm6 + punpckhwd mm0, mm6 + movq mm1, mm2 + movq mm6, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2, mm5 ; mm2=tmp1L + paddd mm0, mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2, mm5 + movq mm0, mm7 + paddd mm5, mm3 ; mm5=data0L + paddd mm7, mm4 ; mm7=data0H + psubd mm2, mm3 ; mm2=data7L + psubd mm0, mm4 ; mm0=data7H + + movq mm3, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2] + + paddd mm5, mm3 + paddd mm7, mm3 + psrad mm5, DESCALE_P2 + psrad mm7, DESCALE_P2 + paddd mm2, mm3 + paddd mm0, mm3 + psrad mm2, DESCALE_P2 + psrad mm0, DESCALE_P2 + + packssdw mm5, mm7 ; mm5=data0=(00 10 20 30) + packssdw mm2, mm0 ; mm2=data7=(07 17 27 37) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7, mm4 + movq mm0, mm3 + paddd mm4, mm1 ; mm4=data1L + paddd mm3, mm6 ; mm3=data1H + psubd mm7, mm1 ; mm7=data6L + psubd mm0, mm6 ; mm0=data6H + + movq mm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2] + + paddd mm4, mm1 + paddd mm3, mm1 + psrad mm4, DESCALE_P2 + psrad 
mm3, DESCALE_P2 + paddd mm7, mm1 + paddd mm0, mm1 + psrad mm7, DESCALE_P2 + psrad mm0, DESCALE_P2 + + packssdw mm4, mm3 ; mm4=data1=(01 11 21 31) + packssdw mm7, mm0 ; mm7=data6=(06 16 26 36) + + packsswb mm5, mm7 ; mm5=(00 10 20 30 06 16 26 36) + packsswb mm4, mm2 ; mm4=(01 11 21 31 07 17 27 37) + + movq mm6, MMWORD [wk(6)] ; mm6=tmp12L + movq mm1, MMWORD [wk(7)] ; mm1=tmp12H + movq mm3, MMWORD [wk(10)] ; mm3=tmp1L + movq mm0, MMWORD [wk(11)] ; mm0=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36) + movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37) + + movq mm7, mm6 + movq mm2, mm1 + paddd mm6, mm3 ; mm6=data2L + paddd mm1, mm0 ; mm1=data2H + psubd mm7, mm3 ; mm7=data5L + psubd mm2, mm0 ; mm2=data5H + + movq mm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2] + + paddd mm6, mm5 + paddd mm1, mm5 + psrad mm6, DESCALE_P2 + psrad mm1, DESCALE_P2 + paddd mm7, mm5 + paddd mm2, mm5 + psrad mm7, DESCALE_P2 + psrad mm2, DESCALE_P2 + + packssdw mm6, mm1 ; mm6=data2=(02 12 22 32) + packssdw mm7, mm2 ; mm7=data5=(05 15 25 35) + + movq mm4, MMWORD [wk(2)] ; mm4=tmp13L + movq mm3, MMWORD [wk(3)] ; mm3=tmp13H + movq mm0, MMWORD [wk(8)] ; mm0=tmp0L + movq mm5, MMWORD [wk(9)] ; mm5=tmp0H + + movq mm1, mm4 + movq mm2, mm3 + paddd mm4, mm0 ; mm4=data3L + paddd mm3, mm5 ; mm3=data3H + psubd mm1, mm0 ; mm1=data4L + psubd mm2, mm5 ; mm2=data4H + + movq mm0, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2] + + paddd mm4, mm0 + paddd mm3, mm0 + psrad mm4, DESCALE_P2 + psrad mm3, DESCALE_P2 + paddd mm1, mm0 + paddd mm2, mm0 + psrad mm1, DESCALE_P2 + psrad mm2, DESCALE_P2 + + movq mm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP] + + packssdw mm4, mm3 ; mm4=data3=(03 13 23 33) + packssdw mm1, mm2 ; mm1=data4=(04 14 24 34) + + movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36) + movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37) + + packsswb mm6, mm1 ; mm6=(02 12 22 32 04 14 24 34) + packsswb mm4, mm7 ; mm4=(03 13 23 33 05 15 25 35) + + 
paddb mm0, mm5 + paddb mm3, mm5 + paddb mm6, mm5 + paddb mm4, mm5 + + movq mm2, mm0 ; transpose coefficients(phase 1) + punpcklbw mm0, mm3 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm2, mm3 ; mm2=(06 07 16 17 26 27 36 37) + movq mm1, mm6 ; transpose coefficients(phase 1) + punpcklbw mm6, mm4 ; mm6=(02 03 12 13 22 23 32 33) + punpckhbw mm1, mm4 ; mm1=(04 05 14 15 24 25 34 35) + + movq mm7, mm0 ; transpose coefficients(phase 2) + punpcklwd mm0, mm6 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm7, mm6 ; mm7=(20 21 22 23 30 31 32 33) + movq mm5, mm1 ; transpose coefficients(phase 2) + punpcklwd mm1, mm2 ; mm1=(04 05 06 07 14 15 16 17) + punpckhwd mm5, mm2 ; mm5=(24 25 26 27 34 35 36 37) + + movq mm3, mm0 ; transpose coefficients(phase 3) + punpckldq mm0, mm1 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm3, mm1 ; mm3=(10 11 12 13 14 15 16 17) + movq mm4, mm7 ; transpose coefficients(phase 3) + punpckldq mm7, mm5 ; mm7=(20 21 22 23 24 25 26 27) + punpckhdq mm4, mm5 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm
new file mode 100644
index 0000000000..43e320189b
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm
@@ -0,0 +1,858 @@
+;
+; jidctint.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13                  ; fractional bits of the F_* fixed-point constants below
+%define PASS1_BITS  2                   ; scaling bits still present in pass-1 results (see DESCALE_P1)
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)       ; psrad count used in the pass-1 output stage
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)   ; psrad count used in the pass-2 output stage
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336) = round(0.298631336 * 2^13); same scaling for every F_* in this branch
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))   ; right shift by n with round-to-nearest; operands below are the constants scaled by 2^30
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):           ; start of the table; entries are addressed via GOTOFF(ebx, <label>)
+
+PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541    ; pmaddwd on (z2,z3) pairs -> tmp3 = z2*(0.541+0.765) + z3*0.541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)     ; pmaddwd on (z2,z3) pairs -> tmp2 = z2*0.541 + z3*(0.541-1.847)
+PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175    ; pmaddwd on (z3,z4) pairs -> z3 = z3*(1.175-1.961) + z4*1.175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)     ; pmaddwd on (z3,z4) pairs -> z4 = z3*1.175 + z4*(1.175-0.390)
+PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899    ; pmaddwd on (tmp0,tmp3) pairs -> tmp0 = tmp0*(0.298-0.899) + tmp3*-0.899
+PW_MF089_F060  times 4 dw  -F_0_899, (F_1_501 - F_0_899)    ; pmaddwd on (tmp0,tmp3) pairs -> tmp3 = tmp0*-0.899 + tmp3*(1.501-0.899)
+PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562    ; pmaddwd on (tmp1,tmp2) pairs -> tmp1 = tmp1*(2.053-2.562) + tmp2*-2.562
+PW_MF256_F050  times 4 dw  -F_2_562, (F_3_072 - F_2_562)    ; pmaddwd on (tmp1,tmp2) pairs -> tmp2 = tmp1*-2.562 + tmp2*(3.072-2.562)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)            ; rounding bias added (paddd) before psrad by DESCALE_P1
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)            ; rounding bias added (paddd) before psrad by DESCALE_P2
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE                    ; CENTERJSAMPLE in each of 16 bytes (value defined outside this file)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; +; GLOBAL(void) +; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 12 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_sse2) + +EXTN(jsimd_idct_islow_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5, PASS1_BITS + + movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 
02 02 03 03) + punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm4, xmm3 ; xmm3=in6=z3 + punpckhwd xmm5, xmm3 + movdqa xmm1, xmm4 + movdqa xmm3, xmm5 + pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L + pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H + + 
movdqa xmm6, xmm0 + paddw xmm0, xmm2 ; xmm0=in0+in4 + psubw xmm6, xmm2 ; xmm6=in0-in4 + + pxor xmm7, xmm7 + pxor xmm2, xmm2 + punpcklwd xmm7, xmm0 ; xmm7=tmp0L + punpckhwd xmm2, xmm0 ; xmm2=tmp0H + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0, xmm7 + paddd xmm7, xmm4 ; xmm7=tmp10L + psubd xmm0, xmm4 ; xmm0=tmp13L + movdqa xmm4, xmm2 + paddd xmm2, xmm5 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm7, xmm7 + punpcklwd xmm5, xmm6 ; xmm5=tmp1L + punpckhwd xmm7, xmm6 ; xmm7=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm2, xmm1 ; xmm2=tmp12L + movdqa xmm0, xmm7 + paddd xmm7, xmm3 ; xmm7=tmp11H + psubd xmm0, xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5, xmm6 + movdqa xmm7, xmm4 + paddw xmm5, xmm3 ; xmm5=z3 + paddw xmm7, xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = 
z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2, xmm5 + movdqa xmm0, xmm5 + punpcklwd xmm2, xmm7 + punpckhwd xmm0, xmm7 + movdqa xmm5, xmm2 + movdqa xmm7, xmm0 + pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L + pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H + pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2, xmm3 + movdqa xmm0, xmm3 + punpcklwd xmm2, xmm4 + punpckhwd xmm0, xmm4 + movdqa xmm3, xmm2 + movdqa xmm4, xmm0 + pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L + pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H + pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L + pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3, xmm5 ; xmm3=tmp3L + paddd xmm4, xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2, xmm1 + movdqa xmm0, xmm1 + punpcklwd xmm2, xmm6 + punpckhwd xmm0, xmm6 + movdqa 
xmm1, xmm2 + movdqa xmm6, xmm0 + pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L + pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H + pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L + pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm2, xmm5 ; xmm2=tmp1L + paddd xmm0, xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2, xmm5 + movdqa xmm0, xmm7 + paddd xmm5, xmm3 ; xmm5=data0L + paddd xmm7, xmm4 ; xmm7=data0H + psubd xmm2, xmm3 ; xmm2=data7L + psubd xmm0, xmm4 ; xmm0=data7H + + movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] + + paddd xmm5, xmm3 + paddd xmm7, xmm3 + psrad xmm5, DESCALE_P1 + psrad xmm7, DESCALE_P1 + paddd xmm2, xmm3 + paddd xmm0, xmm3 + psrad xmm2, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7, xmm4 + movdqa xmm0, xmm3 + paddd xmm4, xmm1 ; xmm4=data1L + paddd xmm3, xmm6 ; xmm3=data1H + psubd xmm7, xmm1 ; xmm7=data6L + psubd xmm0, xmm6 ; xmm0=data6H + + movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] + + paddd xmm4, xmm1 + paddd xmm3, xmm1 + psrad xmm4, DESCALE_P1 + psrad xmm3, DESCALE_P1 + paddd xmm7, xmm1 + paddd xmm0, xmm1 + psrad xmm7, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1, xmm7 
; transpose coefficients(phase 1) + punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5, xmm3 + movdqa xmm6, xmm0 + paddd xmm3, xmm4 ; xmm3=data2L + paddd xmm0, xmm2 ; xmm0=data2H + psubd xmm5, xmm4 ; xmm5=data5L + psubd xmm6, xmm2 ; xmm6=data5H + + movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] + + paddd xmm3, xmm7 + paddd xmm0, xmm7 + psrad xmm3, DESCALE_P1 + psrad xmm0, DESCALE_P1 + paddd xmm5, xmm7 + paddd xmm6, xmm7 + psrad xmm5, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0, xmm1 + movdqa xmm6, xmm4 + paddd xmm1, xmm2 ; xmm1=data3L + paddd xmm4, xmm7 ; xmm4=data3H + psubd xmm0, xmm2 ; xmm0=data4L + psubd xmm6, xmm7 ; xmm6=data4H + + movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] + + paddd xmm1, xmm2 + paddd xmm4, xmm2 + psrad xmm1, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm0, xmm2 + paddd xmm6, xmm2 + psrad xmm0, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa 
xmm4, xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 
45 55 65 75) + movdqa xmm4, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm6, xmm2 ; xmm2=in6=z3 + punpckhwd xmm5, xmm2 + movdqa xmm1, xmm6 + movdqa xmm2, xmm5 + pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L + pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H + + movdqa xmm3, xmm7 + paddw xmm7, xmm0 ; xmm7=in0+in4 + psubw xmm3, xmm0 ; xmm3=in0-in4 + + pxor xmm4, xmm4 + pxor xmm0, xmm0 + punpcklwd xmm4, xmm7 ; xmm4=tmp0L + punpckhwd xmm0, xmm7 ; xmm0=tmp0H + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7, xmm4 + paddd xmm4, xmm6 ; xmm4=tmp10L + psubd xmm7, xmm6 ; xmm7=tmp13L + movdqa xmm6, xmm0 + paddd xmm0, xmm5 ; xmm0=tmp10H + psubd xmm6, xmm5 ; xmm6=tmp13H + + movdqa 
XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm4, xmm4 + punpcklwd xmm5, xmm3 ; xmm5=tmp1L + punpckhwd xmm4, xmm3 ; xmm4=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm0, xmm1 ; xmm0=tmp12L + movdqa xmm7, xmm4 + paddd xmm4, xmm2 ; xmm4=tmp11H + psubd xmm7, xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5, xmm6 + movdqa xmm4, xmm3 + paddw xmm5, xmm1 ; xmm5=z3 + paddw xmm4, xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0, xmm5 + movdqa xmm7, xmm5 + punpcklwd xmm0, xmm4 + punpckhwd xmm7, xmm4 + movdqa xmm5, xmm0 + movdqa xmm4, xmm7 + pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H + pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; 
z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0, xmm1 + movdqa xmm7, xmm1 + punpcklwd xmm0, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm1, xmm0 + movdqa xmm3, xmm7 + pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H + pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L + pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1, xmm5 ; xmm1=tmp3L + paddd xmm3, xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm0, xmm6 + punpckhwd xmm7, xmm6 + movdqa xmm2, xmm0 + movdqa xmm6, xmm7 + pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H + pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L + pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm0, xmm5 ; xmm0=tmp1L + paddd xmm7, xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0, xmm5 + movdqa xmm7, xmm4 + paddd xmm5, xmm1 ; xmm5=data0L + paddd xmm4, xmm3 ; xmm4=data0H + psubd xmm0, xmm1 ; xmm0=data7L + psubd xmm7, xmm3 ; xmm7=data7H + + movdqa xmm1, 
[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] + + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrad xmm5, DESCALE_P2 + psrad xmm4, DESCALE_P2 + paddd xmm0, xmm1 + paddd xmm7, xmm1 + psrad xmm0, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4, xmm3 + movdqa xmm7, xmm1 + paddd xmm3, xmm2 ; xmm3=data1L + paddd xmm1, xmm6 ; xmm1=data1H + psubd xmm4, xmm2 ; xmm4=data6L + psubd xmm7, xmm6 ; xmm7=data6H + + movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] + + paddd xmm3, xmm2 + paddd xmm1, xmm2 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm4, xmm2 + paddd xmm7, xmm2 + psrad xmm4, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4, xmm6 + movdqa xmm0, xmm2 + paddd xmm6, xmm1 ; xmm6=data2L + paddd xmm2, xmm7 ; xmm2=data2H + psubd xmm4, xmm1 ; xmm4=data5L + psubd xmm0, xmm7 ; xmm0=data5H + + movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] + + paddd xmm6, xmm5 + paddd xmm2, xmm5 + psrad xmm6, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm4, xmm5 + paddd xmm0, xmm5 + psrad xmm4, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + 
packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2, xmm3 + movdqa xmm0, xmm1 + paddd xmm3, xmm7 ; xmm3=data3L + paddd xmm1, xmm5 ; xmm1=data3H + psubd xmm2, xmm7 ; xmm2=data4L + psubd xmm0, xmm5 ; xmm0=data4H + + movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] + + paddd xmm3, xmm7 + paddd xmm1, xmm7 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm2, xmm7 + paddd xmm0, xmm7 + psrad xmm2, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] + + packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7, xmm5 + paddb xmm1, xmm5 + paddb xmm6, xmm5 + paddb xmm3, xmm5 + + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2, xmm0 ; 
xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm new file mode 100644 index 0000000000..e2307e1cb6 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm @@ -0,0 +1,704 @@ +; +; jidctred.asm - reduced-size IDCT (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. 
+ +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1) +%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1) +%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2) +%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_red_mmx) + +EXTN(jconst_idct_red_mmx): + +PW_F184_MF076 times 2 dw F_1_847, -F_0_765 +PW_F256_F089 times 2 dw F_2_562, F_0_899 +PW_F106_MF217 times 2 dw F_1_061, -F_2_172 +PW_MF060_MF050 times 2 dw -F_0_601, -F_0_509 +PW_F145_MF021 times 2 dw F_1_451, -F_0_211 +PW_F362_MF127 times 2 dw F_3_624, -F_1_272 +PW_F085_MF072 times 2 dw F_0_850, -F_0_720 +PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4 - 1) +PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4 - 1) +PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2 - 1) +PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2 - 1) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 4x4 
output block. +; +; GLOBAL(void) +; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_4x4_mmx) + +EXTN(jsimd_idct_4x4_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm0, mm1 + packsswb mm0, mm0 + movd eax, mm0 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0, PASS1_BITS + + movq mm2, mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + punpckhwd mm2, mm2 ; mm2=(02 02 03 03) + + movq mm1, mm0 + punpckldq mm0, mm0 ; mm0=(00 00 00 00) + punpckhdq mm1, mm1 ; mm1=(01 01 01 01) + movq mm3, mm2 + punpckldq mm2, mm2 ; mm2=(02 02 02 02) + punpckhdq mm3, mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] 
+ + movq mm4, mm0 + movq mm5, mm0 + punpcklwd mm4, mm1 + punpckhwd mm5, mm1 + movq mm0, mm4 + movq mm1, mm5 + pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6, mm2 + movq mm7, mm2 + punpcklwd mm6, mm3 + punpckhwd mm7, mm3 + movq mm2, mm6 + movq mm3, mm7 + pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6, mm4 ; mm6=tmp2L + paddd mm7, mm5 ; mm7=tmp2H + paddd mm2, mm0 ; mm2=tmp0L + paddd mm3, mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklwd mm1, mm4 ; mm1=tmp0L + punpckhwd mm2, mm4 ; mm2=tmp0H + psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3, mm5 ; mm5=in2=z2 + punpcklwd mm5, mm0 ; mm0=in6=z3 + punpckhwd mm3, mm0 + pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4, mm1 + movq mm0, mm2 + paddd mm1, mm5 ; mm1=tmp10L + paddd mm2, mm3 ; mm2=tmp10H + psubd mm4, mm5 ; mm4=tmp12L + psubd mm0, mm3 ; mm0=tmp12H + + ; -- Final output stage + + movq mm5, mm1 + movq mm3, mm2 + paddd mm1, mm6 ; mm1=data0L + paddd mm2, mm7 ; mm2=data0H + psubd mm5, mm6 ; mm5=data3L + psubd mm3, mm7 ; 
mm3=data3H + + movq mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4] + + paddd mm1, mm6 + paddd mm2, mm6 + psrad mm1, DESCALE_P1_4 + psrad mm2, DESCALE_P1_4 + paddd mm5, mm6 + paddd mm3, mm6 + psrad mm5, DESCALE_P1_4 + psrad mm3, DESCALE_P1_4 + + packssdw mm1, mm2 ; mm1=data0=(00 01 02 03) + packssdw mm5, mm3 ; mm5=data3=(30 31 32 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2, mm4 + movq mm3, mm0 + paddd mm4, mm7 ; mm4=data1L + paddd mm0, mm6 ; mm0=data1H + psubd mm2, mm7 ; mm2=data2L + psubd mm3, mm6 ; mm3=data2H + + movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4] + + paddd mm4, mm7 + paddd mm0, mm7 + psrad mm4, DESCALE_P1_4 + psrad mm0, DESCALE_P1_4 + paddd mm2, mm7 + paddd mm3, mm7 + psrad mm2, DESCALE_P1_4 + psrad mm3, DESCALE_P1_4 + + packssdw mm4, mm0 ; mm4=data1=(10 11 12 13) + packssdw mm2, mm3 ; mm2=data2=(20 21 22 23) + + movq mm6, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm4 ; mm1=(00 10 01 11) + punpckhwd mm6, mm4 ; mm6=(02 12 03 13) + movq mm7, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm5 ; mm2=(20 30 21 31) + punpckhwd mm7, mm5 ; mm7=(22 32 23 33) + + movq mm0, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm2 ; mm1=(00 10 20 30) + punpckhdq mm0, mm2 ; mm0=(01 11 21 31) + movq mm3, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm7 ; mm6=(02 12 22 32) + punpckhdq mm3, mm7 ; mm3=(03 13 23 33) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4, mm0 + movq mm5, mm0 + punpcklwd mm4, mm1 + punpckhwd mm5, mm1 + movq mm0, mm4 + movq mm1, mm5 + pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6, mm2 + movq mm7, mm2 + punpcklwd mm6, mm3 + punpckhwd mm7, mm3 + movq mm2, mm6 + movq mm3, mm7 + pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6, mm4 ; mm6=tmp2L + paddd mm7, mm5 ; mm7=tmp2H + paddd mm2, mm0 ; mm2=tmp0L + paddd mm3, mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklwd mm1, mm4 ; mm1=tmp0L + punpckhwd mm2, mm4 ; mm2=tmp0H + psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3, mm5 ; mm5=in2=z2 + punpcklwd mm5, mm0 ; mm0=in6=z3 + punpckhwd mm3, mm0 + pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4, mm1 + movq mm0, mm2 + paddd mm1, mm5 ; mm1=tmp10L + paddd mm2, mm3 ; mm2=tmp10H + psubd mm4, mm5 ; mm4=tmp12L + 
psubd mm0, mm3 ; mm0=tmp12H + + ; -- Final output stage + + movq mm5, mm1 + movq mm3, mm2 + paddd mm1, mm6 ; mm1=data0L + paddd mm2, mm7 ; mm2=data0H + psubd mm5, mm6 ; mm5=data3L + psubd mm3, mm7 ; mm3=data3H + + movq mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4] + + paddd mm1, mm6 + paddd mm2, mm6 + psrad mm1, DESCALE_P2_4 + psrad mm2, DESCALE_P2_4 + paddd mm5, mm6 + paddd mm3, mm6 + psrad mm5, DESCALE_P2_4 + psrad mm3, DESCALE_P2_4 + + packssdw mm1, mm2 ; mm1=data0=(00 10 20 30) + packssdw mm5, mm3 ; mm5=data3=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2, mm4 + movq mm3, mm0 + paddd mm4, mm7 ; mm4=data1L + paddd mm0, mm6 ; mm0=data1H + psubd mm2, mm7 ; mm2=data2L + psubd mm3, mm6 ; mm3=data2H + + movq mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4] + + paddd mm4, mm7 + paddd mm0, mm7 + psrad mm4, DESCALE_P2_4 + psrad mm0, DESCALE_P2_4 + paddd mm2, mm7 + paddd mm3, mm7 + psrad mm2, DESCALE_P2_4 + psrad mm3, DESCALE_P2_4 + + packssdw mm4, mm0 ; mm4=data1=(01 11 21 31) + packssdw mm2, mm3 ; mm2=data2=(02 12 22 32) + + movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm1, mm2 ; mm1=(00 10 20 30 02 12 22 32) + packsswb mm4, mm5 ; mm4=(01 11 21 31 03 13 23 33) + paddb mm1, mm6 + paddb mm4, mm6 + + movq mm7, mm1 ; transpose coefficients(phase 1) + punpcklbw mm1, mm4 ; mm1=(00 01 10 11 20 21 30 31) + punpckhbw mm7, mm4 ; mm7=(02 03 12 13 22 23 32 33) + + movq mm0, mm1 ; transpose coefficients(phase 2) + punpcklwd mm1, mm7 ; mm1=(00 01 02 03 10 11 12 13) + punpckhwd mm0, mm7 ; mm0=(20 21 22 23 30 31 32 33) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 + movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 + + psrlq mm1, 4*BYTE_BIT + psrlq mm0, 4*BYTE_BIT + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 + 
movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + + align 32 + GLOBAL_FUNCTION(jsimd_idct_2x2_mmx) + +EXTN(jsimd_idct_2x2_mmx): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
+ + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm0=(10 11 ** 13), mm1=(30 31 ** 33) + ; mm2=(50 51 ** 53), mm3=(70 71 ** 73) + + pcmpeqd mm7, mm7 + pslld mm7, WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF} + + movq mm4, mm0 ; mm4=(10 11 ** 13) + movq mm5, mm2 ; mm5=(50 51 ** 53) + punpcklwd mm4, mm1 ; mm4=(10 30 11 31) + punpcklwd mm5, mm3 ; mm5=(50 70 51 71) + pmaddwd mm4, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)] + + psrld mm0, WORD_BIT ; mm0=(11 -- 13 --) + pand mm1, mm7 ; mm1=(-- 31 -- 33) + psrld mm2, WORD_BIT ; mm2=(51 -- 53 --) + pand mm3, mm7 ; mm3=(-- 71 -- 73) + por mm0, mm1 ; mm0=(11 31 13 33) + por mm2, mm3 ; mm2=(51 71 53 73) + pmaddwd mm0, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm2, [GOTOFF(ebx,PW_F085_MF072)] + + paddd mm4, mm5 ; mm4=tmp0[col0 col1] + + movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)] + pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)] + 
pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm6=(** 15 ** 17), mm1=(** 35 ** 37) + ; mm3=(** 55 ** 57), mm5=(** 75 ** 77) + + psrld mm6, WORD_BIT ; mm6=(15 -- 17 --) + pand mm1, mm7 ; mm1=(-- 35 -- 37) + psrld mm3, WORD_BIT ; mm3=(55 -- 57 --) + pand mm5, mm7 ; mm5=(-- 75 -- 77) + por mm6, mm1 ; mm6=(15 35 17 37) + por mm3, mm5 ; mm3=(55 75 57 77) + pmaddwd mm6, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm3, [GOTOFF(ebx,PW_F085_MF072)] + + paddd mm0, mm2 ; mm0=tmp0[col1 col3] + paddd mm6, mm3 ; mm6=tmp0[col5 col7] + + ; -- Even part + + movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm1=(00 01 ** 03), mm5=(** 05 ** 07) + + movq mm2, mm1 ; mm2=(00 01 ** 03) + pslld mm1, WORD_BIT ; mm1=(-- 00 -- **) + psrad mm1, (WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****] + + pand mm2, mm7 ; mm2=(-- 01 -- 03) + pand mm5, mm7 ; mm5=(-- 05 -- 07) + psrad mm2, (WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3] + psrad mm5, (WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7] + + ; -- Final output stage + + movq mm3, mm1 + paddd mm1, mm4 ; mm1=data0[col0 ****]=(A0 **) + psubd mm3, mm4 ; mm3=data1[col0 ****]=(B0 **) + punpckldq mm1, mm3 ; mm1=(A0 B0) + + movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2] + + movq mm4, mm2 + movq mm3, mm5 + paddd mm2, mm0 ; mm2=data0[col1 col3]=(A1 A3) + paddd mm5, mm6 ; mm5=data0[col5 col7]=(A5 A7) + psubd mm4, mm0 ; mm4=data1[col1 col3]=(B1 B3) + psubd mm3, mm6 ; mm3=data1[col5 col7]=(B5 B7) + + paddd mm1, mm7 + psrad mm1, DESCALE_P1_2 + + paddd mm2, mm7 + paddd mm5, mm7 + psrad mm2, DESCALE_P1_2 + psrad mm5, DESCALE_P1_2 + paddd mm4, mm7 + paddd mm3, mm7 + psrad mm4, DESCALE_P1_2 + psrad mm3, DESCALE_P1_2 + + ; ---- Pass 2: process rows, store into output array. 
+ + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw mm2, mm4 ; mm2=(A1 A3 B1 B3) + packssdw mm5, mm3 ; mm5=(A5 A7 B5 B7) + pmaddwd mm2, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)] + + paddd mm2, mm5 ; mm2=tmp0[row0 row1] + + ; -- Even part + + pslld mm1, (CONST_BITS+2) ; mm1=tmp10[row0 row1] + + ; -- Final output stage + + movq mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2] + + movq mm6, mm1 + paddd mm1, mm2 ; mm1=data0[row0 row1]=(C0 C1) + psubd mm6, mm2 ; mm6=data1[row0 row1]=(D0 D1) + + paddd mm1, mm0 + paddd mm6, mm0 + psrad mm1, DESCALE_P2_2 + psrad mm6, DESCALE_P2_2 + + movq mm7, mm1 ; transpose coefficients + punpckldq mm1, mm6 ; mm1=(C0 D0) + punpckhdq mm7, mm6 ; mm7=(C1 D1) + + packssdw mm1, mm7 ; mm1=(C0 D0 C1 D1) + packsswb mm1, mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1) + paddb mm1, [GOTOFF(ebx,PB_CENTERJSAMP)] + + movd ecx, mm1 + movd ebx, mm1 ; ebx=(C0 D0 C1 D1) + shr ecx, 2*BYTE_BIT ; ecx=(C1 D1 -- --) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov word [edx+eax*SIZEOF_JSAMPLE], bx + mov word [esi+eax*SIZEOF_JSAMPLE], cx + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm new file mode 100644 index 0000000000..6e56494e97 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm @@ -0,0 +1,592 @@ +; +; jidctred.asm - reduced-size IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. 
+; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1) +%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1) +%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2) +%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_red_sse2) + +EXTN(jconst_idct_red_sse2): + +PW_F184_MF076 times 4 dw F_1_847, -F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061, -F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509 +PW_F145_MF021 times 4 dw F_1_451, -F_0_211 +PW_F362_MF127 times 4 dw F_3_624, -F_1_272 +PW_F085_MF072 times 4 dw F_0_850, -F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 
4x4 output block. +; +; GLOBAL(void) +; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_4x4_sse2) + +EXTN(jsimd_idct_4x4_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm0, xmm1 + packsswb xmm0, xmm0 + packsswb xmm0, xmm0 + movd eax, xmm0 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0, PASS1_BITS + + movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 
06 06 07 07) + + pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end + alignx 16, 7 +%endif +.columnDCT: + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) + pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) + pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) + pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) + + movdqa xmm6, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm6, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) + pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) + pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) + pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) + + paddd xmm6, xmm4 ; xmm6=tmp2L + paddd xmm7, xmm5 ; xmm7=tmp2H + paddd xmm2, xmm0 ; xmm2=tmp0L + paddd xmm3, xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD 
[XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1, xmm1 + pxor xmm2, xmm2 + punpcklwd xmm1, xmm4 ; xmm1=tmp0L + punpckhwd xmm2, xmm4 ; xmm2=tmp0H + psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3, xmm5 ; xmm5=in2=z2 + punpcklwd xmm5, xmm0 ; xmm0=in6=z3 + punpckhwd xmm3, xmm0 + pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L + pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H + + movdqa xmm4, xmm1 + movdqa xmm0, xmm2 + paddd xmm1, xmm5 ; xmm1=tmp10L + paddd xmm2, xmm3 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp12L + psubd xmm0, xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5, xmm1 + movdqa xmm3, xmm2 + paddd xmm1, xmm6 ; xmm1=data0L + paddd xmm2, xmm7 ; xmm2=data0H + psubd xmm5, xmm6 ; xmm5=data3L + psubd xmm3, xmm7 ; xmm3=data3H + + movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] + + paddd xmm1, xmm6 + paddd xmm2, xmm6 + psrad xmm1, DESCALE_P1_4 + psrad xmm2, DESCALE_P1_4 + paddd xmm5, xmm6 + paddd xmm3, xmm6 + psrad xmm5, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2, xmm4 + movdqa xmm3, xmm0 + paddd xmm4, xmm7 ; xmm4=data1L + paddd xmm0, xmm6 ; xmm0=data1H + psubd xmm2, xmm7 ; xmm2=data2L + psubd xmm3, xmm6 ; xmm3=data2H + + movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] + + paddd xmm4, xmm7 + paddd xmm0, xmm7 + psrad xmm4, DESCALE_P1_4 + psrad xmm0, DESCALE_P1_4 + paddd xmm2, xmm7 + paddd xmm3, xmm7 + psrad xmm2, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm4, 
xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. 
+ + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + pxor xmm4, xmm4 + punpcklwd xmm4, xmm1 ; xmm4=tmp0 + psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + + ; -- Odd part + + punpckhwd xmm1, xmm0 + punpckhwd xmm6, xmm3 + movdqa xmm5, xmm1 + movdqa xmm2, xmm6 + pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) + pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) + pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) + pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) + + paddd xmm6, xmm1 ; xmm6=tmp2 + paddd xmm2, xmm5 ; xmm2=tmp0 + + ; -- Even part + + punpcklwd xmm0, xmm3 + pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 + + movdqa xmm7, xmm4 + paddd xmm4, xmm0 ; xmm4=tmp10 + psubd xmm7, xmm0 ; xmm7=tmp12 + + ; -- Final output stage + + movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] + + movdqa xmm5, xmm4 + movdqa xmm3, xmm7 + paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32) + + paddd xmm4, xmm1 + paddd xmm7, xmm1 + psrad xmm4, DESCALE_P2_4 + psrad xmm7, DESCALE_P2_4 + paddd xmm5, xmm1 + paddd xmm3, xmm1 + psrad xmm5, DESCALE_P2_4 + psrad xmm3, DESCALE_P2_4 + + packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + + movdqa xmm0, xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + + movdqa xmm6, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + + packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)] + + pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) 
+ pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + + align 32 + GLOBAL_FUNCTION(jsimd_idct_2x2_sse2) + +EXTN(jsimd_idct_2x2_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
+ + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + + pcmpeqd xmm7, xmm7 + pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + + movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)] + + psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)] + + paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + + ; -- Even part + + 
movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm6=(00 01 ** 03 ** 05 ** 07) + + movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + + ; -- Final output stage + + movdqa xmm3, xmm6 + movdqa xmm5, xmm1 + paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + + movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] + + punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **) + + movdqa xmm7, xmm1 + punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7) + + paddd xmm6, xmm2 + psrad xmm6, DESCALE_P1_2 + + paddd xmm1, xmm2 + paddd xmm7, xmm2 + psrad xmm1, DESCALE_P1_2 + psrad xmm7, DESCALE_P1_2 + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. 
+ + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)] + + paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + + ; -- Even part + + pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + + ; -- Final output stage + + movdqa xmm4, xmm6 + paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + + punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1) + + paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)] + psrad xmm6, DESCALE_P2_2 + + packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)] + + pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --) + pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov word [edx+eax*SIZEOF_JSAMPLE], bx + mov word [esi+eax*SIZEOF_JSAMPLE], cx + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm new file mode 100644 index 0000000000..5cb60caa94 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm @@ -0,0 +1,230 @@ +; +; jquant.asm - sample data conversion and quantization (3DNow! & MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. 
+; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow) + +EXTN(jsimd_convsamp_float_3dnow): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7, mm7 + psllw mm7, 7 + packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+ + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0, mm7 ; mm0=(01234567) + psubb mm1, mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2, mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0, mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3, mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1, mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4, mm2 ; mm4=(***0***1) + punpckhwd mm2, mm2 ; mm2=(***2***3) + punpcklwd mm5, mm0 ; mm5=(***4***5) + punpckhwd mm0, mm0 ; mm0=(***6***7) + + psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23) + pi2fd mm4, mm4 + pi2fd mm2, mm2 + psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67) + pi2fd mm5, mm5 + pi2fd mm0, mm0 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + + punpcklwd mm6, mm3 ; mm6=(***8***9) + punpckhwd mm3, mm3 ; mm3=(***A***B) + punpcklwd mm4, mm1 ; mm4=(***C***D) + punpckhwd mm1, mm1 ; mm1=(***E***F) + + psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB) + pi2fd mm6, mm6 + pi2fd mm3, mm3 + psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF) + pi2fd mm4, mm4 + pi2fd mm1, mm1 + + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .convloop + + femms ; empty
MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; FAST_FLOAT *divisors +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_float_3dnow) + +EXTN(jsimd_quantize_float_3dnow): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic) + movd mm7, eax + punpckldq mm7, mm7 ; mm7={12582912.0F 12582912.0F} + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16, 7 +.quantloop: + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm0, mm7 ; mm0=(00 ** 01 **) + pfadd mm1, mm7 ; mm1=(02 ** 03 **) + pfadd mm2, mm7 ; mm2=(04 ** 05 **) + pfadd mm3, mm7 ; mm3=(06 ** 07 **) + + movq mm4, mm0 + punpcklwd mm0, mm1 ; mm0=(00 02 ** **) + punpckhwd mm4, mm1 ; mm4=(01 03 ** **) + movq mm5, mm2 + punpcklwd mm2, mm3 ; mm2=(04 06 ** **) + punpckhwd mm5, mm3 ; mm5=(05 07 ** **) + + punpcklwd mm0, mm4 ; mm0=(00 01 02 03) + punpcklwd mm2, mm5 ; mm2=(04 05 06 07) + + movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] +
movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)] + movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm6, mm7 ; mm6=(10 ** 11 **) + pfadd mm1, mm7 ; mm1=(12 ** 13 **) + pfadd mm3, mm7 ; mm3=(14 ** 15 **) + pfadd mm4, mm7 ; mm4=(16 ** 17 **) + + movq mm5, mm6 + punpcklwd mm6, mm1 ; mm6=(10 12 ** **) + punpckhwd mm5, mm1 ; mm5=(11 13 ** **) + movq mm1, mm3 + punpcklwd mm3, mm4 ; mm3=(14 16 ** **) + punpckhwd mm1, mm4 ; mm1=(15 17 ** **) + + punpcklwd mm6, mm5 ; mm6=(10 11 12 13) + punpcklwd mm3, mm1 ; mm3=(14 15 16 17) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm new file mode 100644 index 0000000000..61305c625d --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm @@ -0,0 +1,276 @@ +; +; jquant.asm - sample data conversion and quantization (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_mmx) + +EXTN(jsimd_convsamp_mmx): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor mm6, mm6 ; mm6=(all 0's) + pcmpeqw mm7, mm7 + psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567) + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN) + movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV) + + movq mm4, mm0 + punpcklbw mm0, mm6 ; mm0=(0123) + punpckhbw mm4, mm6 ; mm4=(4567) + movq mm5, mm1 + punpcklbw mm1, mm6 ; mm1=(89AB) + 
punpckhbw mm5, mm6 ; mm5=(CDEF) + + paddw mm0, mm7 + paddw mm4, mm7 + paddw mm1, mm7 + paddw mm5, mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5 + + movq mm0, mm2 + punpcklbw mm2, mm6 ; mm2=(GHIJ) + punpckhbw mm0, mm6 ; mm0=(KLMN) + movq mm4, mm3 + punpcklbw mm3, mm6 ; mm3=(OPQR) + punpckhbw mm4, mm6 ; mm4=(STUV) + + paddw mm2, mm7 + paddw mm0, mm7 + paddw mm3, mm7 + paddw mm4, mm7 + + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). 
+; +; GLOBAL(void) +; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) +%define SHIFT(m, n, b) \ + MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM) + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; DCTELEM *divisors +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_mmx) + +EXTN(jsimd_quantize_mmx): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov ah, 2 + alignx 16, 7 +.quantloop1: + mov al, DCTSIZE2/8/2 + alignx 16, 7 +.quantloop2: + movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] + + movq mm0, mm2 + movq mm1, mm3 + + psraw mm2, (WORD_BIT-1) ; -1 if value < 0, 0 otherwise + psraw mm3, (WORD_BIT-1) + + pxor mm0, mm2 ; val = (val < 0) ? -val : val + pxor mm1, mm3 + psubw mm0, mm2 + psubw mm1, mm3 + + ; + ; MMX is an annoyingly crappy instruction set. It has two + ; misfeatures that are causing problems here: + ; + ; - All multiplications are signed. + ; + ; - The second operand for the shifts is not treated as packed.
+ ; + ; + ; We work around the first problem by implementing this algorithm: + ; + ; unsigned long unsigned_multiply(unsigned short x, unsigned short y) + ; { + ; enum { SHORT_BIT = 16 }; + ; signed short sx = (signed short)x; + ; signed short sy = (signed short)y; + ; signed long sz; + ; + ; sz = (long)sx * (long)sy; /* signed multiply */ + ; + ; if (sx < 0) sz += (long)sy << SHORT_BIT; + ; if (sy < 0) sz += (long)sx << SHORT_BIT; + ; + ; return (unsigned long)sz; + ; } + ; + ; (note that a negative sx adds _sy_ and vice versa) + ; + ; For the second problem, we replace the shift by a multiplication. + ; Unfortunately that means we have to deal with the signed issue again. + ; + + paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw mm1, MMWORD [CORRECTION(0,1,edx)] + + movq mm4, mm0 ; store current value for later + movq mm5, mm1 + pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)] + paddw mm0, mm4 ; reciprocal is always negative (MSB=1), + paddw mm1, mm5 ; so we always need to add the initial value + ; (input value is never negative as we + ; inverted it at the start of this routine) + + ; here it gets a bit tricky as both scale + ; and mm0/mm1 can be negative + movq mm6, MMWORD [SCALE(0,0,edx)] ; scale + movq mm7, MMWORD [SCALE(0,1,edx)] + movq mm4, mm0 + movq mm5, mm1 + pmulhw mm0, mm6 + pmulhw mm1, mm7 + + psraw mm6, (WORD_BIT-1) ; determine if scale is negative + psraw mm7, (WORD_BIT-1) + + pand mm6, mm4 ; and add input if it is + pand mm7, mm5 + paddw mm0, mm6 + paddw mm1, mm7 + + psraw mm4, (WORD_BIT-1) ; then check if negative input + psraw mm5, (WORD_BIT-1) + + pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is + pand mm5, MMWORD [SCALE(0,1,edx)] + paddw mm0, mm4 + paddw mm1, mm5 + + pxor mm0, mm2 ; negate result if input was < 0 + pxor mm1, mm3 + psubw mm0, mm2 + psubw mm1, mm3 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1 + + add esi,
byte 8*SIZEOF_DCTELEM + add edx, byte 8*SIZEOF_DCTELEM + add edi, byte 8*SIZEOF_JCOEF + dec al + jnz near .quantloop2 + dec ah + jnz near .quantloop1 ; to avoid branch misprediction + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm new file mode 100644 index 0000000000..218adc976f --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm @@ -0,0 +1,208 @@ +; +; jquant.asm - sample data conversion and quantization (SSE & MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_float_sse) + +EXTN(jsimd_convsamp_float_sse): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7, mm7 + psllw mm7, 7 + packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0, mm7 ; mm0=(01234567) + psubb mm1, mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2, mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0, mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3, mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1, mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4, mm2 ; mm4=(***0***1) + punpckhwd mm2, mm2 ; mm2=(***2***3) + punpcklwd mm5, mm0 ; mm5=(***4***5) + punpckhwd mm0, mm0 ; mm0=(***6***7) + + psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23) + cvtpi2ps xmm0, mm4 ; xmm0=(01**) + cvtpi2ps xmm1, mm2 ; xmm1=(23**) + psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67) + cvtpi2ps xmm2, mm5 ; xmm2=(45**) +
cvtpi2ps xmm3, mm0 ; xmm3=(67**) + + punpcklwd mm6, mm3 ; mm6=(***8***9) + punpckhwd mm3, mm3 ; mm3=(***A***B) + punpcklwd mm4, mm1 ; mm4=(***C***D) + punpckhwd mm1, mm1 ; mm1=(***E***F) + + psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB) + cvtpi2ps xmm4, mm6 ; xmm4=(89**) + cvtpi2ps xmm5, mm3 ; xmm5=(AB**) + psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF) + cvtpi2ps xmm6, mm4 ; xmm6=(CD**) + cvtpi2ps xmm7, mm1 ; xmm7=(EF**) + + movlhps xmm0, xmm1 ; xmm0=(0123) + movlhps xmm2, xmm3 ; xmm2=(4567) + movlhps xmm4, xmm5 ; xmm4=(89AB) + movlhps xmm6, xmm7 ; xmm6=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; FAST_FLOAT *divisors +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_float_sse) + +EXTN(jsimd_quantize_float_sse): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16, 7 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + movhlps xmm4, xmm0 + movhlps xmm5, xmm1 + + cvtps2pi mm0, xmm0 + cvtps2pi mm1, xmm1 + cvtps2pi mm4, xmm4 + cvtps2pi mm5, xmm5 + + movhlps xmm6, xmm2 + movhlps xmm7, xmm3 + + cvtps2pi mm2, xmm2 + cvtps2pi mm3, xmm3 + cvtps2pi mm6, xmm6 + cvtps2pi mm7, xmm7 + + packssdw mm0, mm4 + packssdw mm1, mm5 + packssdw mm2, mm6 + packssdw mm3, mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm new file mode 100644 index 0000000000..a881ab50f9 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm @@ -0,0 +1,168 @@ +; +; jquantf.asm - sample data conversion and quantization (SSE & SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_float_sse2) + +EXTN(jsimd_convsamp_float_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 + packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) 
+ + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb xmm0, xmm7 ; xmm0=(01234567) + psubb xmm1, xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2, xmm2 ; xmm2=(0123) + cvtdq2ps xmm0, xmm0 ; xmm0=(4567) + psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3, xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; FAST_FLOAT *divisors +%define workspace ebp + 16 ; FAST_FLOAT 
*workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_float_sse2) + +EXTN(jsimd_quantize_float_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16, 7 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-avx2.asm new file mode 100644 index 0000000000..5ed6bec246 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-avx2.asm @@ -0,0 +1,188 @@ +; +; jquanti.asm - sample data conversion and quantization (AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, 2018, D. R. Commander. +; Copyright (C) 2016, Matthieu Darbois. 
+; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_avx2) + +EXTN(jsimd_convsamp_avx2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + mov ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + mov ebx, 
JSAMPROW [esi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + vinserti128 ymm0, ymm0, xmm1, 1 + vinserti128 ymm2, ymm2, xmm3, 1 + vinserti128 ymm4, ymm4, xmm5, 1 + vinserti128 ymm6, ymm6, xmm7, 1 + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + vpunpcklbw ymm0, ymm0, ymm1 + vpunpcklbw ymm2, ymm2, ymm1 + vpunpcklbw ymm4, ymm4, ymm1 + vpunpcklbw ymm6, ymm6, ymm1 + + vpcmpeqw ymm7, ymm7, ymm7 + vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm7 + vpaddw ymm4, ymm4, ymm7 + vpaddw ymm6, ymm6, ymm7 + + vmovdqu YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0 + vmovdqu YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2 + vmovdqu YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4 + vmovdqu YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6 + + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). 
+; +; GLOBAL(void) +; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; DCTELEM *divisors +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_avx2) + +EXTN(jsimd_quantize_avx2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + + vmovdqu ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)] + vpabsw ymm0, ymm4 + vpabsw ymm1, ymm5 + vpabsw ymm2, ymm6 + vpabsw ymm3, ymm7 + + vpaddw ymm0, YMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + vpaddw ymm1, YMMWORD [CORRECTION(2,0,edx)] + vpaddw ymm2, YMMWORD [CORRECTION(4,0,edx)] + vpaddw ymm3, YMMWORD [CORRECTION(6,0,edx)] + vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,edx)] + vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,edx)] + vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,edx)] + vpmulhuw ymm0, YMMWORD [SCALE(0,0,edx)] ; scale + vpmulhuw ymm1, YMMWORD [SCALE(2,0,edx)] + vpmulhuw ymm2, YMMWORD [SCALE(4,0,edx)] + vpmulhuw ymm3, YMMWORD [SCALE(6,0,edx)] + + vpsignw ymm0, ymm0, ymm4 + vpsignw ymm1, ymm1, ymm5 + vpsignw ymm2, ymm2, ymm6 + vpsignw ymm3, ymm3, ymm7 + + vmovdqu [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0 + vmovdqu [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1 + vmovdqu [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2 + vmovdqu 
[YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3 + + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm new file mode 100644 index 0000000000..0a509408aa --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm @@ -0,0 +1,201 @@ +; +; jquanti.asm - sample data conversion and quantization (SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_sse2) + +EXTN(jsimd_convsamp_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor xmm6, xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0, xmm6 ; xmm0=(01234567) + punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF) + paddw xmm0, xmm7 + paddw xmm1, xmm7 + punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2, xmm7 + paddw xmm3, xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa 
XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; DCTELEM *divisors +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_sse2) + +EXTN(jsimd_quantize_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/32 + alignx 16, 7 +.quantloop: + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + psraw xmm4, (WORD_BIT-1) + psraw xmm5, (WORD_BIT-1) + psraw xmm6, (WORD_BIT-1) + psraw xmm7, (WORD_BIT-1) + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, 
xmm7 + psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] + + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 + psubw xmm1, xmm5 + psubw xmm2, xmm6 + psubw xmm3, xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 32*SIZEOF_DCTELEM + add edx, byte 32*SIZEOF_DCTELEM + add edi, byte 32*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c new file mode 100644 index 0000000000..80bc821ff4 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c @@ -0,0 +1,1246 @@ +/* + * jsimd_i386.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. 
+ * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 32-bit x86 architecture. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "jconfigint.h" + +/* + * In the PIC cases, we have no guarantee that constants will keep + * their alignment. This macro allows us to verify it at runtime. + */ +#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0) + +#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ +#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */ + +static unsigned int simd_support = (unsigned int)(~0); +static unsigned int simd_huffman = 1; + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. 
+ */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char env[2] = { 0 }; +#endif + + if (simd_support != ~0U) + return; + + simd_support = jpeg_simd_cpu_support(); + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + if (!GETENV_S(env, 2, "JSIMD_FORCEMMX") && !strcmp(env, "1")) + simd_support &= JSIMD_MMX; + if (!GETENV_S(env, 2, "JSIMD_FORCE3DNOW") && !strcmp(env, "1")) + simd_support &= JSIMD_3DNOW | JSIMD_MMX; + if (!GETENV_S(env, 2, "JSIMD_FORCESSE") && !strcmp(env, "1")) + simd_support &= JSIMD_SSE | JSIMD_MMX; + if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1")) + simd_support &= JSIMD_SSE2; + if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1")) + simd_support &= JSIMD_AVX2; + if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1")) + simd_support = 0; + if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1")) + simd_huffman = 0; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) 
+jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_extrgb_ycc_convert_avx2; + sse2fct = jsimd_extrgb_ycc_convert_sse2; + mmxfct = jsimd_extrgb_ycc_convert_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_extrgbx_ycc_convert_avx2; + sse2fct = jsimd_extrgbx_ycc_convert_sse2; + mmxfct = jsimd_extrgbx_ycc_convert_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_extbgr_ycc_convert_avx2; + sse2fct = jsimd_extbgr_ycc_convert_sse2; + mmxfct = jsimd_extbgr_ycc_convert_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_extbgrx_ycc_convert_avx2; + sse2fct = jsimd_extbgrx_ycc_convert_sse2; + mmxfct = jsimd_extbgrx_ycc_convert_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_extxbgr_ycc_convert_avx2; + sse2fct = jsimd_extxbgr_ycc_convert_sse2; + mmxfct = jsimd_extxbgr_ycc_convert_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_extxrgb_ycc_convert_avx2; + sse2fct = jsimd_extxrgb_ycc_convert_sse2; + mmxfct = jsimd_extxrgb_ycc_convert_mmx; + 
break; + default: + avx2fct = jsimd_rgb_ycc_convert_avx2; + sse2fct = jsimd_rgb_ycc_convert_sse2; + mmxfct = jsimd_rgb_ycc_convert_mmx; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else + mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_extrgb_gray_convert_avx2; + sse2fct = jsimd_extrgb_gray_convert_sse2; + mmxfct = jsimd_extrgb_gray_convert_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_extrgbx_gray_convert_avx2; + sse2fct = jsimd_extrgbx_gray_convert_sse2; + mmxfct = jsimd_extrgbx_gray_convert_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_extbgr_gray_convert_avx2; + sse2fct = jsimd_extbgr_gray_convert_sse2; + mmxfct = jsimd_extbgr_gray_convert_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_extbgrx_gray_convert_avx2; + sse2fct = jsimd_extbgrx_gray_convert_sse2; + mmxfct = jsimd_extbgrx_gray_convert_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_extxbgr_gray_convert_avx2; + sse2fct = jsimd_extxbgr_gray_convert_sse2; + mmxfct = jsimd_extxbgr_gray_convert_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_extxrgb_gray_convert_avx2; + sse2fct = jsimd_extxrgb_gray_convert_sse2; + mmxfct = jsimd_extxrgb_gray_convert_mmx; + break; + default: + avx2fct = jsimd_rgb_gray_convert_avx2; + sse2fct = 
jsimd_rgb_gray_convert_sse2; + mmxfct = jsimd_rgb_gray_convert_mmx; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else + mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_ycc_extrgb_convert_avx2; + sse2fct = jsimd_ycc_extrgb_convert_sse2; + mmxfct = jsimd_ycc_extrgb_convert_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_ycc_extrgbx_convert_avx2; + sse2fct = jsimd_ycc_extrgbx_convert_sse2; + mmxfct = jsimd_ycc_extrgbx_convert_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_ycc_extbgr_convert_avx2; + sse2fct = jsimd_ycc_extbgr_convert_sse2; + mmxfct = jsimd_ycc_extbgr_convert_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_ycc_extbgrx_convert_avx2; + sse2fct = jsimd_ycc_extbgrx_convert_sse2; + mmxfct = jsimd_ycc_extbgrx_convert_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_ycc_extxbgr_convert_avx2; + sse2fct = jsimd_ycc_extxbgr_convert_sse2; + mmxfct = jsimd_ycc_extxbgr_convert_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_ycc_extxrgb_convert_avx2; + sse2fct = jsimd_ycc_extxrgb_convert_sse2; + mmxfct = jsimd_ycc_extxrgb_convert_mmx; + break; + default: + avx2fct = jsimd_ycc_rgb_convert_avx2; + sse2fct = jsimd_ycc_rgb_convert_sse2; + mmxfct = jsimd_ycc_rgb_convert_mmx; + break; + } + + if (simd_support & 
JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + else + mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else + jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if 
(simd_support & JSIMD_AVX2) + jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else + jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else + jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + 
jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else + jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else + jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY 
input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else + jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extrgb_merged_upsample_mmx; + break; + case 
JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extrgbx_merged_upsample_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extbgr_merged_upsample_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extbgrx_merged_upsample_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extxbgr_merged_upsample_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extxrgb_merged_upsample_mmx; + break; + default: + avx2fct = jsimd_h2v2_merged_upsample_avx2; + sse2fct = jsimd_h2v2_merged_upsample_sse2; + mmxfct = jsimd_h2v2_merged_upsample_mmx; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else + mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2; + mmxfct = 
jsimd_h2v1_extrgb_merged_upsample_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extrgbx_merged_upsample_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extbgr_merged_upsample_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extbgrx_merged_upsample_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extxbgr_merged_upsample_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extxrgb_merged_upsample_mmx; + break; + default: + avx2fct = jsimd_h2v1_merged_upsample_avx2; + sse2fct = jsimd_h2v1_merged_upsample_sse2; + mmxfct = jsimd_h2v1_merged_upsample_mmx; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else + mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + 
init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_SSE) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + if (simd_support & JSIMD_AVX2) + jsimd_convsamp_avx2(sample_data, start_col, workspace); + else if (simd_support & JSIMD_SSE2) + jsimd_convsamp_sse2(sample_data, start_col, workspace); + else + jsimd_convsamp_mmx(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ + if (simd_support & JSIMD_SSE2) + jsimd_convsamp_float_sse2(sample_data, start_col, workspace); + else if (simd_support & JSIMD_SSE) + jsimd_convsamp_float_sse(sample_data, start_col, workspace); + else + jsimd_convsamp_float_3dnow(sample_data, start_col, workspace); +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + init_simd(); + + /* The code is optimised for these values only 
*/ + if (DCTSIZE != 8) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_fdct_islow_avx2(data); + else if (simd_support & JSIMD_SSE2) + jsimd_fdct_islow_sse2(data); + else + jsimd_fdct_islow_mmx(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) + jsimd_fdct_ifast_sse2(data); + else + jsimd_fdct_ifast_mmx(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + jsimd_fdct_float_sse(data); + else if (simd_support & JSIMD_3DNOW) + jsimd_fdct_float_3dnow(data); +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_SSE) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + if (simd_support & JSIMD_AVX2) + jsimd_quantize_avx2(coef_block, divisors, workspace); + else if (simd_support & JSIMD_SSE2) + jsimd_quantize_sse2(coef_block, divisors, workspace); + else + jsimd_quantize_mmx(coef_block, divisors, workspace); +} +
+GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ + if (simd_support & JSIMD_SSE2) + jsimd_quantize_float_sse2(coef_block, divisors, workspace); + else if (simd_support & JSIMD_SSE) + jsimd_quantize_float_sse(coef_block, divisors, workspace); + else + jsimd_quantize_float_3dnow(coef_block, divisors, workspace); +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, 
output_buf, + output_col); + else + jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + if (sizeof(FLOAT_MULT_TYPE) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + return 1; + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_AVX2) + jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf, + 
output_col); + else if (simd_support & JSIMD_SSE2) + jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) + jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && simd_huffman && + IS_ALIGNED_SSE(jconst_huff_encode_one_block)) + return 1; + + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val, + dctbl, actbl); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 4) + return 0; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; 
+} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ + jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, + Sl, Al, values, zerobits); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 4) + return 0; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return jsimd_encode_mcu_AC_refine_prepare_sse2(block, + jpeg_natural_order_start, + Sl, Al, absvalues, bits); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jsimdcpu.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jsimdcpu.asm new file mode 100644 index 0000000000..ddcafa9e21 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/i386/jsimdcpu.asm @@ -0,0 +1,135 @@ +; +; jsimdcpu.asm - SIMD instruction support check +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Check if the CPU supports SIMD instructions +; +; GLOBAL(unsigned int) +; jpeg_simd_cpu_support(void) +; + + align 32 + GLOBAL_FUNCTION(jpeg_simd_cpu_support) + +EXTN(jpeg_simd_cpu_support): + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused + push edi + + xor edi, edi ; simd support flag + + pushfd + pop eax + mov edx, eax + xor eax, 1<<21 ; flip ID bit in EFLAGS + push eax + popfd + pushfd + pop eax + xor eax, edx + jz near .return ; CPUID is not supported + + ; Check whether CPUID leaf 07H is supported + ; (leaf 07H is used to check for AVX2 instruction support) + xor eax, eax + cpuid + test eax, eax + jz near .return + cmp eax, 7 + jl short .no_avx2 ; Maximum leaf < 07H + + ; Check for AVX2 instruction support + mov eax, 7 + xor ecx, ecx + cpuid + mov eax, ebx + test eax, 1<<5 ; bit5:AVX2 + jz short .no_avx2 + + ; Check for AVX2 O/S support + mov eax, 1 + xor ecx, ecx + cpuid + test ecx, 1<<27 + jz short .no_avx2 ; O/S does not support XSAVE + test ecx, 1<<28 + jz short .no_avx2 ; CPU does not support AVX2 + + xor ecx, ecx + xgetbv + and eax, 6 + cmp eax, 6 ; O/S does not manage XMM/YMM state + ; using XSAVE + jnz short .no_avx2 + + or edi, JSIMD_AVX2 +.no_avx2: + + ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support + xor eax, eax + inc eax + cpuid + mov eax, edx ; eax = Standard feature flags + + ; Check for MMX instruction support + test eax, 1<<23 ; bit23:MMX + jz short .no_mmx + or edi, byte JSIMD_MMX +.no_mmx: + test eax, 1<<25 ; bit25:SSE + jz short .no_sse + or edi, byte JSIMD_SSE +.no_sse: + test eax, 1<<26 ; bit26:SSE2 + jz short .no_sse2 + or edi, byte JSIMD_SSE2 +.no_sse2: + + ; Check for 3DNow! 
instruction support + mov eax, 0x80000000 + cpuid + cmp eax, 0x80000000 + jbe short .return + + mov eax, 0x80000001 + cpuid + mov eax, edx ; eax = Extended feature flags + + test eax, 1<<31 ; bit31:3DNow!(vendor independent) + jz short .no_3dnow + or edi, byte JSIMD_3DNOW +.no_3dnow: + +.return: + mov eax, edi + + pop edi +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/jsimd.h b/3rdparty/libjpeg-turbo/src/simd/jsimd.h new file mode 100644 index 0000000000..64747c6360 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/jsimd.h @@ -0,0 +1,1258 @@ +/* + * simd/jsimd.h + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander. + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * Copyright (C) 2014, Linaro Limited. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * Copyright (C) 2020, Arm Limited. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. 
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + */ + +/* Bitmask for supported acceleration methods */ + +#define JSIMD_NONE 0x00 +#define JSIMD_MMX 0x01 +#define JSIMD_3DNOW 0x02 +#define JSIMD_SSE 0x04 +#define JSIMD_SSE2 0x08 +#define JSIMD_NEON 0x10 +#define JSIMD_DSPR2 0x20 +#define JSIMD_ALTIVEC 0x40 +#define JSIMD_AVX2 0x80 +#define JSIMD_MMI 0x100 + +/* SIMD Ext: retrieve SIMD/CPU information */ +EXTERN(unsigned int) jpeg_simd_cpu_support(void); + +/* RGB & extended RGB --> YCC Colorspace Conversion */ +EXTERN(void) jsimd_rgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_ycc_convert_sse2[]; +EXTERN(void) jsimd_rgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) 
jsimd_extrgbx_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_ycc_convert_avx2[]; +EXTERN(void) jsimd_rgb_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE 
output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +#ifndef NEON_INTRINSICS + +EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +#endif + +EXTERN(void) jsimd_rgb_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) 
jsimd_extbgrx_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); 
+EXTERN(void) jsimd_extbgr_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +/* RGB & extended RGB --> Grayscale Colorspace Conversion */ +EXTERN(void) jsimd_rgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_gray_convert_sse2[]; +EXTERN(void) jsimd_rgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) 
jsimd_extrgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_gray_convert_avx2[]; +EXTERN(void) jsimd_rgb_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, 
JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_dspr2 + (JDIMENSION 
img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) 
jsimd_extbgrx_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +/* YCC --> RGB & extended RGB Colorspace Conversion */ +EXTERN(void) jsimd_ycc_rgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +extern const int jconst_ycc_rgb_convert_sse2[]; +EXTERN(void) jsimd_ycc_rgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_sse2 + (JDIMENSION 
out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +extern const int jconst_ycc_rgb_convert_avx2[]; +EXTERN(void) jsimd_ycc_rgb_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) 
jsimd_ycc_extrgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_rgb565_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +#ifndef NEON_INTRINSICS + +EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +#endif + +EXTERN(void) jsimd_ycc_rgb_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + 
JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + 
JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +/* NULL Colorspace Conversion */ +EXTERN(void) jsimd_c_null_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows, int num_components); + +/* h2v1 Downsampling */ +EXTERN(void) jsimd_h2v1_downsample_mmx + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_sse2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_avx2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_dspr2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_altivec + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION 
v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +/* h2v2 Downsampling */ +EXTERN(void) jsimd_h2v2_downsample_mmx + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_sse2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_avx2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_dspr2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_mmi + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_altivec + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +/* h2v2 Smooth Downsampling */ +EXTERN(void) jsimd_h2v2_smooth_downsample_dspr2 + (JSAMPARRAY input_data, JSAMPARRAY output_data, JDIMENSION v_samp_factor, + int max_v_samp_factor, int smoothing_factor, JDIMENSION width_in_blocks, + JDIMENSION image_width); + + +/* Upsampling */ +EXTERN(void) jsimd_h2v1_upsample_mmx + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_mmx + (int max_v_samp_factor, 
JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_sse2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_sse2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_avx2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_avx2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_neon + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_neon + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_dspr2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_dspr2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_int_upsample_dspr2 + (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr, JDIMENSION output_width, + int max_v_samp_factor); + +EXTERN(void) jsimd_h2v1_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +/* Fancy Upsampling */ +EXTERN(void) jsimd_h2v1_fancy_upsample_mmx + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_mmx + (int max_v_samp_factor, JDIMENSION 
downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +extern const int jconst_fancy_upsample_sse2[]; +EXTERN(void) jsimd_h2v1_fancy_upsample_sse2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_sse2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +extern const int jconst_fancy_upsample_avx2[]; +EXTERN(void) jsimd_h2v1_fancy_upsample_avx2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_avx2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h1v2_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_mmi + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_mmi + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_altivec + (int max_v_samp_factor, JDIMENSION 
downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_altivec + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +/* Merged Upsampling */ +EXTERN(void) jsimd_h2v1_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) 
jsimd_h2v2_extbgrx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +extern const int jconst_merged_upsample_sse2[]; +EXTERN(void) jsimd_h2v1_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2 + (JDIMENSION 
output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +extern const int jconst_merged_upsample_avx2[]; +EXTERN(void) jsimd_h2v1_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION 
in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v1_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon + 
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v1_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE 
input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); + +EXTERN(void) jsimd_h2v2_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); + +EXTERN(void) jsimd_h2v1_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) 
jsimd_h2v1_extrgbx_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v1_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION 
in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) 
jsimd_h2v2_extxrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +/* Sample Conversion */ +EXTERN(void) jsimd_convsamp_mmx + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_sse2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_avx2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_neon + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_dspr2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_altivec + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +/* Floating Point Sample Conversion */ +EXTERN(void) jsimd_convsamp_float_3dnow + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_sse + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_sse2 + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_dspr2 + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +/* Accurate Integer Forward DCT */ +EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data); + +extern const int jconst_fdct_islow_sse2[]; +EXTERN(void) jsimd_fdct_islow_sse2(DCTELEM *data); + +extern const int jconst_fdct_islow_avx2[]; +EXTERN(void) jsimd_fdct_islow_avx2(DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_neon(DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_dspr2(DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_mmi(DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_altivec(DCTELEM *data); + +/* Fast Integer Forward DCT */ +EXTERN(void) jsimd_fdct_ifast_mmx(DCTELEM *data); + +extern const int jconst_fdct_ifast_sse2[]; +EXTERN(void) jsimd_fdct_ifast_sse2(DCTELEM *data); 
+ +EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data); + +/* Floating Point Forward DCT */ +EXTERN(void) jsimd_fdct_float_3dnow(FAST_FLOAT *data); + +extern const int jconst_fdct_float_sse[]; +EXTERN(void) jsimd_fdct_float_sse(FAST_FLOAT *data); + +/* Quantization */ +EXTERN(void) jsimd_quantize_mmx + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_sse2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_avx2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_neon + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_dspr2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_mmi + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_altivec + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +/* Floating Point Quantization */ +EXTERN(void) jsimd_quantize_float_3dnow + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_sse + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_sse2 + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_dspr2 + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +/* Scaled Inverse DCT */ +EXTERN(void) jsimd_idct_2x2_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_red_sse2[]; +EXTERN(void) jsimd_idct_2x2_sse2 + (void *dct_table, JCOEFPTR 
coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_2x2_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_2x2_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col, int *workspace); +EXTERN(void) jsimd_idct_6x6_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_12x12_pass1_dspr2 + (JCOEFPTR coef_block, void *dct_table, int *workspace); +EXTERN(void) jsimd_idct_12x12_pass2_dspr2 + (int *workspace, int *output); + +/* Accurate Integer Inverse DCT */ +EXTERN(void) jsimd_idct_islow_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_islow_sse2[]; +EXTERN(void) jsimd_idct_islow_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_islow_avx2[]; +EXTERN(void) jsimd_idct_islow_avx2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_dspr2 + (void *dct_table, JCOEFPTR coef_block, int *output_buf, JSAMPLE *output_col); + +EXTERN(void) jsimd_idct_islow_mmi + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_altivec + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + 
JDIMENSION output_col); + +/* Fast Integer Inverse DCT */ +EXTERN(void) jsimd_idct_ifast_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_ifast_sse2[]; +EXTERN(void) jsimd_idct_ifast_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_ifast_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_ifast_cols_dspr2 + (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr, + const int *idct_coefs); +EXTERN(void) jsimd_idct_ifast_rows_dspr2 + (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col, + const int *idct_coefs); + +EXTERN(void) jsimd_idct_ifast_mmi + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_ifast_altivec + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Floating Point Inverse DCT */ +EXTERN(void) jsimd_idct_float_3dnow + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_float_sse[]; +EXTERN(void) jsimd_idct_float_sse + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_float_sse2[]; +EXTERN(void) jsimd_idct_float_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Huffman coding */ +extern const int jconst_huff_encode_one_block[]; +EXTERN(JOCTET *) jsimd_huff_encode_one_block_sse2 + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +#ifndef NEON_INTRINSICS + +EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl 
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +#endif + +/* Progressive Huffman encoding */ +EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2 + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *values, size_t *zerobits); + +EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *values, size_t *zerobits); + +EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2 + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *absvalues, size_t *bits); + +EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *absvalues, size_t *bits); diff --git a/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c new file mode 100644 index 0000000000..d2546eed32 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c @@ -0,0 +1,1147 @@ +/* + * jsimd_mips.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, D. R. Commander. + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * MIPS architecture. 
+ */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + /* Standard headers: stdio supplies FILE/fopen/fgets/fclose and string supplies strstr/strcmp, all used below. NOTE(review): the three directives had lost their operands; the third is assumed to be ctype.h - confirm against the pristine file. */ +#include <stdio.h> +#include <string.h> +#include <ctype.h> + /* Bitmask of detected SIMD features. The initial ~0 is the "not probed yet" sentinel that init_simd() compares against. */ +static unsigned int simd_support = ~0; + /* parse_proc_cpuinfo() is only compiled when DSPr2 was not enabled at compile time and the target is Linux. */ +#if !(defined(__mips_dsp) && (__mips_dsp_rev >= 2)) && defined(__linux__) + /* Resets simd_support to 0, then reads /proc/cpuinfo line by line (255 characters at most per read) and sets JSIMD_DSPR2 as soon as a line contains search_string. If the file cannot be opened or nothing matches, simd_support stays 0. The stream is closed on every path. */ +LOCAL(void) +parse_proc_cpuinfo(const char *search_string) +{ + const char *file_name = "/proc/cpuinfo"; + char cpuinfo_line[256]; + FILE *f = NULL; + + simd_support = 0; + + if ((f = fopen(file_name, "r")) != NULL) { + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) { + if (strstr(cpuinfo_line, search_string) != NULL) { + fclose(f); + simd_support |= JSIMD_DSPR2; + return; + } + } + fclose(f); + } + /* Did not find string in the proc file, or not Linux ELF. */ +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char *env = NULL; +#endif + /* Already probed: anything other than the ~0 sentinel means a previous call finished. */ + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) + simd_support |= JSIMD_DSPR2; +#elif defined(__linux__) + /* We still have a chance to use MIPS DSPR2 regardless of globally used + * -mdspr2 options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux */ + parse_proc_cpuinfo("MIPS 74K"); +#endif + +#ifndef NO_GETENV + /* Force different settings through environment variables */ /* Both overrides require the exact value "1"; JSIMD_FORCENONE is evaluated last and therefore wins when both are set. */ + env = getenv("JSIMD_FORCEDSPR2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_DSPR2; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +#endif +} + /* Each entry repeats one 16-bit constant in both halfwords. The stored values differ from the FIX() results quoted per entry only in their lowest bits. */ +static const int mips_idct_ifast_coefs[4] = { + 0x45404540, /* FIX( 1.082392200 / 2) = 17734 = 0x4546 */ + 0x5A805A80, /* FIX( 1.414213562 / 2) = 23170 = 0x5A82 */ + 0x76407640, /* FIX( 1.847759065 / 2) = 30274 = 0x7642 */ + 0xAC60AC60 /* 
FIX(-2.613125930 / 4) = -21407 = 0xAC61 */ +}; + +/* The following struct is borrowed from jdsample.c */ /* jsimd_int_upsample() in this file casts cinfo->upsample to my_upsample_ptr and reads h_expand[] / v_expand[]; NOTE(review): the layout presumably has to stay in step with jdsample.c - confirm when updating either file. */ +typedef void (*upsample1_ptr) (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +typedef struct { + struct jpeg_upsampler pub; + JSAMPARRAY color_buf[MAX_COMPONENTS]; + upsample1_ptr methods[MAX_COMPONENTS]; + int next_row_out; + JDIMENSION rows_to_go; + int rowgroup_height[MAX_COMPONENTS]; + UINT8 h_expand[MAX_COMPONENTS]; + UINT8 v_expand[MAX_COMPONENTS]; +} my_upsampler; + +typedef my_upsampler *my_upsample_ptr; + /* Capability probe paired with jsimd_rgb_ycc_convert(). Returns 1 only when samples are 8-bit, JDIMENSION is 4 bytes, RGB_PIXELSIZE is 3 or 4 and JSIMD_DSPR2 is set in simd_support; otherwise 0. */ +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + /* Capability probe paired with jsimd_rgb_gray_convert(); same conditions as jsimd_can_rgb_ycc(). */ +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + /* Capability probe paired with jsimd_ycc_rgb_convert(); same conditions as jsimd_can_rgb_ycc(). */ +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + /* Always 0: the matching jsimd_ycc_rgb565_convert() in this file has an empty body. */ +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + /* Capability probe paired with jsimd_c_null_convert(): needs 8-bit samples, 4-byte JDIMENSION and JSIMD_DSPR2. */ +GLOBAL(int) +jsimd_c_can_null_convert(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, 
JDIMENSION output_row, + int num_rows) +{ + void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + /* Pick the DSPr2 routine from cinfo->in_color_space. The X and A variants of a layout share one routine, and any colorspace not listed falls back to the extrgb routine. */ + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_extrgb_ycc_convert_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_extrgbx_ycc_convert_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_extbgr_ycc_convert_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_extbgrx_ycc_convert_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = jsimd_extxbgr_ycc_convert_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_extxrgb_ycc_convert_dspr2; + break; + default: + dspr2fct = jsimd_extrgb_ycc_convert_dspr2; + break; + } + /* The row width passed down is cinfo->image_width. */ + dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + /* Dispatches to the DSPr2 *_gray_convert routine selected by cinfo->in_color_space, using the same case grouping and extrgb default as jsimd_rgb_ycc_convert(), and forwards cinfo->image_width plus the caller's buffers and row arguments unchanged. */ +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_extrgb_gray_convert_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_extrgbx_gray_convert_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_extbgr_gray_convert_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_extbgrx_gray_convert_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = jsimd_extxbgr_gray_convert_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_extxrgb_gray_convert_dspr2; + break; + default: + dspr2fct = jsimd_extrgb_gray_convert_dspr2; + break; + } + + dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, 
JDIMENSION, JSAMPARRAY, int); + /* Decompression side: the routine is chosen from cinfo->out_color_space; X and A variants share a routine and unlisted colorspaces fall back to extrgb. */ + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_ycc_extrgb_convert_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_ycc_extrgbx_convert_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_ycc_extbgr_convert_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_ycc_extbgrx_convert_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = jsimd_ycc_extxbgr_convert_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_ycc_extxrgb_convert_dspr2; + break; + default: + dspr2fct = jsimd_ycc_extrgb_convert_dspr2; + break; + } + /* The row width passed down is cinfo->output_width. */ + dspr2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + /* Empty stub: jsimd_can_ycc_rgb565() in this file always returns 0. */ +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + /* Thin wrapper: forwards cinfo->image_width, the buffers, the row arguments and cinfo->num_components to jsimd_c_null_convert_dspr2(). */ +GLOBAL(void) +jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf, + output_row, num_rows, cinfo->num_components); +} + /* Capability probe paired with jsimd_h2v2_downsample(). */ +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + /* FIXME: jsimd_h2v2_downsample_dspr2() fails some of the TJBench tiling + * regression tests, probably because the DSPr2 SIMD implementation predates + * those tests. 
*/ +#if 0 + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + /* With the block above compiled out, this probe always returns 0. */ + return 0; +} + /* Capability probe paired with jsimd_h2v2_smooth_downsample(): needs 8-bit samples, 4-byte JDIMENSION, DCTSIZE of 8 and JSIMD_DSPR2. */ +GLOBAL(int) +jsimd_can_h2v2_smooth_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (DCTSIZE != 8) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + /* Capability probe paired with jsimd_h2v1_downsample(). The DSPr2 branch is compiled out (see FIXME below), so this always returns 0. */ +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + /* FIXME: jsimd_h2v1_downsample_dspr2() fails some of the TJBench tiling + * regression tests, probably because the DSPr2 SIMD implementation predates + * those tests. */ +#if 0 + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + /* Thin wrapper: passes image width, max and per-component vertical sampling factors, width in blocks and the two sample arrays to jsimd_h2v2_downsample_dspr2(). */ +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + /* Thin wrapper around jsimd_h2v2_smooth_downsample_dspr2(). Note the argument order differs from the plain downsample wrappers: the sample arrays come first and cinfo->smoothing_factor is passed as well. */ +GLOBAL(void) +jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data, + compptr->v_samp_factor, + cinfo->max_v_samp_factor, + cinfo->smoothing_factor, + compptr->width_in_blocks, + cinfo->image_width); +} + /* Thin wrapper: same argument list as jsimd_h2v2_downsample(), forwarded to jsimd_h2v1_downsample_dspr2(). */ +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + /* Capability probe paired with jsimd_h2v2_upsample(): needs 8-bit samples, 4-byte JDIMENSION and JSIMD_DSPR2. */ +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + 
+ if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + /* Capability probe paired with jsimd_h2v1_upsample(). Unlike the h2v2 probe, the DSPr2 answer is only compiled in when __MIPSEL__ is defined; otherwise this returns 0. */ +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + /* Capability probe paired with jsimd_int_upsample(): needs 8-bit samples, 4-byte JDIMENSION and JSIMD_DSPR2. */ +GLOBAL(int) +jsimd_can_int_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + /* Thin wrapper: forwards cinfo->max_v_samp_factor, cinfo->output_width and the two data pointers to jsimd_h2v2_upsample_dspr2(). compptr is not used. */ +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + /* Thin wrapper: same arguments as jsimd_h2v2_upsample(), forwarded to jsimd_h2v1_upsample_dspr2(). compptr is not used. */ +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + /* Looks up this component's h_expand / v_expand factors (indexed by compptr->component_index) in the upsampler state reached through cinfo->upsample, then calls jsimd_int_upsample_dspr2(). The cast relies on the my_upsampler typedef declared in this file. */ +GLOBAL(void) +jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample; + + jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index], + upsample->v_expand[compptr->component_index], + input_data, output_data_ptr, cinfo->output_width, + cinfo->max_v_samp_factor); +} + /* Capability probe paired with jsimd_h2v2_fancy_upsample(); the DSPr2 answer is only compiled in when __MIPSEL__ is defined. */ +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + /* Capability probe paired with jsimd_h2v1_fancy_upsample(); same conditions as the h2v2 fancy probe. */ +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if 
(BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + /* Thin wrapper: forwards cinfo->max_v_samp_factor, compptr->downsampled_width and the two data pointers to jsimd_h2v2_fancy_upsample_dspr2(). */ +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + /* Thin wrapper: same arguments as jsimd_h2v2_fancy_upsample(), forwarded to jsimd_h2v1_fancy_upsample_dspr2(). */ +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + /* Capability probe paired with jsimd_h2v2_merged_upsample(): needs 8-bit samples, 4-byte JDIMENSION and JSIMD_DSPR2. */ +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + /* Capability probe paired with jsimd_h2v1_merged_upsample(); same conditions as the h2v2 merged probe. */ +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + /* Dispatches on cinfo->out_color_space to one of the jsimd_h2v2_*_merged_upsample_dspr2 routines. X and A variants share a routine and unlisted colorspaces fall back to extrgb. */ +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_h2v2_extrgbx_merged_upsample_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_h2v2_extbgr_merged_upsample_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_h2v2_extbgrx_merged_upsample_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = 
jsimd_h2v2_extxbgr_merged_upsample_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_h2v2_extxrgb_merged_upsample_dspr2; + break; + default: + dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2; + break; + } + /* The DSPr2 routines take one extra argument: cinfo->sample_range_limit. */ + dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf, + cinfo->sample_range_limit); +} + /* Dispatches on cinfo->out_color_space to one of the jsimd_h2v1_*_merged_upsample_dspr2 routines (X and A variants share a routine, default is extrgb) and passes cinfo->output_width, the buffers, the row-group counter and cinfo->sample_range_limit. */ +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_h2v1_extrgbx_merged_upsample_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_h2v1_extbgr_merged_upsample_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_h2v1_extbgrx_merged_upsample_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = jsimd_h2v1_extxbgr_merged_upsample_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_h2v1_extxrgb_merged_upsample_dspr2; + break; + default: + dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2; + break; + } + + dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf, + cinfo->sample_range_limit); +} + /* Capability probe paired with jsimd_convsamp(): needs DCTSIZE of 8, 8-bit samples, 4-byte JDIMENSION and 2-byte DCTELEM; the DSPr2 answer is only compiled in when __MIPSEL__ is defined. */ +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + /* Capability probe paired with jsimd_convsamp_float(); the DSPr2 answer is compiled out when __mips_soft_float is defined. */ +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if 
(sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + +#ifndef __mips_soft_float + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + /* Thin wrapper around jsimd_convsamp_dspr2(); arguments are forwarded unchanged. */ +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_dspr2(sample_data, start_col, workspace); +} + /* Thin wrapper around jsimd_convsamp_float_dspr2(). When __mips_soft_float is defined the body is empty, matching jsimd_can_convsamp_float() returning 0 in that configuration. */ +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +#ifndef __mips_soft_float + jsimd_convsamp_float_dspr2(sample_data, start_col, workspace); +#endif +} + /* Capability probe paired with jsimd_fdct_islow(): needs DCTSIZE of 8 and 2-byte DCTELEM; the DSPr2 answer is only compiled in when __MIPSEL__ is defined. */ +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + /* Capability probe paired with jsimd_fdct_ifast(); same conditions as jsimd_can_fdct_islow(). */ +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + /* Always 0: jsimd_fdct_float() in this file has an empty body. */ +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + return 0; +} + /* Thin wrapper around jsimd_fdct_islow_dspr2(). */ +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + jsimd_fdct_islow_dspr2(data); +} + /* Thin wrapper around jsimd_fdct_ifast_dspr2(). */ +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_dspr2(data); +} + /* Empty stub; see jsimd_can_fdct_float(). */ +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ +} + /* Capability probe paired with jsimd_quantize(): needs DCTSIZE of 8, 2-byte JCOEF, 2-byte DCTELEM and JSIMD_DSPR2. */ +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + /* Capability probe paired with jsimd_quantize_float(); the DSPr2 answer is compiled out when __mips_soft_float is defined. */ +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if 
(sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + +#ifndef __mips_soft_float + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + jsimd_quantize_dspr2(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +#ifndef __mips_soft_float + jsimd_quantize_float_dspr2(coef_block, divisors, workspace); +#endif +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_6x6(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_12x12(void) +{ + init_simd(); + + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + 
return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + int workspace[DCTSIZE * 4]; /* buffers data between passes */ + + jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col, + workspace); +} + +GLOBAL(void) +jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + int workspace[96]; + int output[12] = { + (int)(output_buf[0] + output_col), + (int)(output_buf[1] + output_col), + (int)(output_buf[2] + output_col), + (int)(output_buf[3] + output_col), + (int)(output_buf[4] + output_col), + (int)(output_buf[5] + output_col), + (int)(output_buf[6] + output_col), + (int)(output_buf[7] + output_col), + (int)(output_buf[8] + output_col), + (int)(output_buf[9] + output_col), + (int)(output_buf[10] + output_col), + (int)(output_buf[11] + output_col) + }; + + jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace); + jsimd_idct_12x12_pass2_dspr2(workspace, output); +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + 
+GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + int output[8] = { /* output row addresses cast to int; assumes pointers fit in an int -- TODO confirm for every supported ABI */ + (int)(output_buf[0] + output_col), + (int)(output_buf[1] + output_col), + (int)(output_buf[2] + output_col), + (int)(output_buf[3] + output_col), + (int)(output_buf[4] + output_col), + (int)(output_buf[5] + output_col), + (int)(output_buf[6] + output_col), + (int)(output_buf[7] + output_col) + }; + + jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output, + IDCT_range_limit(cinfo)); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + JCOEFPTR inptr; + IFAST_MULT_TYPE *quantptr; + DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */ + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (IFAST_MULT_TYPE *)compptr->dct_table; + + jsimd_idct_ifast_cols_dspr2(inptr, quantptr, workspace, + mips_idct_ifast_coefs); + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. 
*/ + + jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col, + mips_idct_ifast_coefs); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ /* empty stub; jsimd_can_idct_float() always returns 0 */ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2.S b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2.S new file mode 100644 index 0000000000..c99288a8d1 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2.S @@ -0,0 +1,4543 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * All Rights Reserved. + * Authors: Teodora Novkovic + * Darko Laus + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. 
The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#include "jsimd_dspr2_asm.h" + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_c_null_convert_dspr2) +/* + * a0 = cinfo->image_width + * a1 = input_buf + * a2 = output_buf + * a3 = output_row + * 16(sp) = num_rows + * 20(sp) = cinfo->num_components + * + * Null conversion for compression + */ + SAVE_REGS_ON_STACK 8, s0, s1 + + lw t9, 24(sp) /* t9 = num_rows */ + lw s0, 28(sp) /* s0 = cinfo->num_components */ + andi t0, a0, 3 /* t0 = cinfo->image_width & 3 */ + beqz t0, 4f /* no residual */ + nop +0: + addiu t9, t9, -1 + bltz t9, 7f + li t1, 0 +1: + sll t3, t1, 2 + lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */ + lw t2, 0(a1) /* t2 = inptr = *input_buf */ + sll t4, a3, 2 + lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */ + addu t2, t2, t1 + addu s1, t5, a0 + addu t6, t5, t0 +2: + lbu t3, 0(t2) + addiu t5, t5, 1 + sb t3, -1(t5) + bne t6, t5, 2b + addu t2, t2, s0 +3: + lbu t3, 0(t2) + addu t4, t2, s0 + addu t7, t4, s0 + addu t8, t7, s0 + addu t2, t8, s0 + lbu t4, 0(t4) + lbu t7, 0(t7) + lbu t8, 0(t8) + addiu t5, t5, 4 + sb t3, -4(t5) + sb t4, -3(t5) + sb t7, -2(t5) + bne s1, t5, 3b + sb t8, -1(t5) + addiu t1, t1, 1 + bne t1, s0, 1b + nop + addiu a1, a1, 4 + bgez t9, 0b + addiu a3, a3, 1 + b 7f + nop +4: + addiu t9, t9, -1 + bltz t9, 7f + li t1, 0 +5: + sll t3, t1, 2 + lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */ + lw t2, 0(a1) /* t2 = inptr = *input_buf */ + sll t4, a3, 2 + lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */ 
+ addu t2, t2, t1 + addu s1, t5, a0 + addu t6, t5, t0 +6: + lbu t3, 0(t2) + addu t4, t2, s0 + addu t7, t4, s0 + addu t8, t7, s0 + addu t2, t8, s0 + lbu t4, 0(t4) + lbu t7, 0(t7) + lbu t8, 0(t8) + addiu t5, t5, 4 + sb t3, -4(t5) + sb t4, -3(t5) + sb t7, -2(t5) + bne s1, t5, 6b + sb t8, -1(t5) + addiu t1, t1, 1 + bne t1, s0, 5b + nop + addiu a1, a1, 4 + bgez t9, 4b + addiu a3, a3, 1 +7: + RESTORE_REGS_FROM_STACK 8, s0, s1 + + j ra + nop + +END(jsimd_c_null_convert_dspr2) + + +/*****************************************************************************/ +/* + * jsimd_extrgb_ycc_convert_dspr2 + * jsimd_extbgr_ycc_convert_dspr2 + * jsimd_extrgbx_ycc_convert_dspr2 + * jsimd_extbgrx_ycc_convert_dspr2 + * jsimd_extxbgr_ycc_convert_dspr2 + * jsimd_extxrgb_ycc_convert_dspr2 + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \ + r_offs, g_offs, b_offs + +.macro DO_RGB_TO_YCC r, g, b, inptr + lbu \r, \r_offs(\inptr) + lbu \g, \g_offs(\inptr) + lbu \b, \b_offs(\inptr) + addiu \inptr, \pixel_size +.endm + +LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2) +/* + * a0 = cinfo->image_width + * a1 = input_buf + * a2 = output_buf + * a3 = output_row + * 16(sp) = num_rows + */ + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw t7, 48(sp) /* t7 = num_rows */ + li s0, 0x4c8b /* FIX(0.29900) */ + li s1, 0x9646 /* FIX(0.58700) */ + li s2, 0x1d2f /* FIX(0.11400) */ + li s3, 0xffffd4cd /* -FIX(0.16874) */ + li s4, 0xffffab33 /* -FIX(0.33126) */ + li s5, 0x8000 /* FIX(0.50000) */ + li s6, 0xffff94d1 /* -FIX(0.41869) */ + li s7, 0xffffeb2f /* -FIX(0.08131) */ + li t8, 0x807fff /* CBCR_OFFSET + ONE_HALF-1 */ + +0: + addiu t7, -1 /* --num_rows */ + lw t6, 0(a1) /* t6 = input_buf[0] */ + lw t0, 0(a2) + lw t1, 4(a2) + lw t2, 8(a2) + sll t3, a3, 2 + lwx t0, t3(t0) /* t0 = output_buf[0][output_row] */ + lwx t1, t3(t1) /* t1 = output_buf[1][output_row] */ + lwx t2, t3(t2) /* t2 = output_buf[2][output_row] */ + + addu t9, 
t2, a0 /* t9 = end address */ + addiu a3, 1 + +1: + DO_RGB_TO_YCC t3, t4, t5, t6 + + mtlo s5, $ac0 + mtlo t8, $ac1 + mtlo t8, $ac2 + maddu $ac0, s2, t5 + maddu $ac1, s5, t5 + maddu $ac2, s5, t3 + maddu $ac0, s0, t3 + maddu $ac1, s3, t3 + maddu $ac2, s6, t4 + maddu $ac0, s1, t4 + maddu $ac1, s4, t4 + maddu $ac2, s7, t5 + extr.w t3, $ac0, 16 + extr.w t4, $ac1, 16 + extr.w t5, $ac2, 16 + sb t3, 0(t0) + sb t4, 0(t1) + sb t5, 0(t2) + addiu t0, 1 + addiu t2, 1 + bne t2, t9, 1b + addiu t1, 1 + bgtz t7, 0b + addiu a1, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_\colorid\()_ycc_convert_dspr2) + +.purgem DO_RGB_TO_YCC + +.endm + +/*-------------------------------------id -- pix R G B */ +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3 + + +/*****************************************************************************/ +/* + * jsimd_ycc_extrgb_convert_dspr2 + * jsimd_ycc_extbgr_convert_dspr2 + * jsimd_ycc_extrgbx_convert_dspr2 + * jsimd_ycc_extbgrx_convert_dspr2 + * jsimd_ycc_extxbgr_convert_dspr2 + * jsimd_ycc_extxrgb_convert_dspr2 + * + * Colorspace conversion YCbCr -> RGB + */ + +.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \ + r_offs, g_offs, b_offs, a_offs + +.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr + sb \scratch0, \r_offs(\outptr) + sb \scratch1, \g_offs(\outptr) + sb \scratch2, \b_offs(\outptr) +.if (\pixel_size == 4) + li t0, 0xFF + sb t0, \a_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2) +/* + * a0 = cinfo->image_width + * a1 = input_buf + * a2 = input_row + * a3 = output_buf + * 16(sp) = num_rows + */ + SAVE_REGS_ON_STACK 32, s0, 
s1, s2, s3, s4, s5, s6, s7 + + lw s1, 48(sp) /* s1 = num_rows */ + li t3, 0x8000 /* rounding term added to the green sum before the >> 16 */ + li t4, 0x166e9 /* FIX(1.40200) */ + li t5, 0x1c5a2 /* FIX(1.77200) */ + li t6, 0xffff492e /* -FIX(0.71414) */ + li t7, 0xffffa7e6 /* -FIX(0.34414) */ + repl.ph t8, 128 + +0: + lw s0, 0(a3) + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + sll s5, a2, 2 + addiu s1, -1 + lwx s2, s5(t0) + lwx s3, s5(t1) + lwx s4, s5(t2) + addu t9, s2, a0 + addiu a2, 1 + +1: + lbu s7, 0(s4) /* cr */ + lbu s6, 0(s3) /* cb */ + lbu s5, 0(s2) /* y */ + addiu s2, 1 + addiu s4, 1 + addiu s7, -128 + addiu s6, -128 + mul t2, t7, s6 + mul t0, t6, s7 /* Crgtab[cr] */ + sll s7, 15 + mulq_rs.w t1, t4, s7 /* Crrtab[cr] */ + sll s6, 15 + addu t2, t3 /* Cbgtab[cb] */ + addu t2, t0 + + mulq_rs.w t0, t5, s6 /* Cbbtab[cb] */ + sra t2, 16 + addu t1, s5 + addu t2, s5 /* add y */ + ins t2, t1, 16, 16 + subu.ph t2, t2, t8 + addu t0, s5 + shll_s.ph t2, t2, 8 + subu t0, 128 + shra.ph t2, t2, 8 + shll_s.w t0, t0, 24 + addu.ph t2, t2, t8 /* clip & store */ + sra t0, t0, 24 + sra t1, t2, 16 + addiu t0, 128 + + STORE_YCC_TO_RGB t1, t2, t0, s0 + + bne s2, t9, 1b + addiu s3, 1 + bgtz s1, 0b + addiu a3, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_ycc_\colorid\()_convert_dspr2) + +.purgem STORE_YCC_TO_RGB + +.endm + +/*-------------------------------------id -- pix R G B A */ +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0 + + +/*****************************************************************************/ +/* + * jsimd_extrgb_gray_convert_dspr2 + * jsimd_extbgr_gray_convert_dspr2 + * jsimd_extrgbx_gray_convert_dspr2 + * jsimd_extbgrx_gray_convert_dspr2 + * jsimd_extxbgr_gray_convert_dspr2 + * 
jsimd_extxrgb_gray_convert_dspr2 + * + * Colorspace conversion RGB -> GRAY + */ + +.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \ + r_offs, g_offs, b_offs + +.macro DO_RGB_TO_GRAY r, g, b, inptr + lbu \r, \r_offs(\inptr) + lbu \g, \g_offs(\inptr) + lbu \b, \b_offs(\inptr) + addiu \inptr, \pixel_size +.endm + +LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2) +/* + * a0 = cinfo->image_width + * a1 = input_buf + * a2 = output_buf + * a3 = output_row + * 16(sp) = num_rows + */ + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + li s0, 0x4c8b /* s0 = FIX(0.29900) */ + li s1, 0x9646 /* s1 = FIX(0.58700) */ + li s2, 0x1d2f /* s2 = FIX(0.11400) */ + li s7, 0x8000 /* s7 = FIX(0.50000) */ + lw s6, 48(sp) /* s6 = num_rows */ + andi t7, a0, 3 /* t7 = image_width & 3 (pixels left for the one-at-a-time loop) */ + +0: + addiu s6, -1 /* --num_rows */ + lw t0, 0(a1) + lw t1, 0(a2) + sll t3, a3, 2 + lwx t1, t3(t1) + addiu a3, 1 + addu t9, t1, a0 + subu t8, t9, t7 + beq t1, t8, 2f + nop + +1: + DO_RGB_TO_GRAY t3, t4, t5, t0 + DO_RGB_TO_GRAY s3, s4, s5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + maddu $ac0, s0, t3 + mtlo s7, $ac1 + maddu $ac1, s2, s5 + maddu $ac1, s1, s4 + maddu $ac1, s0, s3 + extr.w t6, $ac0, 16 + + DO_RGB_TO_GRAY t3, t4, t5, t0 + DO_RGB_TO_GRAY s3, s4, s5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + extr.w t2, $ac1, 16 + maddu $ac0, s0, t3 + mtlo s7, $ac1 + maddu $ac1, s2, s5 + maddu $ac1, s1, s4 + maddu $ac1, s0, s3 + extr.w t5, $ac0, 16 + sb t6, 0(t1) + sb t2, 1(t1) + extr.w t3, $ac1, 16 + addiu t1, 4 + sb t5, -2(t1) + sb t3, -1(t1) + bne t1, t8, 1b + nop + +2: + beqz t7, 4f + nop + +3: + DO_RGB_TO_GRAY t3, t4, t5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + maddu $ac0, s0, t3 + extr.w t6, $ac0, 16 + sb t6, 0(t1) + addiu t1, 1 + bne t1, t9, 3b + nop + +4: + bgtz s6, 0b + addiu a1, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_\colorid\()_gray_convert_dspr2) + +.purgem DO_RGB_TO_GRAY + +.endm + 
+/*-------------------------------------id -- pix R G B */ +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3 + + +/*****************************************************************************/ +/* + * jsimd_h2v2_merged_upsample_dspr2 + * jsimd_h2v2_extrgb_merged_upsample_dspr2 + * jsimd_h2v2_extrgbx_merged_upsample_dspr2 + * jsimd_h2v2_extbgr_merged_upsample_dspr2 + * jsimd_h2v2_extbgrx_merged_upsample_dspr2 + * jsimd_h2v2_extxbgr_merged_upsample_dspr2 + * jsimd_h2v2_extxrgb_merged_upsample_dspr2 + * + * Merged h2v2 upsample routines + */ +.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \ + r1_offs, g1_offs, \ + b1_offs, a1_offs, \ + r2_offs, g2_offs, \ + b2_offs, a2_offs + +.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \ + scratch5 outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + sb \scratch3, \r2_offs(\outptr) + sb \scratch4, \g2_offs(\outptr) + sb \scratch5, \b2_offs(\outptr) +.if (\pixel_size == 8) + li \scratch0, 0xFF + sb \scratch0, \a1_offs(\outptr) + sb \scratch0, \a2_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) +.endif +.endm + +LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) +/* + * a0 = cinfo->output_width + * a1 = input_buf + * a2 = in_row_group_ctr + * a3 = output_buf + * 16(sp) = cinfo->sample_range_limit + */ + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + lw t9, 56(sp) /* 
cinfo->sample_range_limit */ + lw v0, 0(a1) + lw v1, 4(a1) + lw t0, 8(a1) + sll t1, a2, 3 + addiu t2, t1, 4 + sll t3, a2, 2 + lw t4, 0(a3) /* t4 = output_buf[0] */ + lwx t1, t1(v0) /* t1 = input_buf[0][in_row_group_ctr*2] */ + lwx t2, t2(v0) /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */ + lwx t5, t3(v1) /* t5 = input_buf[1][in_row_group_ctr] */ + lwx t6, t3(t0) /* t6 = input_buf[2][in_row_group_ctr] */ + lw t7, 4(a3) /* t7 = output_buf[1] */ + li s1, 0xe6ea + addiu t8, s1, 0x7fff /* t8 = 0x166e9 [FIX(1.40200)] */ + addiu s0, t8, 0x5eb9 /* s0 = 0x1c5a2 [FIX(1.77200)] */ + addiu s1, zero, 0xa7e6 /* s1 = 0xffffa7e6 [-FIX(0.34414)] */ + xori s2, s1, 0xeec8 /* s2 = 0xffff492e [-FIX(0.71414)] */ + srl t3, a0, 1 + blez t3, 2f + addu t0, t5, t3 /* t0 = end address */ + 1: + lbu t3, 0(t5) + lbu s3, 0(t6) + addiu t5, t5, 1 + addiu t3, t3, -128 /* (cb - 128) */ + addiu s3, s3, -128 /* (cr - 128) */ + mult $ac1, s1, t3 + madd $ac1, s2, s3 + sll s3, s3, 15 + sll t3, t3, 15 + mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */ + extr_r.w s5, $ac1, 16 + mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */ + lbu v0, 0(t1) + addiu t6, t6, 1 + addiu t1, t1, 2 + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu AT, 0(t3) + lbu s7, 0(s3) + lbu ra, 0(v1) + lbu v0, -1(t1) + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + lbu v0, 0(t2) + + STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 + + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu AT, 0(t3) + lbu s7, 0(s3) + lbu ra, 0(v1) + lbu 
v0, 1(t2) + addiu t2, t2, 2 + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + + STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7 + + bne t0, t5, 1b + nop +2: + andi t0, a0, 1 + beqz t0, 4f + lbu t3, 0(t5) + lbu s3, 0(t6) + addiu t3, t3, -128 /* (cb - 128) */ + addiu s3, s3, -128 /* (cr - 128) */ + mult $ac1, s1, t3 + madd $ac1, s2, s3 + sll s3, s3, 15 + sll t3, t3, 15 + lbu v0, 0(t1) + extr_r.w s5, $ac1, 16 + mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */ + mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */ + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + lbu v0, 0(t2) + + STORE_H2V2_1_PIXEL t3, s3, v1, t4 + + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + + STORE_H2V2_1_PIXEL t3, s3, v1, t7 +4: + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + j ra + nop + +END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) + +.purgem STORE_H2V2_1_PIXEL +.purgem STORE_H2V2_2_PIXELS +.endm + +/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 
2, 3, 0, 5, 6, 7, 4 + + +/*****************************************************************************/ +/* + * jsimd_h2v1_merged_upsample_dspr2 + * jsimd_h2v1_extrgb_merged_upsample_dspr2 + * jsimd_h2v1_extrgbx_merged_upsample_dspr2 + * jsimd_h2v1_extbgr_merged_upsample_dspr2 + * jsimd_h2v1_extbgrx_merged_upsample_dspr2 + * jsimd_h2v1_extxbgr_merged_upsample_dspr2 + * jsimd_h2v1_extxrgb_merged_upsample_dspr2 + * + * Merged h2v1 upsample routines + */ + +.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \ + r1_offs, g1_offs, \ + b1_offs, a1_offs, \ + r2_offs, g2_offs, \ + b2_offs, a2_offs + +.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \ + scratch5 outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + sb \scratch3, \r2_offs(\outptr) + sb \scratch4, \g2_offs(\outptr) + sb \scratch5, \b2_offs(\outptr) +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) + sb t0, \a2_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) +.endif +.endm + +LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) +/* + * a0 = cinfo->output_width + * a1 = input_buf + * a2 = in_row_group_ctr + * a3 = output_buf + * 16(sp) = range_limit + */ + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + li t0, 0xe6ea + lw t1, 0(a1) /* t1 = input_buf[0] */ + lw t2, 4(a1) /* t2 = input_buf[1] */ + lw t3, 8(a1) /* t3 = input_buf[2] */ + lw t8, 56(sp) /* t8 = range_limit */ + addiu s1, t0, 0x7fff /* s1 = 0x166e9 [FIX(1.40200)] */ + addiu s2, s1, 0x5eb9 /* s2 = 0x1c5a2 [FIX(1.77200)] */ + addiu s0, t0, 0x9916 /* s0 = 0x8000 */ + addiu s4, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */ + xori s3, s4, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */ + srl 
t0, a0, 1 + sll t4, a2, 2 + lwx s5, t4(t1) /* s5 = inptr0 */ + lwx s6, t4(t2) /* s6 = inptr1 */ + lwx s7, t4(t3) /* s7 = inptr2 */ + lw t7, 0(a3) /* t7 = outptr */ + blez t0, 2f + addu t9, s6, t0 /* t9 = end address */ +1: + lbu t2, 0(s6) /* t2 = cb */ + lbu t0, 0(s7) /* t0 = cr */ + lbu t1, 0(s5) /* t1 = y */ + addiu t2, t2, -128 /* t2 = cb - 128 */ + addiu t0, t0, -128 /* t0 = cr - 128 */ + mult $ac1, s4, t2 + madd $ac1, s3, t0 + sll t0, t0, 15 + sll t2, t2, 15 + mulq_rs.w t0, s1, t0 /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */ + extr_r.w t5, $ac1, 16 + mulq_rs.w t6, s2, t2 /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */ + addiu s7, s7, 1 + addiu s6, s6, 1 + addu t2, t1, t0 /* t2 = y + cred */ + addu t3, t1, t5 /* t3 = y + cgreen */ + addu t4, t1, t6 /* t4 = y + cblue */ + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t1, 1(s5) + lbu v0, 0(t2) + lbu v1, 0(t3) + lbu ra, 0(t4) + addu t2, t1, t0 + addu t3, t1, t5 + addu t4, t1, t6 + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t2, 0(t2) + lbu t3, 0(t3) + lbu t4, 0(t4) + + STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7 + + bne t9, s6, 1b + addiu s5, s5, 2 +2: + andi t0, a0, 1 + beqz t0, 4f + nop +3: + lbu t2, 0(s6) + lbu t0, 0(s7) + lbu t1, 0(s5) + addiu t2, t2, -128 /* (cb - 128) */ + addiu t0, t0, -128 /* (cr - 128) */ + mul t3, s4, t2 + mul t4, s3, t0 + sll t0, t0, 15 + sll t2, t2, 15 + mulq_rs.w t0, s1, t0 /* (C1*cr + ONE_HALF)>> SCALEBITS */ + mulq_rs.w t6, s2, t2 /* (C2*cb + ONE_HALF)>> SCALEBITS */ + addu t3, t3, s0 + addu t3, t4, t3 + sra t5, t3, 16 /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */ + addu t2, t1, t0 /* y + cred */ + addu t3, t1, t5 /* y + cgreen */ + addu t4, t1, t6 /* y + cblue */ + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t2, 0(t2) + lbu t3, 0(t3) + lbu t4, 0(t4) + + STORE_H2V1_1_PIXEL t2, t3, t4, t7 +4: + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + j ra + nop + +END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) + +.purgem 
STORE_H2V1_1_PIXEL +.purgem STORE_H2V1_2_PIXELS +.endm + +/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 + + +/*****************************************************************************/ +/* + * jsimd_h2v2_fancy_upsample_dspr2 + * + * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. + */ +LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) +/* + * a0 = cinfo->max_v_samp_factor + * a1 = downsampled_width + * a2 = input_data + * a3 = output_data_ptr + */ + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 + + li s4, 0 + lw s2, 0(a3) /* s2 = *output_data_ptr */ +0: + li t9, 2 + lw s1, -4(a2) /* s1 = inptr1 */ + +1: + lw s0, 0(a2) /* s0 = inptr0 */ + lwx s3, s4(s2) + addiu s5, a1, -2 /* s5 = downsampled_width - 2 */ + srl t4, s5, 1 + sll t4, t4, 1 + lbu t0, 0(s0) + lbu t1, 1(s0) + lbu t2, 0(s1) + lbu t3, 1(s1) + addiu s0, 2 + addiu s1, 2 + addu t8, s0, t4 /* t8 = end address */ + andi s5, s5, 1 /* s5 = residual */ + sll t4, t0, 1 + sll t6, t1, 1 + addu t0, t0, t4 /* t0 = (*inptr0++) * 3 */ + addu t1, t1, t6 /* t1 = (*inptr0++) * 3 */ + addu t7, t0, t2 /* t7 = thiscolsum */ + addu t6, t1, t3 /* t6 = nextcolsum */ + sll t0, t7, 2 /* t0 = thiscolsum * 4 */ + subu t1, t0, t7 /* t1 = thiscolsum * 3 */ + shra_r.w t0, t0, 4 + addiu t1, 7 + addu t1, t1, t6 + srl t1, t1, 4 + sb t0, 0(s3) + sb t1, 1(s3) + beq t8, s0, 22f /* skip to final iteration if width == 3 */ + addiu s3, 2 +2: + lh t0, 0(s0) /* t0 = A3|A2 */ + lh t2, 0(s1) /* t2 = B3|B2 */ + addiu s0, 2 + addiu s1, 2 + preceu.ph.qbr t0, t0 /* t0 = 0|A3|0|A2 */ 
+ preceu.ph.qbr t2, t2 /* t2 = 0|B3|0|B2 */ + shll.ph t1, t0, 1 + sll t3, t6, 1 + addu.ph t0, t1, t0 /* t0 = A3*3|A2*3 */ + addu t3, t3, t6 /* t3 = this * 3 */ + addu.ph t0, t0, t2 /* t0 = next2|next1 */ + addu t1, t3, t7 + andi t7, t0, 0xFFFF /* t7 = next1 */ + sll t2, t7, 1 + addu t2, t7, t2 /* t2 = next1*3 */ + addu t4, t2, t6 + srl t6, t0, 16 /* t6 = next2 */ + shra_r.w t1, t1, 4 /* t1 = (this*3 + last + 8) >> 4 */ + addu t0, t3, t7 + addiu t0, 7 + srl t0, t0, 4 /* t0 = (this*3 + next1 + 7) >> 4 */ + shra_r.w t4, t4, 4 /* t4 = (next1*3 + this + 8) >> 4 */ + addu t2, t2, t6 + addiu t2, 7 + srl t2, t2, 4 /* t2 = (next1*3 + next2 + 7) >> 4 */ + sb t1, 0(s3) + sb t0, 1(s3) + sb t4, 2(s3) + sb t2, 3(s3) + bne t8, s0, 2b + addiu s3, 4 +22: + beqz s5, 4f + addu t8, s0, s5 +3: + lbu t0, 0(s0) + lbu t2, 0(s1) + addiu s0, 1 + addiu s1, 1 + sll t3, t6, 1 + sll t1, t0, 1 + addu t1, t0, t1 /* t1 = inptr0 * 3 */ + addu t3, t3, t6 /* t3 = thiscolsum * 3 */ + addu t5, t1, t2 + addu t1, t3, t7 + shra_r.w t1, t1, 4 + addu t0, t3, t5 + addiu t0, 7 + srl t0, t0, 4 + sb t1, 0(s3) + sb t0, 1(s3) + addiu s3, 2 + move t7, t6 + bne t8, s0, 3b + move t6, t5 +4: + sll t0, t6, 2 /* t0 = thiscolsum * 4 */ + subu t1, t0, t6 /* t1 = thiscolsum * 3 */ + addu t1, t1, t7 + addiu s4, 4 + shra_r.w t1, t1, 4 + addiu t0, 7 + srl t0, t0, 4 + sb t1, 0(s3) + sb t0, 1(s3) + addiu t9, -1 + addiu s3, 2 + bnez t9, 1b + lw s1, 4(a2) + srl t0, s4, 2 + subu t0, a0, t0 + bgtz t0, 0b + addiu a2, 4 + + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 + + j ra + nop +END(jsimd_h2v2_fancy_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) +/* + * a0 = cinfo->max_v_samp_factor + * a1 = downsampled_width + * a2 = input_data + * a3 = output_data_ptr + */ + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + .set at + + beqz a0, 3f + sll t0, a0, 2 + lw s1, 0(a3) + li s3, 0x10001 + addu s0, s1, t0 +0: + addiu t8, a1, -2 + srl t9, 
t8, 2 + lw t7, 0(a2) + lw s2, 0(s1) + lbu t0, 0(t7) + lbu t1, 1(t7) /* t1 = inptr[1] */ + sll t2, t0, 1 + addu t2, t2, t0 /* t2 = invalue*3 */ + addu t2, t2, t1 + shra_r.w t2, t2, 2 + sb t0, 0(s2) + sb t2, 1(s2) + beqz t9, 11f + addiu s2, 2 +1: + ulw t0, 0(t7) /* t0 = |P3|P2|P1|P0| */ + ulw t1, 1(t7) + ulh t2, 4(t7) /* t2 = |0|0|P5|P4| */ + preceu.ph.qbl t3, t0 /* t3 = |0|P3|0|P2| */ + preceu.ph.qbr t0, t0 /* t0 = |0|P1|0|P0| */ + preceu.ph.qbr t2, t2 /* t2 = |0|P5|0|P4| */ + preceu.ph.qbl t4, t1 /* t4 = |0|P4|0|P3| */ + preceu.ph.qbr t1, t1 /* t1 = |0|P2|0|P1| */ + shll.ph t5, t4, 1 + shll.ph t6, t1, 1 + addu.ph t5, t5, t4 /* t5 = |P4*3|P3*3| */ + addu.ph t6, t6, t1 /* t6 = |P2*3|P1*3| */ + addu.ph t4, t3, s3 + addu.ph t0, t0, s3 + addu.ph t4, t4, t5 + addu.ph t0, t0, t6 + shrl.ph t4, t4, 2 /* t4 = |0|P3|0|P2| */ + shrl.ph t0, t0, 2 /* t0 = |0|P1|0|P0| */ + addu.ph t2, t2, t5 + addu.ph t3, t3, t6 + shra_r.ph t2, t2, 2 /* t2 = |0|P5|0|P4| */ + shra_r.ph t3, t3, 2 /* t3 = |0|P3|0|P2| */ + shll.ph t2, t2, 8 + shll.ph t3, t3, 8 + or t2, t4, t2 + or t3, t3, t0 + addiu t9, -1 + usw t3, 0(s2) + usw t2, 4(s2) + addiu s2, 8 + bgtz t9, 1b + addiu t7, 4 +11: + andi t8, 3 + beqz t8, 22f + addiu t7, 1 + +2: + lbu t0, 0(t7) + addiu t7, 1 + sll t1, t0, 1 + addu t2, t0, t1 /* t2 = invalue*3 */ + lbu t3, -2(t7) + lbu t4, 0(t7) + addiu t3, 1 + addiu t4, 2 + addu t3, t3, t2 + addu t4, t4, t2 + srl t3, 2 + srl t4, 2 + sb t3, 0(s2) + sb t4, 1(s2) + addiu t8, -1 + bgtz t8, 2b + addiu s2, 2 + +22: + lbu t0, 0(t7) + lbu t2, -1(t7) + sll t1, t0, 1 + addu t1, t1, t0 /* t1 = invalue * 3 */ + addu t1, t1, t2 + addiu t1, 1 + srl t1, t1, 2 + sb t1, 0(s2) + sb t0, 1(s2) + addiu s1, 4 + bne s1, s0, 0b + addiu a2, 4 +3: + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop +END(jsimd_h2v1_fancy_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v1_downsample_dspr2) +/* + * a0 = cinfo->image_width + * a1 = 
cinfo->max_v_samp_factor + * a2 = compptr->v_samp_factor + * a3 = compptr->width_in_blocks + * 16(sp) = input_data + * 20(sp) = output_data + */ + .set at + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 + + beqz a2, 7f + lw s1, 44(sp) /* s1 = output_data */ + lw s0, 40(sp) /* s0 = input_data */ + srl s2, a0, 2 + andi t9, a0, 2 + srl t7, t9, 1 + addu s2, t7, s2 + sll t0, a3, 3 /* t0 = width_in_blocks*DCT */ + srl t7, t0, 1 + subu s2, t7, s2 +0: + andi t6, a0, 1 /* t6 = temp_index */ + addiu t6, -1 + lw t4, 0(s1) /* t4 = outptr */ + lw t5, 0(s0) /* t5 = inptr0 */ + li s3, 0 /* s3 = bias */ + srl t7, a0, 1 /* t7 = image_width1 */ + srl s4, t7, 2 + andi t8, t7, 3 +1: + ulhu t0, 0(t5) + ulhu t1, 2(t5) + ulhu t2, 4(t5) + ulhu t3, 6(t5) + raddu.w.qb t0, t0 + raddu.w.qb t1, t1 + raddu.w.qb t2, t2 + raddu.w.qb t3, t3 + shra.ph t0, t0, 1 + shra_r.ph t1, t1, 1 + shra.ph t2, t2, 1 + shra_r.ph t3, t3, 1 + sb t0, 0(t4) + sb t1, 1(t4) + sb t2, 2(t4) + sb t3, 3(t4) + addiu s4, -1 + addiu t4, 4 + bgtz s4, 1b + addiu t5, 8 + beqz t8, 3f + addu s4, t4, t8 +2: + ulhu t0, 0(t5) + raddu.w.qb t0, t0 + addqh.w t0, t0, s3 + xori s3, s3, 1 + sb t0, 0(t4) + addiu t4, 1 + bne t4, s4, 2b + addiu t5, 2 +3: + lbux t1, t6(t5) + sll t1, 1 + addqh.w t2, t1, s3 /* t2 = pixval1 */ + xori s3, s3, 1 + addqh.w t3, t1, s3 /* t3 = pixval2 */ + blez s2, 5f + append t3, t2, 8 + addu t5, t4, s2 /* t5 = loop_end2 */ +4: + ush t3, 0(t4) + addiu s2, -1 + bgtz s2, 4b + addiu t4, 2 +5: + beqz t9, 6f + nop + sb t2, 0(t4) +6: + addiu s1, 4 + addiu a2, -1 + bnez a2, 0b + addiu s0, 4 +7: + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 + + j ra + nop +END(jsimd_h2v1_downsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v2_downsample_dspr2) +/* + * a0 = cinfo->image_width + * a1 = cinfo->max_v_samp_factor + * a2 = compptr->v_samp_factor + * a3 = compptr->width_in_blocks + * 16(sp) = input_data + * 20(sp) = output_data + */ + .set at + + 
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + beqz a2, 8f + lw s1, 52(sp) /* s1 = output_data */ + lw s0, 48(sp) /* s0 = input_data */ + + andi t6, a0, 1 /* t6 = temp_index */ + addiu t6, -1 + srl t7, a0, 1 /* t7 = image_width1 */ + srl s4, t7, 2 + andi t8, t7, 3 + andi t9, a0, 2 + srl s2, a0, 2 + srl t7, t9, 1 + addu s2, t7, s2 + sll t0, a3, 3 /* t0 = width_in_blocks*DCT */ + srl t7, t0, 1 + subu s2, t7, s2 +0: + lw t4, 0(s1) /* t4 = outptr */ + lw t5, 0(s0) /* t5 = inptr0 */ + lw s7, 4(s0) /* s7 = inptr1 */ + li s6, 1 /* s6 = bias */ +2: + ulw t0, 0(t5) /* t0 = |P3|P2|P1|P0| */ + ulw t1, 0(s7) /* t1 = |Q3|Q2|Q1|Q0| */ + ulw t2, 4(t5) + ulw t3, 4(s7) + precrq.ph.w t7, t0, t1 /* t7 = |P3|P2|Q3|Q2| */ + ins t0, t1, 16, 16 /* t0 = |Q1|Q0|P1|P0| */ + raddu.w.qb t1, t7 + raddu.w.qb t0, t0 + shra_r.w t1, t1, 2 + addiu t0, 1 + srl t0, 2 + precrq.ph.w t7, t2, t3 + ins t2, t3, 16, 16 + raddu.w.qb t7, t7 + raddu.w.qb t2, t2 + shra_r.w t7, t7, 2 + addiu t2, 1 + srl t2, 2 + sb t0, 0(t4) + sb t1, 1(t4) + sb t2, 2(t4) + sb t7, 3(t4) + addiu t4, 4 + addiu t5, 8 + addiu s4, s4, -1 + bgtz s4, 2b + addiu s7, 8 + beqz t8, 4f + addu t8, t4, t8 +3: + ulhu t0, 0(t5) + ulhu t1, 0(s7) + ins t0, t1, 16, 16 + raddu.w.qb t0, t0 + addu t0, t0, s6 + srl t0, 2 + xori s6, s6, 3 + sb t0, 0(t4) + addiu t5, 2 + addiu t4, 1 + bne t8, t4, 3b + addiu s7, 2 +4: + lbux t1, t6(t5) + sll t1, 1 + lbux t0, t6(s7) + sll t0, 1 + addu t1, t1, t0 + addu t3, t1, s6 + srl t0, t3, 2 /* t0 = pixval1 */ + xori s6, s6, 3 + addu t2, t1, s6 + srl t1, t2, 2 /* t1 = pixval2 */ + blez s2, 6f + append t1, t0, 8 +5: + ush t1, 0(t4) + addiu s2, -1 + bgtz s2, 5b + addiu t4, 2 +6: + beqz t9, 7f + nop + sb t0, 0(t4) +7: + addiu s1, 4 + addiu a2, -1 + bnez a2, 0b + addiu s0, 8 +8: + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_h2v2_downsample_dspr2) + + +/*****************************************************************************/ 
+LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2) +/* + * a0 = input_data + * a1 = output_data + * a2 = compptr->v_samp_factor + * a3 = cinfo->max_v_samp_factor + * 16(sp) = cinfo->smoothing_factor + * 20(sp) = compptr->width_in_blocks + * 24(sp) = cinfo->image_width + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw s7, 52(sp) /* compptr->width_in_blocks */ + lw s0, 56(sp) /* cinfo->image_width */ + lw s6, 48(sp) /* cinfo->smoothing_factor */ + sll s7, 3 /* output_cols = width_in_blocks * DCTSIZE */ + sll v0, s7, 1 + subu v0, v0, s0 + blez v0, 2f + move v1, zero + addiu t0, a3, 2 /* t0 = cinfo->max_v_samp_factor + 2 */ +0: + addiu t1, a0, -4 + sll t2, v1, 2 + lwx t1, t2(t1) + move t3, v0 + addu t1, t1, s0 + lbu t2, -1(t1) +1: + addiu t3, t3, -1 + sb t2, 0(t1) + bgtz t3, 1b + addiu t1, t1, 1 + addiu v1, v1, 1 + bne v1, t0, 0b + nop +2: + li v0, 80 + mul v0, s6, v0 + li v1, 16384 + move t4, zero + move t5, zero + subu t6, v1, v0 /* t6 = 16384 - tmp_smoot_f * 80 */ + sll t7, s6, 4 /* t7 = tmp_smoot_f * 16 */ +3: +/* Special case for first column: pretend column -1 is same as column 0 */ + sll v0, t4, 2 + lwx t8, v0(a1) /* outptr = output_data[outrow] */ + sll v1, t5, 2 + addiu t9, v1, 4 + addiu s0, v1, -4 + addiu s1, v1, 8 + lwx s2, v1(a0) /* inptr0 = input_data[inrow] */ + lwx t9, t9(a0) /* inptr1 = input_data[inrow+1] */ + lwx s0, s0(a0) /* above_ptr = input_data[inrow-1] */ + lwx s1, s1(a0) /* below_ptr = input_data[inrow+2] */ + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, 0(s2) + lbu v1, 2(s2) + lbu t0, 0(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, 0(s0) + lbu t0, 0(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w v0, $ac1, 16 + addiu t8, t8, 1 + addiu 
s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + addiu s1, s1, 2 + sb v0, -1(t8) + addiu s4, s7, -2 + and s4, s4, 3 + addu s5, s4, t8 /* end address */ +4: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + addiu t8, t8, 1 + addiu s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + sb t2, -1(t8) + bne s5, t8, 4b + addiu s1, s1, 2 + addiu s5, s7, -2 + subu s5, s5, s4 + addu s5, s5, t8 /* end address */ +5: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + lh v1, 2(t9) + addu t0, t0, v0 + lh v0, 2(s2) + addu s3, t0, s3 + lh t0, 2(s0) + lh t1, 2(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 4(s2) + lbu t0, 1(t9) + lbu t1, 4(t9) + sb t2, 0(t8) + raddu.w.qb t3, v0 + lbu v0, 1(s2) + addu t0, t0, t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 4(s0) + addu t0, t0, v0 + lbu v0, 1(s0) + addu s3, t0, s3 + lbu t0, 1(s1) + lbu t3, 4(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 4(t9) + addu t0, t0, v0 + lh v0, 4(s2) + addu s3, t0, s3 + lh t0, 4(s0) + lh t1, 4(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 6(s2) + 
lbu t0, 3(t9) + lbu t1, 6(t9) + sb t2, 1(t8) + raddu.w.qb t3, v0 + lbu v0, 3(s2) + addu t0, t0, t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 6(s0) + addu t0, t0, v0 + lbu v0, 3(s0) + addu s3, t0, s3 + lbu t0, 3(s1) + lbu t3, 6(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 6(t9) + addu t0, t0, v0 + lh v0, 6(s2) + addu s3, t0, s3 + lh t0, 6(s0) + lh t1, 6(s1) + madd $ac1, s3, t7 + extr_r.w t3, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 8(s2) + lbu t0, 5(t9) + lbu t1, 8(t9) + sb t3, 2(t8) + raddu.w.qb t2, v0 + lbu v0, 5(s2) + addu t0, t0, t1 + mult $ac1, t2, t6 + addu v0, v0, v1 + lbu t2, 8(s0) + addu t0, t0, v0 + lbu v0, 5(s0) + addu s3, t0, s3 + lbu t0, 5(s1) + lbu t3, 8(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + addiu t8, t8, 4 + addu t0, t0, v0 + addiu s2, s2, 8 + addu s3, t0, s3 + addiu t9, t9, 8 + madd $ac1, s3, t7 + extr_r.w t1, $ac1, 16 + addiu s0, s0, 8 + addiu s1, s1, 8 + bne s5, t8, 5b + sb t1, -1(t8) +/* Special case for last column */ + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 1(s2) + lbu t0, -1(t9) + lbu t1, 1(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 1(s0) + addu t0, t0, v0 + lbu t3, 1(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t0, $ac1, 16 + addiu t5, t5, 2 + sb t0, 0(t8) + addiu t4, t4, 1 + bne t4, a2, 3b + addiu t5, t5, 2 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_h2v2_smooth_downsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_int_upsample_dspr2) +/* + * a0 = upsample->h_expand[compptr->component_index] + * a1 = upsample->v_expand[compptr->component_index] + * a2 = input_data + 
* a3 = output_data_ptr + * 16(sp) = cinfo->output_width + * 20(sp) = cinfo->max_v_samp_factor + */ + .set at + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + lw s0, 0(a3) /* s0 = output_data */ + lw s1, 32(sp) /* s1 = cinfo->output_width */ + lw s2, 36(sp) /* s2 = cinfo->max_v_samp_factor */ + li t6, 0 /* t6 = inrow */ + beqz s2, 10f + li s3, 0 /* s3 = outrow */ +0: + addu t0, a2, t6 + addu t7, s0, s3 + lw t3, 0(t0) /* t3 = inptr */ + lw t8, 0(t7) /* t8 = outptr */ + beqz s1, 4f + addu t5, t8, s1 /* t5 = outend */ +1: + lb t2, 0(t3) /* t2 = invalue = *inptr++ */ + addiu t3, 1 + beqz a0, 3f + move t0, a0 /* t0 = h_expand */ +2: + sb t2, 0(t8) + addiu t0, -1 + bgtz t0, 2b + addiu t8, 1 +3: + bgt t5, t8, 1b + nop +4: + addiu t9, a1, -1 /* t9 = v_expand - 1 */ + blez t9, 9f + nop +5: + lw t3, 0(s0) + lw t4, 4(s0) + subu t0, s1, 0xF + blez t0, 7f + addu t5, t3, s1 /* t5 = end address */ + andi t7, s1, 0xF /* t7 = residual */ + subu t8, t5, t7 +6: + ulw t0, 0(t3) + ulw t1, 4(t3) + ulw t2, 8(t3) + usw t0, 0(t4) + ulw t0, 12(t3) + usw t1, 4(t4) + usw t2, 8(t4) + usw t0, 12(t4) + addiu t3, 16 + bne t3, t8, 6b + addiu t4, 16 + beqz t7, 8f + nop +7: + lbu t0, 0(t3) + sb t0, 0(t4) + addiu t3, 1 + bne t3, t5, 7b + addiu t4, 1 +8: + addiu t9, -1 + bgtz t9, 5b + addiu s0, 8 +9: + addu s3, s3, a1 + bne s3, s2, 0b + addiu t6, 1 +10: + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop +END(jsimd_int_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v1_upsample_dspr2) +/* + * a0 = cinfo->max_v_samp_factor + * a1 = cinfo->output_width + * a2 = input_data + * a3 = output_data_ptr + */ + lw t7, 0(a3) /* t7 = output_data */ + andi t8, a1, 0xf /* t8 = residual */ + sll t0, a0, 2 + blez a0, 4f + addu t9, t7, t0 /* t9 = output_data end address */ +0: + lw t5, 0(t7) /* t5 = outptr */ + lw t6, 0(a2) /* t6 = inptr */ + addu t3, t5, a1 /* t3 = outptr + output_width (end address) */ + subu t3, t8 /* t3 = end address - 
residual */ + beq t5, t3, 2f + move t4, t8 +1: + ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */ + ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */ + srl t1, t0, 16 /* t1 = |X|X|P3|P2| */ + ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */ + ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */ + ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */ + ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */ + usw t0, 0(t5) + usw t1, 4(t5) + srl t0, t2, 16 /* t0 = |X|X|P7|P6| */ + ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */ + ins t0, t0, 16, 16 /* t0 = |P7|P6|P7|P6| */ + ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */ + ins t0, t0, 8, 16 /* t0 = |P7|P7|P6|P6| */ + usw t2, 8(t5) + usw t0, 12(t5) + addiu t5, 16 + bne t5, t3, 1b + addiu t6, 8 + beqz t8, 3f + move t4, t8 +2: + lbu t1, 0(t6) + sb t1, 0(t5) + sb t1, 1(t5) + addiu t4, -2 + addiu t6, 1 + bgtz t4, 2b + addiu t5, 2 +3: + addiu t7, 4 + bne t9, t7, 0b + addiu a2, 4 +4: + j ra + nop +END(jsimd_h2v1_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v2_upsample_dspr2) +/* + * a0 = cinfo->max_v_samp_factor + * a1 = cinfo->output_width + * a2 = input_data + * a3 = output_data_ptr + */ + lw t7, 0(a3) + blez a0, 7f + andi t9, a1, 0xf /* t9 = residual */ +0: + lw t6, 0(a2) /* t6 = inptr */ + lw t5, 0(t7) /* t5 = outptr */ + addu t8, t5, a1 /* t8 = outptr end address */ + subu t8, t9 /* t8 = end address - residual */ + beq t5, t8, 2f + move t4, t9 +1: + ulw t0, 0(t6) + srl t1, t0, 16 + ins t0, t0, 16, 16 + ins t0, t0, 8, 16 + ins t1, t1, 16, 16 + ins t1, t1, 8, 16 + ulw t2, 4(t6) + usw t0, 0(t5) + usw t1, 4(t5) + srl t3, t2, 16 + ins t2, t2, 16, 16 + ins t2, t2, 8, 16 + ins t3, t3, 16, 16 + ins t3, t3, 8, 16 + usw t2, 8(t5) + usw t3, 12(t5) + addiu t5, 16 + bne t5, t8, 1b + addiu t6, 8 + beqz t9, 3f + move t4, t9 +2: + lbu t0, 0(t6) + sb t0, 0(t5) + sb t0, 1(t5) + addiu t4, -2 + addiu t6, 1 + bgtz t4, 2b + addiu t5, 2 +3: + lw t6, 0(t7) /* t6 = outptr[0] */ + lw t5, 4(t7) /* t5 = outptr[1] */ + addu t4, t6, a1 /* 
t4 = new end address */ + beq a1, t9, 5f + subu t8, t4, t9 +4: + ulw t0, 0(t6) + ulw t1, 4(t6) + ulw t2, 8(t6) + usw t0, 0(t5) + ulw t0, 12(t6) + usw t1, 4(t5) + usw t2, 8(t5) + usw t0, 12(t5) + addiu t6, 16 + bne t6, t8, 4b + addiu t5, 16 + beqz t9, 6f + nop +5: + lbu t0, 0(t6) + sb t0, 0(t5) + addiu t6, 1 + bne t6, t4, 5b + addiu t5, 1 +6: + addiu t7, 8 + addiu a0, -2 + bgtz a0, 0b + addiu a2, 4 +7: + j ra + nop +END(jsimd_h2v2_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_islow_dspr2) +/* + * a0 = coef_block + * a1 = compptr->dcttable + * a2 = output + * a3 = range_limit + */ + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu sp, sp, -256 + move v0, sp + addiu v1, zero, 8 /* v1 = DCTSIZE = 8 */ +1: + lh s4, 32(a0) /* s4 = inptr[16] */ + lh s5, 64(a0) /* s5 = inptr[32] */ + lh s6, 96(a0) /* s6 = inptr[48] */ + lh t1, 112(a0) /* t1 = inptr[56] */ + lh t7, 16(a0) /* t7 = inptr[8] */ + lh t5, 80(a0) /* t5 = inptr[40] */ + lh t3, 48(a0) /* t3 = inptr[24] */ + or s4, s4, t1 + or s4, s4, t3 + or s4, s4, t5 + or s4, s4, t7 + or s4, s4, s5 + or s4, s4, s6 + bnez s4, 2f + addiu v1, v1, -1 + lh s5, 0(a1) /* quantptr[DCTSIZE*0] */ + lh s6, 0(a0) /* inptr[DCTSIZE*0] */ + mul s5, s5, s6 /* DEQUANTIZE(inptr[0], quantptr[0]) */ + sll s5, s5, 2 + sw s5, 0(v0) + sw s5, 32(v0) + sw s5, 64(v0) + sw s5, 96(v0) + sw s5, 128(v0) + sw s5, 160(v0) + sw s5, 192(v0) + b 3f + sw s5, 224(v0) +2: + lh t0, 112(a1) + lh t2, 48(a1) + lh t4, 80(a1) + lh t6, 16(a1) + mul t0, t0, t1 /* DEQUANTIZE(inptr[DCTSIZE*7], + quantptr[DCTSIZE*7]) */ + mul t1, t2, t3 /* DEQUANTIZE(inptr[DCTSIZE*3], + quantptr[DCTSIZE*3]) */ + mul t2, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*5], + quantptr[DCTSIZE*5]) */ + mul t3, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*1], + quantptr[DCTSIZE*1]) */ + lh t4, 32(a1) + lh t5, 32(a0) + lh t6, 96(a1) + lh t7, 96(a0) + addu s0, t0, t1 /* z3 = tmp0 + tmp2 */ + addu s1, t1, t2 /* z2 = tmp1 + tmp2 */ + 
addu s2, t2, t3 /* z4 = tmp1 + tmp3 */ + addu s3, s0, s2 /* z3 + z4 */ + addiu t9, zero, 9633 /* FIX_1_175875602 */ + mul s3, s3, t9 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + addu t8, t0, t3 /* z1 = tmp0 + tmp3 */ + addiu t9, zero, 2446 /* FIX_0_298631336 */ + mul t0, t0, t9 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + addiu t9, zero, 16819 /* FIX_2_053119869 */ + mul t2, t2, t9 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + addiu t9, zero, 25172 /* FIX_3_072711026 */ + mul t1, t1, t9 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + addiu t9, zero, 12299 /* FIX_1_501321110 */ + mul t3, t3, t9 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + addiu t9, zero, 16069 /* FIX_1_961570560 */ + mul s0, s0, t9 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */ + addiu t9, zero, 3196 /* FIX_0_390180644 */ + mul s2, s2, t9 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */ + addiu t9, zero, 7373 /* FIX_0_899976223 */ + mul t8, t8, t9 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */ + addiu t9, zero, 20995 /* FIX_2_562915447 */ + mul s1, s1, t9 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */ + subu s0, s3, s0 /* z3 += z5 */ + addu t0, t0, s0 /* tmp0 += z3 */ + addu t1, t1, s0 /* tmp2 += z3 */ + subu s2, s3, s2 /* z4 += z5 */ + addu t2, t2, s2 /* tmp1 += z4 */ + addu t3, t3, s2 /* tmp3 += z4 */ + subu t0, t0, t8 /* tmp0 += z1 */ + subu t1, t1, s1 /* tmp2 += z2 */ + subu t2, t2, s1 /* tmp1 += z2 */ + subu t3, t3, t8 /* tmp3 += z1 */ + mul s0, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*2], + quantptr[DCTSIZE*2]) */ + addiu t9, zero, 6270 /* FIX_0_765366865 */ + mul s1, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*6], + quantptr[DCTSIZE*6]) */ + lh t4, 0(a1) + lh t5, 0(a0) + lh t6, 64(a1) + lh t7, 64(a0) + mul s2, t9, s0 /* MULTIPLY(z2, FIX_0_765366865) */ + mul t5, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*0], + quantptr[DCTSIZE*0]) */ + mul t6, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*4], + quantptr[DCTSIZE*4]) */ + addiu t9, zero, 4433 /* FIX_0_541196100 */ + addu s3, s0, s1 /* z2 + z3 */ + mul s3, s3, t9 /* z1 = MULTIPLY(z2 + z3, 
FIX_0_541196100) */ + addiu t9, zero, 15137 /* FIX_1_847759065 */ + mul t8, s1, t9 /* MULTIPLY(z3, FIX_1_847759065) */ + addu t4, t5, t6 + subu t5, t5, t6 + sll t4, t4, 13 /* tmp0 = (z2 + z3) << CONST_BITS */ + sll t5, t5, 13 /* tmp1 = (z2 - z3) << CONST_BITS */ + addu t7, s3, s2 /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */ + subu t6, s3, t8 /* tmp2 = + z1 + MULTIPLY(z3, -FIX_1_847759065) */ + addu s0, t4, t7 + subu s1, t4, t7 + addu s2, t5, t6 + subu s3, t5, t6 + addu t4, s0, t3 + subu s0, s0, t3 + addu t3, s2, t1 + subu s2, s2, t1 + addu t1, s3, t2 + subu s3, s3, t2 + addu t2, s1, t0 + subu s1, s1, t0 + shra_r.w t4, t4, 11 + shra_r.w t3, t3, 11 + shra_r.w t1, t1, 11 + shra_r.w t2, t2, 11 + shra_r.w s1, s1, 11 + shra_r.w s3, s3, 11 + shra_r.w s2, s2, 11 + shra_r.w s0, s0, 11 + sw t4, 0(v0) + sw t3, 32(v0) + sw t1, 64(v0) + sw t2, 96(v0) + sw s1, 128(v0) + sw s3, 160(v0) + sw s2, 192(v0) + sw s0, 224(v0) +3: + addiu a1, a1, 2 + addiu a0, a0, 2 + bgtz v1, 1b + addiu v0, v0, 4 + move v0, sp + addiu v1, zero, 8 +4: + lw t0, 8(v0) /* z2 = (JLONG)wsptr[2] */ + lw t1, 24(v0) /* z3 = (JLONG)wsptr[6] */ + lw t2, 0(v0) /* (JLONG)wsptr[0] */ + lw t3, 16(v0) /* (JLONG)wsptr[4] */ + lw s4, 4(v0) /* (JLONG)wsptr[1] */ + lw s5, 12(v0) /* (JLONG)wsptr[3] */ + lw s6, 20(v0) /* (JLONG)wsptr[5] */ + lw s7, 28(v0) /* (JLONG)wsptr[7] */ + or s4, s4, t0 + or s4, s4, t1 + or s4, s4, t3 + or s4, s4, s7 + or s4, s4, s5 + or s4, s4, s6 + bnez s4, 5f + addiu v1, v1, -1 + shra_r.w s5, t2, 5 + andi s5, s5, 0x3ff + lbux s5, s5(a3) + lw s1, 0(a2) + replv.qb s5, s5 + usw s5, 0(s1) + usw s5, 4(s1) + b 6f + nop +5: + addu t4, t0, t1 /* z2 + z3 */ + addiu t8, zero, 4433 /* FIX_0_541196100 */ + mul t5, t4, t8 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */ + addiu t8, zero, 15137 /* FIX_1_847759065 */ + mul t1, t1, t8 /* MULTIPLY(z3, FIX_1_847759065) */ + addiu t8, zero, 6270 /* FIX_0_765366865 */ + mul t0, t0, t8 /* MULTIPLY(z2, FIX_0_765366865) */ + addu t4, t2, t3 /* (JLONG)wsptr[0] + 
(JLONG)wsptr[4] */ + subu t2, t2, t3 /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */ + sll t4, t4, 13 /* tmp0 = + (wsptr[0] + wsptr[4]) << CONST_BITS */ + sll t2, t2, 13 /* tmp1 = + (wsptr[0] - wsptr[4]) << CONST_BITS */ + subu t1, t5, t1 /* tmp2 = + z1 + MULTIPLY(z3, -FIX_1_847759065) */ + subu t3, t2, t1 /* tmp12 = tmp1 - tmp2 */ + addu t2, t2, t1 /* tmp11 = tmp1 + tmp2 */ + addu t5, t5, t0 /* tmp3 = + z1 + MULTIPLY(z2, FIX_0_765366865) */ + subu t1, t4, t5 /* tmp13 = tmp0 - tmp3 */ + addu t0, t4, t5 /* tmp10 = tmp0 + tmp3 */ + lw t4, 28(v0) /* tmp0 = (JLONG)wsptr[7] */ + lw t6, 12(v0) /* tmp2 = (JLONG)wsptr[3] */ + lw t5, 20(v0) /* tmp1 = (JLONG)wsptr[5] */ + lw t7, 4(v0) /* tmp3 = (JLONG)wsptr[1] */ + addu s0, t4, t6 /* z3 = tmp0 + tmp2 */ + addiu t8, zero, 9633 /* FIX_1_175875602 */ + addu s1, t5, t7 /* z4 = tmp1 + tmp3 */ + addu s2, s0, s1 /* z3 + z4 */ + mul s2, s2, t8 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + addu s3, t4, t7 /* z1 = tmp0 + tmp3 */ + addu t9, t5, t6 /* z2 = tmp1 + tmp2 */ + addiu t8, zero, 16069 /* FIX_1_961570560 */ + mul s0, s0, t8 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */ + addiu t8, zero, 3196 /* FIX_0_390180644 */ + mul s1, s1, t8 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */ + addiu t8, zero, 2446 /* FIX_0_298631336 */ + mul t4, t4, t8 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + addiu t8, zero, 7373 /* FIX_0_899976223 */ + mul s3, s3, t8 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */ + addiu t8, zero, 16819 /* FIX_2_053119869 */ + mul t5, t5, t8 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + addiu t8, zero, 20995 /* FIX_2_562915447 */ + mul t9, t9, t8 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */ + addiu t8, zero, 25172 /* FIX_3_072711026 */ + mul t6, t6, t8 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + addiu t8, zero, 12299 /* FIX_1_501321110 */ + mul t7, t7, t8 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + subu s0, s2, s0 /* z3 += z5 */ + subu s1, s2, s1 /* z4 += z5 */ + addu t4, t4, s0 + subu t4, t4, s3 /* tmp0 */ + addu t5, t5, s1 + subu t5, 
t5, t9 /* tmp1 */ + addu t6, t6, s0 + subu t6, t6, t9 /* tmp2 */ + addu t7, t7, s1 + subu t7, t7, s3 /* tmp3 */ + addu s0, t0, t7 + subu t0, t0, t7 + addu t7, t2, t6 + subu t2, t2, t6 + addu t6, t3, t5 + subu t3, t3, t5 + addu t5, t1, t4 + subu t1, t1, t4 + shra_r.w s0, s0, 18 + shra_r.w t7, t7, 18 + shra_r.w t6, t6, 18 + shra_r.w t5, t5, 18 + shra_r.w t1, t1, 18 + shra_r.w t3, t3, 18 + shra_r.w t2, t2, 18 + shra_r.w t0, t0, 18 + andi s0, s0, 0x3ff + andi t7, t7, 0x3ff + andi t6, t6, 0x3ff + andi t5, t5, 0x3ff + andi t1, t1, 0x3ff + andi t3, t3, 0x3ff + andi t2, t2, 0x3ff + andi t0, t0, 0x3ff + lw s1, 0(a2) + lbux s0, s0(a3) + lbux t7, t7(a3) + lbux t6, t6(a3) + lbux t5, t5(a3) + lbux t1, t1(a3) + lbux t3, t3(a3) + lbux t2, t2(a3) + lbux t0, t0(a3) + sb s0, 0(s1) + sb t7, 1(s1) + sb t6, 2(s1) + sb t5, 3(s1) + sb t1, 4(s1) + sb t3, 5(s1) + sb t2, 6(s1) + sb t0, 7(s1) +6: + addiu v0, v0, 32 + bgtz v1, 4b + addiu a2, a2, 4 + addiu sp, sp, 256 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_islow_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2) +/* + * a0 = inptr + * a1 = quantptr + * a2 = wsptr + * a3 = mips_idct_ifast_coefs + */ + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu t9, a0, 16 /* end address */ + or AT, a3, zero + +0: + lw s0, 0(a1) /* quantptr[DCTSIZE*0] */ + lw t0, 0(a0) /* inptr[DCTSIZE*0] */ + lw t1, 16(a0) /* inptr[DCTSIZE*1] */ + muleq_s.w.phl v0, t0, s0 /* tmp0 ... */ + lw t2, 32(a0) /* inptr[DCTSIZE*2] */ + lw t3, 48(a0) /* inptr[DCTSIZE*3] */ + lw t4, 64(a0) /* inptr[DCTSIZE*4] */ + lw t5, 80(a0) /* inptr[DCTSIZE*5] */ + muleq_s.w.phr t0, t0, s0 /* ... tmp0 ... */ + lw t6, 96(a0) /* inptr[DCTSIZE*6] */ + lw t7, 112(a0) /* inptr[DCTSIZE*7] */ + or s4, t1, t2 + or s5, t3, t4 + bnez s4, 1f + ins t0, v0, 16, 16 /* ... 
tmp0 */ + bnez s5, 1f + or s6, t5, t6 + or s6, s6, t7 + bnez s6, 1f + sw t0, 0(a2) /* wsptr[DCTSIZE*0] */ + sw t0, 16(a2) /* wsptr[DCTSIZE*1] */ + sw t0, 32(a2) /* wsptr[DCTSIZE*2] */ + sw t0, 48(a2) /* wsptr[DCTSIZE*3] */ + sw t0, 64(a2) /* wsptr[DCTSIZE*4] */ + sw t0, 80(a2) /* wsptr[DCTSIZE*5] */ + sw t0, 96(a2) /* wsptr[DCTSIZE*6] */ + sw t0, 112(a2) /* wsptr[DCTSIZE*7] */ + addiu a0, a0, 4 + b 2f + addiu a1, a1, 4 + +1: + lw s1, 32(a1) /* quantptr[DCTSIZE*2] */ + lw s2, 64(a1) /* quantptr[DCTSIZE*4] */ + muleq_s.w.phl v0, t2, s1 /* tmp1 ... */ + muleq_s.w.phr t2, t2, s1 /* ... tmp1 ... */ + lw s0, 16(a1) /* quantptr[DCTSIZE*1] */ + lw s1, 48(a1) /* quantptr[DCTSIZE*3] */ + lw s3, 96(a1) /* quantptr[DCTSIZE*6] */ + muleq_s.w.phl v1, t4, s2 /* tmp2 ... */ + muleq_s.w.phr t4, t4, s2 /* ... tmp2 ... */ + lw s2, 80(a1) /* quantptr[DCTSIZE*5] */ + lw t8, 4(AT) /* FIX(1.414213562) */ + ins t2, v0, 16, 16 /* ... tmp1 */ + muleq_s.w.phl v0, t6, s3 /* tmp3 ... */ + muleq_s.w.phr t6, t6, s3 /* ... tmp3 ... */ + ins t4, v1, 16, 16 /* ... tmp2 */ + addq.ph s4, t0, t4 /* tmp10 */ + subq.ph s5, t0, t4 /* tmp11 */ + ins t6, v0, 16, 16 /* ... tmp3 */ + subq.ph s6, t2, t6 /* tmp12 ... */ + addq.ph s7, t2, t6 /* tmp13 */ + mulq_s.ph s6, s6, t8 /* ... tmp12 ... */ + addq.ph t0, s4, s7 /* tmp0 */ + subq.ph t6, s4, s7 /* tmp3 */ + muleq_s.w.phl v0, t1, s0 /* tmp4 ... */ + muleq_s.w.phr t1, t1, s0 /* ... tmp4 ... */ + shll_s.ph s6, s6, 1 /* x2 */ + lw s3, 112(a1) /* quantptr[DCTSIZE*7] */ + subq.ph s6, s6, s7 /* ... tmp12 */ + muleq_s.w.phl v1, t7, s3 /* tmp7 ... */ + muleq_s.w.phr t7, t7, s3 /* ... tmp7 ... */ + ins t1, v0, 16, 16 /* ... tmp4 */ + addq.ph t2, s5, s6 /* tmp1 */ + subq.ph t4, s5, s6 /* tmp2 */ + muleq_s.w.phl v0, t5, s2 /* tmp6 ... */ + muleq_s.w.phr t5, t5, s2 /* ... tmp6 ... */ + ins t7, v1, 16, 16 /* ... tmp7 */ + addq.ph s5, t1, t7 /* z11 */ + subq.ph s6, t1, t7 /* z12 */ + muleq_s.w.phl v1, t3, s1 /* tmp5 ... */ + muleq_s.w.phr t3, t3, s1 /* ... tmp5 ... 
*/ + ins t5, v0, 16, 16 /* ... tmp6 */ + ins t3, v1, 16, 16 /* ... tmp5 */ + addq.ph s7, t5, t3 /* z13 */ + subq.ph v0, t5, t3 /* z10 */ + addq.ph t7, s5, s7 /* tmp7 */ + subq.ph s5, s5, s7 /* tmp11 ... */ + addq.ph v1, v0, s6 /* z5 ... */ + mulq_s.ph s5, s5, t8 /* ... tmp11 */ + lw t8, 8(AT) /* FIX(1.847759065) */ + lw s4, 0(AT) /* FIX(1.082392200) */ + addq.ph s0, t0, t7 + subq.ph s1, t0, t7 + mulq_s.ph v1, v1, t8 /* ... z5 */ + shll_s.ph s5, s5, 1 /* x2 */ + lw t8, 12(AT) /* FIX(-2.613125930) */ + sw s0, 0(a2) /* wsptr[DCTSIZE*0] */ + shll_s.ph v0, v0, 1 /* x4 */ + mulq_s.ph v0, v0, t8 /* tmp12 ... */ + mulq_s.ph s4, s6, s4 /* tmp10 ... */ + shll_s.ph v1, v1, 1 /* x2 */ + addiu a0, a0, 4 + addiu a1, a1, 4 + sw s1, 112(a2) /* wsptr[DCTSIZE*7] */ + shll_s.ph s6, v0, 1 /* x4 */ + shll_s.ph s4, s4, 1 /* x2 */ + addq.ph s6, s6, v1 /* ... tmp12 */ + subq.ph t5, s6, t7 /* tmp6 */ + subq.ph s4, s4, v1 /* ... tmp10 */ + subq.ph t3, s5, t5 /* tmp5 */ + addq.ph s2, t2, t5 + addq.ph t1, s4, t3 /* tmp4 */ + subq.ph s3, t2, t5 + sw s2, 16(a2) /* wsptr[DCTSIZE*1] */ + sw s3, 96(a2) /* wsptr[DCTSIZE*6] */ + addq.ph v0, t4, t3 + subq.ph v1, t4, t3 + sw v0, 32(a2) /* wsptr[DCTSIZE*2] */ + sw v1, 80(a2) /* wsptr[DCTSIZE*5] */ + addq.ph v0, t6, t1 + subq.ph v1, t6, t1 + sw v0, 64(a2) /* wsptr[DCTSIZE*4] */ + sw v1, 48(a2) /* wsptr[DCTSIZE*3] */ + +2: + bne a0, t9, 0b + addiu a2, a2, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_ifast_cols_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2) +/* + * a0 = wsptr + * a1 = output_buf + * a2 = output_col + * a3 = mips_idct_ifast_coefs + */ + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 + + addiu t9, a0, 128 /* end address */ + lui s8, 0x8080 + ori s8, s8, 0x8080 + +0: + lw AT, 36(sp) /* restore $a3 (mips_idct_ifast_coefs) */ + lw t0, 0(a0) /* wsptr[DCTSIZE*0+0/1] b a */ + lw s0, 16(a0) /* 
wsptr[DCTSIZE*1+0/1] B A */ + lw t2, 4(a0) /* wsptr[DCTSIZE*0+2/3] d c */ + lw s2, 20(a0) /* wsptr[DCTSIZE*1+2/3] D C */ + lw t4, 8(a0) /* wsptr[DCTSIZE*0+4/5] f e */ + lw s4, 24(a0) /* wsptr[DCTSIZE*1+4/5] F E */ + lw t6, 12(a0) /* wsptr[DCTSIZE*0+6/7] h g */ + lw s6, 28(a0) /* wsptr[DCTSIZE*1+6/7] H G */ + precrq.ph.w t1, s0, t0 /* B b */ + ins t0, s0, 16, 16 /* A a */ + bnez t1, 1f + or s0, t2, s2 + bnez s0, 1f + or s0, t4, s4 + bnez s0, 1f + or s0, t6, s6 + bnez s0, 1f + shll_s.ph s0, t0, 2 /* A a */ + lw a3, 0(a1) + lw AT, 4(a1) + precrq.ph.w t0, s0, s0 /* A A */ + ins s0, s0, 16, 16 /* a a */ + addu a3, a3, a2 + addu AT, AT, a2 + precrq.qb.ph t0, t0, t0 /* A A A A */ + precrq.qb.ph s0, s0, s0 /* a a a a */ + addu.qb s0, s0, s8 + addu.qb t0, t0, s8 + sw s0, 0(a3) + sw s0, 4(a3) + sw t0, 0(AT) + sw t0, 4(AT) + addiu a0, a0, 32 + bne a0, t9, 0b + addiu a1, a1, 8 + b 2f + nop + +1: + precrq.ph.w t3, s2, t2 + ins t2, s2, 16, 16 + precrq.ph.w t5, s4, t4 + ins t4, s4, 16, 16 + precrq.ph.w t7, s6, t6 + ins t6, s6, 16, 16 + lw t8, 4(AT) /* FIX(1.414213562) */ + addq.ph s4, t0, t4 /* tmp10 */ + subq.ph s5, t0, t4 /* tmp11 */ + subq.ph s6, t2, t6 /* tmp12 ... */ + addq.ph s7, t2, t6 /* tmp13 */ + mulq_s.ph s6, s6, t8 /* ... tmp12 ... */ + addq.ph t0, s4, s7 /* tmp0 */ + subq.ph t6, s4, s7 /* tmp3 */ + shll_s.ph s6, s6, 1 /* x2 */ + subq.ph s6, s6, s7 /* ... tmp12 */ + addq.ph t2, s5, s6 /* tmp1 */ + subq.ph t4, s5, s6 /* tmp2 */ + addq.ph s5, t1, t7 /* z11 */ + subq.ph s6, t1, t7 /* z12 */ + addq.ph s7, t5, t3 /* z13 */ + subq.ph v0, t5, t3 /* z10 */ + addq.ph t7, s5, s7 /* tmp7 */ + subq.ph s5, s5, s7 /* tmp11 ... */ + addq.ph v1, v0, s6 /* z5 ... */ + mulq_s.ph s5, s5, t8 /* ... tmp11 */ + lw t8, 8(AT) /* FIX(1.847759065) */ + lw s4, 0(AT) /* FIX(1.082392200) */ + addq.ph s0, t0, t7 /* tmp0 + tmp7 */ + subq.ph s7, t0, t7 /* tmp0 - tmp7 */ + mulq_s.ph v1, v1, t8 /* ... 
z5 */ + lw a3, 0(a1) + lw t8, 12(AT) /* FIX(-2.613125930) */ + shll_s.ph s5, s5, 1 /* x2 */ + addu a3, a3, a2 + shll_s.ph v0, v0, 1 /* x4 */ + mulq_s.ph v0, v0, t8 /* tmp12 ... */ + mulq_s.ph s4, s6, s4 /* tmp10 ... */ + shll_s.ph v1, v1, 1 /* x2 */ + addiu a0, a0, 32 + addiu a1, a1, 8 + shll_s.ph s6, v0, 1 /* x4 */ + shll_s.ph s4, s4, 1 /* x2 */ + addq.ph s6, s6, v1 /* ... tmp12 */ + shll_s.ph s0, s0, 2 + subq.ph t5, s6, t7 /* tmp6 */ + subq.ph s4, s4, v1 /* ... tmp10 */ + subq.ph t3, s5, t5 /* tmp5 */ + shll_s.ph s7, s7, 2 + addq.ph t1, s4, t3 /* tmp4 */ + addq.ph s1, t2, t5 /* tmp1 + tmp6 */ + subq.ph s6, t2, t5 /* tmp1 - tmp6 */ + addq.ph s2, t4, t3 /* tmp2 + tmp5 */ + subq.ph s5, t4, t3 /* tmp2 - tmp5 */ + addq.ph s4, t6, t1 /* tmp3 + tmp4 */ + subq.ph s3, t6, t1 /* tmp3 - tmp4 */ + shll_s.ph s1, s1, 2 + shll_s.ph s2, s2, 2 + shll_s.ph s3, s3, 2 + shll_s.ph s4, s4, 2 + shll_s.ph s5, s5, 2 + shll_s.ph s6, s6, 2 + precrq.ph.w t0, s1, s0 /* B A */ + ins s0, s1, 16, 16 /* b a */ + precrq.ph.w t2, s3, s2 /* D C */ + ins s2, s3, 16, 16 /* d c */ + precrq.ph.w t4, s5, s4 /* F E */ + ins s4, s5, 16, 16 /* f e */ + precrq.ph.w t6, s7, s6 /* H G */ + ins s6, s7, 16, 16 /* h g */ + precrq.qb.ph t0, t2, t0 /* D C B A */ + precrq.qb.ph s0, s2, s0 /* d c b a */ + precrq.qb.ph t4, t6, t4 /* H G F E */ + precrq.qb.ph s4, s6, s4 /* h g f e */ + addu.qb s0, s0, s8 + addu.qb s4, s4, s8 + sw s0, 0(a3) /* outptr[0/1/2/3] d c b a */ + sw s4, 4(a3) /* outptr[4/5/6/7] h g f e */ + lw a3, -4(a1) + addu.qb t0, t0, s8 + addu a3, a3, a2 + addu.qb t4, t4, s8 + sw t0, 0(a3) /* outptr[0/1/2/3] D C B A */ + bne a0, t9, 0b + sw t4, 4(a3) /* outptr[4/5/6/7] H G F E */ + +2: + + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 + + j ra + nop + +END(jsimd_idct_ifast_rows_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_fdct_islow_dspr2) +/* + * a0 = data + */ + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 
+ + lui t0, 6437 + ori t0, 2260 + lui t1, 9633 + ori t1, 11363 + lui t2, 0xd39e + ori t2, 0xe6dc + lui t3, 0xf72d + ori t3, 9633 + lui t4, 2261 + ori t4, 9633 + lui t5, 0xd39e + ori t5, 6437 + lui t6, 9633 + ori t6, 0xd39d + lui t7, 0xe6dc + ori t7, 2260 + lui t8, 4433 + ori t8, 10703 + lui t9, 0xd630 + ori t9, 4433 + li s8, 8 + move a1, a0 +1: + lw s0, 0(a1) /* tmp0 = 1|0 */ + lw s1, 4(a1) /* tmp1 = 3|2 */ + lw s2, 8(a1) /* tmp2 = 5|4 */ + lw s3, 12(a1) /* tmp3 = 7|6 */ + packrl.ph s1, s1, s1 /* tmp1 = 2|3 */ + packrl.ph s3, s3, s3 /* tmp3 = 6|7 */ + subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */ + subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */ + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */ + dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */ + mult $ac1, $0, $0 /* ac1 = 0 */ + dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */ + dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */ + mult $ac2, $0, $0 /* ac2 = 0 */ + dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */ + dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */ + mult $ac3, $0, $0 /* ac3 = 0 */ + dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */ + dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */ + addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */ + addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */ + extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ + extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ + extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */ + extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */ + addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */ + subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */ + sh s0, 2(a1) + sh s1, 6(a1) + sh s2, 10(a1) + sh s3, 14(a1) + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */ + mult $ac1, $0, $0 /* ac1 = 0 */ + dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */ + sra s4, s5, 16 /* tmp4 = t11 */ + addiu a1, a1, 16 + addiu s8, s8, 
-1 + extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ + extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ + addu s2, s5, s4 /* tmp2 = t10 + t11 */ + subu s3, s5, s4 /* tmp3 = t10 - t11 */ + sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */ + sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */ + sh s2, -16(a1) + sh s3, -8(a1) + sh s0, -12(a1) + bgtz s8, 1b + sh s1, -4(a1) + li t0, 2260 + li t1, 11363 + li t2, 9633 + li t3, 6436 + li t4, 6437 + li t5, 2261 + li t6, 11362 + li t7, 2259 + li t8, 4433 + li t9, 10703 + li a1, 10704 + li s8, 8 + +2: + lh a2, 0(a0) /* 0 */ + lh a3, 16(a0) /* 8 */ + lh v0, 32(a0) /* 16 */ + lh v1, 48(a0) /* 24 */ + lh s4, 64(a0) /* 32 */ + lh s5, 80(a0) /* 40 */ + lh s6, 96(a0) /* 48 */ + lh s7, 112(a0) /* 56 */ + addu s2, v0, s5 /* tmp2 = 16 + 40 */ + subu s5, v0, s5 /* tmp5 = 16 - 40 */ + addu s3, v1, s4 /* tmp3 = 24 + 32 */ + subu s4, v1, s4 /* tmp4 = 24 - 32 */ + addu s0, a2, s7 /* tmp0 = 0 + 56 */ + subu s7, a2, s7 /* tmp7 = 0 - 56 */ + addu s1, a3, s6 /* tmp1 = 8 + 48 */ + subu s6, a3, s6 /* tmp6 = 8 - 48 */ + addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */ + subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */ + addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */ + subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */ + mult s7, t1 /* ac0 = tmp7 * c1 */ + madd s4, t0 /* ac0 += tmp4 * c0 */ + madd s5, t4 /* ac0 += tmp5 * c4 */ + madd s6, t2 /* ac0 += tmp6 * c2 */ + mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */ + msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */ + msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */ + msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */ + mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */ + madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */ + madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */ + msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */ + mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */ + msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */ + madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */ + msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */ + extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */ + extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */ 
+ extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */ + extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */ + addiu s8, s8, -1 + addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */ + subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */ + sh s0, 16(a0) + sh s1, 48(a0) + sh s2, 80(a0) + sh s3, 112(a0) + mult v0, t8 /* ac0 = tmp12 * c8 */ + madd v1, t9 /* ac0 += tmp13 * c9 */ + mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */ + msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */ + addiu a0, a0, 2 + extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */ + extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */ + shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */ + shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */ + sh s4, -2(a0) + sh s5, 62(a0) + sh s6, 30(a0) + bgtz s8, 2b + sh s7, 94(a0) + + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 + + jr ra + nop + +END(jsimd_fdct_islow_dspr2) + + +/**************************************************************************/ +LEAF_DSPR2(jsimd_fdct_ifast_dspr2) +/* + * a0 = data + */ + .set at + + SAVE_REGS_ON_STACK 8, s0, s1 + + li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) | + (334 & 0xffff) */ + li a2, 0x008b008b /* FIX_0_541196100 (139 << 16) | + (139 & 0xffff) */ + li a3, 0x00620062 /* FIX_0_382683433 (98 << 16) | + (98 & 0xffff) */ + li s1, 0x00b500b5 /* FIX_0_707106781 (181 << 16) | + (181 & 0xffff) */ + + move v0, a0 + addiu v1, v0, 128 /* end address */ + +0: + lw t0, 0(v0) /* tmp0 = 1|0 */ + lw t1, 4(v0) /* tmp1 = 3|2 */ + lw t2, 8(v0) /* tmp2 = 5|4 */ + lw t3, 12(v0) /* tmp3 = 7|6 */ + packrl.ph t1, t1, t1 /* tmp1 = 2|3 */ + packrl.ph t3, t3, t3 /* tmp3 = 6|7 */ + subq.ph t7, t1, t2 /* tmp7 = 2-5|3-4 = t5|t4 */ + subq.ph t5, t0, t3 /* tmp5 = 1-6|0-7 = t6|t7 */ + addq.ph t6, t1, t2 /* tmp6 = 2+5|3+4 = t2|t3 */ + addq.ph t4, t0, t3 /* tmp4 = 1+6|0+7 = t1|t0 */ + addq.ph t8, t4, t6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */ + subq.ph t9, t4, t6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */ + sra t4, t8, 16 /* tmp4 = t11 */ + mult $0, $0 /* ac0 = 0 */ + 
dpa.w.ph $ac0, t9, s1 /* ac0 += t12*181 + t13*181 */ + mult $ac1, $0, $0 /* ac1 = 0 */ + dpa.w.ph $ac1, t7, a3 /* ac1 += t4*98 + t5*98 */ + dpsx.w.ph $ac1, t5, a3 /* ac1 -= t6*98 + t7*98 */ + mult $ac2, $0, $0 /* ac2 = 0 */ + dpa.w.ph $ac2, t7, a2 /* ac2 += t4*139 + t5*139 */ + mult $ac3, $0, $0 /* ac3 = 0 */ + dpa.w.ph $ac3, t5, a1 /* ac3 += t6*334 + t7*334 */ + precrq.ph.w t0, t5, t7 /* t0 = t6|t5 (upper halfwords of both sources) */ + addq.ph t2, t8, t4 /* tmp2 = t10 + t11 */ + subq.ph t3, t8, t4 /* tmp3 = t10 - t11 */ + extr.w t4, $ac0, 8 /* t4 = z1 = (t12*181 + t13*181) >> 8 */ + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, t0, s1 /* ac0 += t5*181 + t6*181 */ + extr.w t0, $ac1, 8 /* t0 = z5 */ + extr.w t1, $ac2, 8 /* t1 = MULTIPLY(tmp10, 139) */ + extr.w t7, $ac3, 8 /* t7 = MULTIPLY(tmp12, 334) */ + extr.w t8, $ac0, 8 /* t8 = z3 = MULTIPLY(tmp11, 181) */ + add t6, t1, t0 /* t6 = z2 */ + add t7, t7, t0 /* t7 = z4 */ + subq.ph t0, t5, t8 /* t0 = z13 = tmp7 - z3 */ + addq.ph t8, t5, t8 /* t8 = z11 = tmp7 + z3 */ + addq.ph t1, t0, t6 /* t1 = z13 + z2 */ + subq.ph t6, t0, t6 /* t6 = z13 - z2 */ + addq.ph t0, t8, t7 /* t0 = z11 + z4 */ + subq.ph t7, t8, t7 /* t7 = z11 - z4 */ + addq.ph t5, t4, t9 /* low halfword of t5 = t13 + z1 */ + subq.ph t4, t9, t4 /* low halfword of t4 = t13 - z1 */ + sh t2, 0(v0) + sh t5, 4(v0) + sh t3, 8(v0) + sh t4, 12(v0) + sh t1, 10(v0) + sh t6, 6(v0) + sh t0, 2(v0) + sh t7, 14(v0) + addiu v0, 16 + bne v1, v0, 0b + nop + move v0, a0 + addiu v1, v0, 16 + +1: + lh t0, 0(v0) /* 0 */ + lh t1, 16(v0) /* 8 */ + lh t2, 32(v0) /* 16 */ + lh t3, 48(v0) /* 24 */ + lh t4, 64(v0) /* 32 */ + lh t5, 80(v0) /* 40 */ + lh t6, 96(v0) /* 48 */ + lh t7, 112(v0) /* 56 */ + add t8, t0, t7 /* t8 = tmp0 */ + sub t7, t0, t7 /* t7 = tmp7 */ + add t0, t1, t6 /* t0 = tmp1 */ + sub t1, t1, t6 /* t1 = tmp6 */ + add t6, t2, t5 /* t6 = tmp2 */ + sub t5, t2, t5 /* t5 = tmp5 */ + add t2, t3, t4 /* t2 = tmp3 */ + sub t3, t3, t4 /* t3 = tmp4 */ + add t4, t8, t2 /* t4 = tmp10 = tmp0 + tmp3 */ + sub t8, t8, t2 /* t8 = tmp13 = tmp0 - tmp3 */ + sub s0, t0, t6 /* s0 = tmp12 = tmp1 - tmp2 */ + ins t8, s0, 16, 16 /* t8 = tmp12|tmp13 */ + add t2, t0, t6 /*
t2 = tmp11 = tmp1 + tmp2 */ + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, t8, s1 /* ac0 += t12*181 + t13*181 */ + add s0, t4, t2 /* s0 = tmp10+tmp11 */ + sub t4, t4, t2 /* t4 = tmp10-tmp11 */ + sh s0, 0(v0) + sh t4, 64(v0) + extr.w t2, $ac0, 8 /* z1 = MULTIPLY(tmp12+tmp13, + FIX_0_707106781) */ + addq.ph t4, t8, t2 /* t4 = tmp13 + z1 */ + subq.ph t8, t8, t2 /* t8 = tmp13 - z1 */ + sh t4, 32(v0) + sh t8, 96(v0) + add t3, t3, t5 /* t3 = tmp10 = tmp4 + tmp5 */ + add t0, t5, t1 /* t0 = tmp11 = tmp5 + tmp6 */ + add t1, t1, t7 /* t1 = tmp12 = tmp6 + tmp7 */ + andi t4, a1, 0xffff + mul s0, t1, t4 + sra s0, s0, 8 /* s0 = z4 = + MULTIPLY(tmp12, FIX_1_306562965) */ + ins t1, t3, 16, 16 /* t1 = tmp10|tmp12 */ + mult $0, $0 /* ac0 = 0 */ + mulsa.w.ph $ac0, t1, a3 /* ac0 += t10*98 - t12*98 */ + extr.w t8, $ac0, 8 /* z5 = MULTIPLY(tmp10-tmp12, + FIX_0_382683433) */ + add t2, t7, t8 /* t2 = tmp7 + z5 */ + sub t7, t7, t8 /* t7 = tmp7 - z5 */ + andi t4, a2, 0xffff + mul t8, t3, t4 + sra t8, t8, 8 /* t8 = z2 = + MULTIPLY(tmp10, FIX_0_541196100) */ + andi t4, s1, 0xffff + mul t6, t0, t4 + sra t6, t6, 8 /* t6 = z3 = + MULTIPLY(tmp11, FIX_0_707106781) */ + add t0, t6, t8 /* t0 = z3 + z2 */ + sub t1, t6, t8 /* t1 = z3 - z2 */ + add t3, t6, s0 /* t3 = z3 + z4 */ + sub t4, t6, s0 /* t4 = z3 - z4 */ + sub t5, t2, t1 /* t5 = dataptr[5] */ + sub t6, t7, t0 /* t6 = dataptr[3] */ + add t3, t2, t3 /* t3 = dataptr[1] */ + add t4, t7, t4 /* t4 = dataptr[7] */ + sh t5, 80(v0) + sh t6, 48(v0) + sh t3, 16(v0) + sh t4, 112(v0) + addiu v0, 2 + bne v0, v1, 1b + nop + + RESTORE_REGS_FROM_STACK 8, s0, s1 + + j ra + nop +END(jsimd_fdct_ifast_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_quantize_dspr2) +/* + * a0 = coef_block + * a1 = divisors + * a2 = workspace + */ + .set at + + SAVE_REGS_ON_STACK 16, s0, s1, s2 + + addiu v0, a2, 124 /* v0 = workspace_end */ + lh t0, 0(a2) + lh t1, 0(a1) + lh t2, 128(a1) + sra t3, t0, 15 + sll t3, t3, 1 +
addiu t3, t3, 1 + mul t0, t0, t3 + lh t4, 384(a1) + lh t5, 130(a1) + lh t6, 2(a2) + lh t7, 2(a1) + lh t8, 386(a1) + +1: + andi t1, 0xffff + add t9, t0, t2 + andi t9, 0xffff + mul v1, t9, t1 + sra s0, t6, 15 + sll s0, s0, 1 + addiu s0, s0, 1 + addiu t9, t4, 16 + srav v1, v1, t9 + mul v1, v1, t3 + mul t6, t6, s0 + andi t7, 0xffff + addiu a2, a2, 4 + addiu a1, a1, 4 + add s1, t6, t5 + andi s1, 0xffff + sh v1, 0(a0) + + mul s2, s1, t7 + addiu s1, t8, 16 + srav s2, s2, s1 + mul s2, s2, s0 + lh t0, 0(a2) + lh t1, 0(a1) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0, t3 + lh t2, 128(a1) + lh t4, 384(a1) + lh t5, 130(a1) + lh t8, 386(a1) + lh t6, 2(a2) + lh t7, 2(a1) + sh s2, 2(a0) + lh t0, 0(a2) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0, t3 + bne a2, v0, 1b + addiu a0, a0, 4 + + andi t1, 0xffff + add t9, t0, t2 + andi t9, 0xffff + mul v1, t9, t1 + sra s0, t6, 15 + sll s0, s0, 1 + addiu s0, s0, 1 + addiu t9, t4, 16 + srav v1, v1, t9 + mul v1, v1, t3 + mul t6, t6, s0 + andi t7, 0xffff + sh v1, 0(a0) + add s1, t6, t5 + andi s1, 0xffff + mul s2, s1, t7 + addiu s1, t8, 16 + addiu a2, a2, 4 + addiu a1, a1, 4 + srav s2, s2, s1 + mul s2, s2, s0 + sh s2, 2(a0) + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2 + + j ra + nop + +END(jsimd_quantize_dspr2) + + +#ifndef __mips_soft_float + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_quantize_float_dspr2) +/* + * a0 = coef_block + * a1 = divisors + * a2 = workspace + */ + .set at + + li t1, 0x46800100 /* integer representation 16384.5 */ + mtc1 t1, f0 + li t0, 63 +0: + lwc1 f2, 0(a2) + lwc1 f10, 0(a1) + lwc1 f4, 4(a2) + lwc1 f12, 4(a1) + lwc1 f6, 8(a2) + lwc1 f14, 8(a1) + lwc1 f8, 12(a2) + lwc1 f16, 12(a1) + madd.s f2, f0, f2, f10 + madd.s f4, f0, f4, f12 + madd.s f6, f0, f6, f14 + madd.s f8, f0, f8, f16 + lwc1 f10, 16(a1) + lwc1 f12, 20(a1) + trunc.w.s f2, f2 + trunc.w.s f4, f4 + trunc.w.s f6, f6 + trunc.w.s f8, f8 + lwc1 f14, 24(a1) + lwc1 f16, 
28(a1) + mfc1 t1, f2 + mfc1 t2, f4 + mfc1 t3, f6 + mfc1 t4, f8 + lwc1 f2, 16(a2) + lwc1 f4, 20(a2) + lwc1 f6, 24(a2) + lwc1 f8, 28(a2) + madd.s f2, f0, f2, f10 + madd.s f4, f0, f4, f12 + madd.s f6, f0, f6, f14 + madd.s f8, f0, f8, f16 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + trunc.w.s f2, f2 + trunc.w.s f4, f4 + trunc.w.s f6, f6 + trunc.w.s f8, f8 + sh t1, 0(a0) + sh t2, 2(a0) + sh t3, 4(a0) + sh t4, 6(a0) + mfc1 t1, f2 + mfc1 t2, f4 + mfc1 t3, f6 + mfc1 t4, f8 + addiu t0, t0, -8 + addiu a2, a2, 32 + addiu a1, a1, 32 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + sh t1, 8(a0) + sh t2, 10(a0) + sh t3, 12(a0) + sh t4, 14(a0) + bgez t0, 0b + addiu a0, a0, 16 + + j ra + nop + +END(jsimd_quantize_float_dspr2) + +#endif + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_2x2_dspr2) +/* + * a0 = compptr->dct_table + * a1 = coef_block + * a2 = output_buf + * a3 = output_col + */ + .set at + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 + + addiu sp, sp, -40 + move v0, sp + addiu s2, zero, 29692 + addiu s3, zero, -10426 + addiu s4, zero, 6967 + addiu s5, zero, -5906 + lh t0, 0(a1) /* t0 = inptr[DCTSIZE*0] */ + lh t5, 0(a0) /* t5 = quantptr[DCTSIZE*0] */ + lh t1, 48(a1) /* t1 = inptr[DCTSIZE*3] */ + lh t6, 48(a0) /* t6 = quantptr[DCTSIZE*3] */ + mul t4, t5, t0 + lh t0, 16(a1) /* t0 = inptr[DCTSIZE*1] */ + lh t5, 16(a0) /* t5 = quantptr[DCTSIZE*1] */ + mul t6, t6, t1 + mul t5, t5, t0 + lh t2, 80(a1) /* t2 = inptr[DCTSIZE*5] */ + lh t7, 80(a0) /* t7 = quantptr[DCTSIZE*5] */ + lh t3, 112(a1) /* t3 = inptr[DCTSIZE*7] */ + lh t8, 112(a0) /* t8 = quantptr[DCTSIZE*7] */ + mul t7, t7, t2 + mult zero, zero + mul t8, t8, t3 + li s0, 0x73FCD746 /* s0 = (29692 << 16) | (-10426 & 0xffff) */ + li s1, 0x1B37E8EE /* s1 = (6967 << 16) | (-5906 & 0xffff) */ + ins t6, t5, 16, 16 /* t6 = t5|t6 */ + sll t4, t4, 15 + dpa.w.ph $ac0, t6, s0 + 
lh t1, 2(a1) + lh t6, 2(a0) + ins t8, t7, 16, 16 /* t8 = t7|t8 */ + dpa.w.ph $ac0, t8, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 18(a1) + lh t6, 18(a0) + lh t2, 50(a1) + lh t7, 50(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 82(a1) + lh t2, 82(a0) + lh t3, 114(a1) + lh t4, 114(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 0(v0) + sw t8, 20(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 6(a1) + lh t6, 6(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 22(a1) + lh t6, 22(a0) + lh t2, 54(a1) + lh t7, 54(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 86(a1) + lh t2, 86(a0) + lh t3, 118(a1) + lh t4, 118(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 4(v0) + sw t8, 24(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 10(a1) + lh t6, 10(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 26(a1) + lh t6, 26(a0) + lh t2, 58(a1) + lh t7, 58(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 90(a1) + lh t2, 90(a0) + lh t3, 122(a1) + lh t4, 122(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 8(v0) + sw t8, 28(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 14(a1) + lh t6, 14(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 30(a1) + lh t6, 30(a0) + lh t2, 62(a1) + lh t7, 62(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 94(a1) + lh t2, 94(a0) + lh t3, 126(a1) + lh t4, 126(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 12(v0) + sw t8, 32(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph 
$ac0, t7, s0 + ins t3, t1, 16, 16 + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + lw t9, 0(a2) + lw t3, 0(v0) + lw t7, 4(v0) + lw t1, 8(v0) + addu t9, t9, a3 + sll t3, t3, 15 + subu t8, t4, t0 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + shra_r.w t8, t8, 13 + sw t0, 16(v0) + sw t8, 36(v0) + lw t5, 12(v0) + lw t6, 16(v0) + mult t7, s2 + madd t1, s3 + madd t5, s4 + madd t6, s5 + lw t5, 24(v0) + lw t7, 28(v0) + mflo t0, $ac0 + lw t8, 32(v0) + lw t2, 36(v0) + mult $ac1, t5, s2 + madd $ac1, t7, s3 + madd $ac1, t8, s4 + madd $ac1, t2, s5 + addu t1, t3, t0 + subu t6, t3, t0 + shra_r.w t1, t1, 20 + shra_r.w t6, t6, 20 + mflo t4, $ac1 + shll_s.w t1, t1, 24 + shll_s.w t6, t6, 24 + sra t1, t1, 24 + sra t6, t6, 24 + addiu t1, t1, 128 + addiu t6, t6, 128 + lw t0, 20(v0) + sb t1, 0(t9) + sb t6, 1(t9) + sll t0, t0, 15 + lw t9, 4(a2) + addu t1, t0, t4 + subu t6, t0, t4 + addu t9, t9, a3 + shra_r.w t1, t1, 20 + shra_r.w t6, t6, 20 + shll_s.w t1, t1, 24 + shll_s.w t6, t6, 24 + sra t1, t1, 24 + sra t6, t6, 24 + addiu t1, t1, 128 + addiu t6, t6, 128 + sb t1, 0(t9) + sb t6, 1(t9) + addiu sp, sp, 40 + + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 + + j ra + nop + +END(jsimd_idct_2x2_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_4x4_dspr2) +/* + * a0 = compptr->dct_table + * a1 = coef_block + * a2 = output_buf + * a3 = output_col + * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes) + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw v1, 48(sp) + move t0, a1 + move t1, v1 + li t9, 4 + li s0, 0x2e75f93e + li s1, 0x21f9ba79 + li s2, 0xecc2efb0 + li s3, 0x52031ccd + +0: + lh s6, 32(t0) /* inptr[DCTSIZE*2] */ + lh t6, 32(a0) /* quantptr[DCTSIZE*2] */ + lh s7, 96(t0) /* inptr[DCTSIZE*6] */ + lh t7, 96(a0) /* quantptr[DCTSIZE*6] */ + mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh s4, 0(t0) /* inptr[DCTSIZE*0] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + 
quantptr[DCTSIZE*6]) */ + lh s5, 0(a0) /* quantptr[0] */ + li s6, 15137 + li s7, 6270 + mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */ + mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh t5, 112(t0) /* inptr[DCTSIZE*7] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s4, 112(a0) /* quantptr[DCTSIZE*7] */ + lh v0, 80(t0) /* inptr[DCTSIZE*5] */ + lh s5, 80(a0) /* quantptr[DCTSIZE*5] */ + lh s6, 48(a0) /* quantptr[DCTSIZE*3] */ + sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */ + lh s7, 16(a0) /* quantptr[DCTSIZE*1] */ + lh t8, 16(t0) /* inptr[DCTSIZE*1] */ + subu t6, t6, t7 /* tmp2 = + MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */ + lh t7, 48(t0) /* inptr[DCTSIZE*3] */ + mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] * + quantptr[DCTSIZE*7]) */ + mul v0, s5, v0 /* z2 = (inptr[DCTSIZE*5] * + quantptr[DCTSIZE*5]) */ + mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] * + quantptr[DCTSIZE*3]) */ + mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] * + quantptr[DCTSIZE*1]) */ + addu t3, t2, t6 /* tmp10 = tmp0 + z2 */ + subu t4, t2, t6 /* tmp10 = tmp0 - z2 */ + mult $ac0, zero, zero + mult $ac1, zero, zero + ins t5, v0, 16, 16 + ins t7, t8, 16, 16 + addiu t9, t9, -1 + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + mflo s4, $ac0 + mflo s5, $ac1 + addiu a0, a0, 2 + addiu t1, t1, 4 + addiu t0, t0, 2 + addu t6, t4, s4 + subu t5, t4, s4 + addu s6, t3, s5 + subu s7, t3, s5 + shra_r.w t6, t6, 12 /* DESCALE(tmp12 + temp1, 12) */ + shra_r.w t5, t5, 12 /* DESCALE(tmp12 - temp1, 12) */ + shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */ + shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */ + sw t6, 28(t1) + sw t5, 60(t1) + sw s6, -4(t1) + bgtz t9, 0b + sw s7, 92(t1) + /* second loop three pass */ + li t9, 3 +1: + lh s6, 34(t0) /* inptr[DCTSIZE*2] */ + lh t6, 34(a0) /* quantptr[DCTSIZE*2] */ + lh s7, 98(t0) /* inptr[DCTSIZE*6] */ + lh t7, 98(a0) /* quantptr[DCTSIZE*6] */ + mul t6, s6, t6 /* z2 = 
(inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh s4, 2(t0) /* inptr[DCTSIZE*0] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s5, 2(a0) /* quantptr[DCTSIZE*0] */ + li s6, 15137 + li s7, 6270 + mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */ + mul v0, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh t5, 114(t0) /* inptr[DCTSIZE*7] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s4, 114(a0) /* quantptr[DCTSIZE*7] */ + lh s5, 82(a0) /* quantptr[DCTSIZE*5] */ + lh t6, 82(t0) /* inptr[DCTSIZE*5] */ + sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */ + lh s6, 50(a0) /* quantptr[DCTSIZE*3] */ + lh t8, 18(t0) /* inptr[DCTSIZE*1] */ + subu v0, v0, t7 /* tmp2 = + MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */ + lh t7, 50(t0) /* inptr[DCTSIZE*3] */ + lh s7, 18(a0) /* quantptr[DCTSIZE*1] */ + mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] * + quantptr[DCTSIZE*7]) */ + mul t6, s5, t6 /* z2 = (inptr[DCTSIZE*5] * + quantptr[DCTSIZE*5]) */ + mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] * + quantptr[DCTSIZE*3]) */ + mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] * + quantptr[DCTSIZE*1]) */ + addu t3, t2, v0 /* tmp10 = tmp0 + z2 */ + subu t4, t2, v0 /* tmp10 = tmp0 - z2 */ + mult $ac0, zero, zero + mult $ac1, zero, zero + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + mflo t5, $ac0 + mflo t6, $ac1 + addiu t9, t9, -1 + addiu t0, t0, 2 + addiu a0, a0, 2 + addiu t1, t1, 4 + addu s5, t4, t5 + subu s4, t4, t5 + addu s6, t3, t6 + subu s7, t3, t6 + shra_r.w s5, s5, 12 /* DESCALE(tmp12 + temp1, 12) */ + shra_r.w s4, s4, 12 /* DESCALE(tmp12 - temp1, 12) */ + shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */ + shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */ + sw s5, 32(t1) + sw s4, 64(t1) + sw s6, 0(t1) + bgtz t9, 1b + sw s7, 96(t1) + move t1, v1 + li s4, 15137 + lw s6, 8(t1) /* wsptr[2] */ + li s5, 6270 + lw s7, 24(t1) /* wsptr[6] */ + 
mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 0(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 28(t1) /* wsptr[7] */ + lh t6, 20(t1) /* wsptr[5] */ + lh t7, 12(t1) /* wsptr[3] */ + lh t8, 4(t1) /* wsptr[1] */ + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ + mflo s6, $ac0 + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ + subu s4, s4, s5 + addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ + mflo s7, $ac1 + subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ + sll s4, t9, 2 + lw v0, 0(a2) /* output_buf[ctr] */ + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + /* 2 */ + li s4, 15137 + lw s6, 40(t1) /* wsptr[2] */ + li s5, 6270 + lw s7, 56(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 32(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 60(t1) /* wsptr[7] */ + lh t6, 52(t1) /* wsptr[5] */ + lh t7, 44(t1) /* wsptr[3] */ + lh t8, 36(t1) /* wsptr[1] */ + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + 
dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ + mflo s6, $ac0 + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ + subu s4, s4, s5 + addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ + mflo s7, $ac1 + subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, + CONST_BITS-PASS1_BITS+1) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, + CONST_BITS-PASS1_BITS+1) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, + CONST_BITS-PASS1_BITS+1) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, + CONST_BITS-PASS1_BITS+1) */ + sll s4, t9, 2 + lw v0, 4(a2) /* output_buf[ctr] */ + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + /* 3 */ + li s4, 15137 + lw s6, 72(t1) /* wsptr[2] */ + li s5, 6270 + lw s7, 88(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 64(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 92(t1) /* wsptr[7] */ + lh t6, 84(t1) /* wsptr[5] */ + lh t7, 76(t1) /* wsptr[3] */ + lh t8, 68(t1) /* wsptr[1] */ + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ + mflo s6, $ac0 + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ + subu s4, s4, s5 + addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ + mflo s7, $ac1 + subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ + 
addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ + sll s4, t9, 2 + lw v0, 8(a2) /* output_buf[ctr] */ + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + li s4, 15137 + lw s6, 104(t1) /* wsptr[2] */ + li s5, 6270 + lw s7, 120(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 96(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 124(t1) /* wsptr[7] */ + lh t6, 116(t1) /* wsptr[5] */ + lh t7, 108(t1) /* wsptr[3] */ + lh t8, 100(t1) /* wsptr[1] */ + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ + mflo s6, $ac0 + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ + subu s4, s4, s5 + addu t3, t2, s4 /* tmp10 = tmp0 + z2; */ + mflo s7, $ac1 + subu t4, t2, s4 /* tmp10 = tmp0 - z2; */ + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ + sll s4, t9, 2 + lw v0, 12(a2) /* output_buf[ctr] */ + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 
+ sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_idct_4x4_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_6x6_dspr2) +/* + * a0 = compptr->dct_table + * a1 = coef_block + * a2 = output_buf + * a3 = output_col + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu sp, sp, -144 /* 144-byte stack work array: 6 rows of 6 words */ + move v0, sp + addiu v1, v0, 24 /* pass 1 ends after 6 columns (6 * 4 bytes) */ + addiu t9, zero, 5793 /* FIX(0.707106781) */ + addiu s0, zero, 10033 /* FIX(1.224744871) */ + addiu s1, zero, 2998 /* FIX(0.366025404) */ + +1: + lh s2, 0(a0) /* q0 = quantptr[ 0] */ + lh s3, 32(a0) /* q1 = quantptr[16] */ + lh s4, 64(a0) /* q2 = quantptr[32] */ + lh t2, 64(a1) /* tmp2 = inptr[32] */ + lh t1, 32(a1) /* tmp1 = inptr[16] */ + lh t0, 0(a1) /* tmp0 = inptr[ 0] */ + mul t2, t2, s4 /* tmp2 = tmp2 * q2 */ + mul t1, t1, s3 /* tmp1 = tmp1 * q1 */ + mul t0, t0, s2 /* tmp0 = tmp0 * q0 */ + lh t6, 16(a1) /* z1 = inptr[ 8] */ + lh t8, 80(a1) /* z3 = inptr[40] */ + lh t7, 48(a1) /* z2 = inptr[24] */ + lh s2, 16(a0) /* q0 = quantptr[ 8] */ + lh s4, 80(a0) /* q2 = quantptr[40] */ + lh s3, 48(a0) /* q1 = quantptr[24] */ + mul t2, t2, t9 /* tmp2 = tmp2 * 5793 */ + mul t1, t1, s0 /* tmp1 = tmp1 * 10033 */ + sll t0, t0, 13 /* tmp0 = tmp0 << 13 */ + mul t6, t6, s2 /* z1 = z1 * q0 */ + mul t8, t8, s4 /* z3 = z3 * q2 */ + mul t7, t7, s3 /* z2 = z2 * q1 */ + addu t3, t0, t2 /* tmp10 = tmp0 + tmp2 */ + sll t2, t2, 1 /* tmp2 = tmp2 << 1 */ + subu t4, t0, t2 /* tmp11 = tmp0 - tmp2; */ + subu t5, t3, t1 /* tmp12 = tmp10 - tmp1 */ + addu t3, t3, t1 /* tmp10 = tmp10 + tmp1 */ + addu t1, t6, t8 /* tmp1 = z1 + z3 */ + mul t1, t1, s1 /* tmp1 = tmp1 * 2998 */ + shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */ + subu t2, t6, t8 /* tmp2 = z1 - 
z3 */ + subu t2, t2, t7 /* tmp2 = tmp2 - z2 */ + sll t2, t2, 2 /* tmp2 = tmp2 << 2 */ + addu t0, t6, t7 /* tmp0 = z1 + z2 */ + sll t0, t0, 13 /* tmp0 = tmp0 << 13 */ + subu s2, t8, t7 /* q0 = z3 - z2 */ + sll s2, s2, 13 /* q0 = q0 << 13 */ + addu t0, t0, t1 /* tmp0 = tmp0 + tmp1 */ + addu t1, s2, t1 /* tmp1 = q0 + tmp1 */ + addu s2, t4, t2 /* q0 = tmp11 + tmp2 */ + subu s3, t4, t2 /* q1 = tmp11 - tmp2 */ + addu t6, t3, t0 /* z1 = tmp10 + tmp0 */ + subu t7, t3, t0 /* z2 = tmp10 - tmp0 */ + addu t4, t5, t1 /* tmp11 = tmp12 + tmp1 */ + subu t5, t5, t1 /* tmp12 = tmp12 - tmp1 */ + shra_r.w t6, t6, 11 /* z1 = (z1 + 1024) >> 11 */ + shra_r.w t7, t7, 11 /* z2 = (z2 + 1024) >> 11 */ + shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */ + shra_r.w t5, t5, 11 /* tmp12 = (tmp12 + 1024) >> 11 */ + sw s2, 24(v0) + sw s3, 96(v0) + sw t6, 0(v0) + sw t7, 120(v0) + sw t4, 48(v0) + sw t5, 72(v0) + addiu v0, v0, 4 + addiu a1, a1, 2 + bne v0, v1, 1b + addiu a0, a0, 2 + + /* Pass 2: process 6 rows from work array, store into output array. 
*/ + move v0, sp + addiu v1, v0, 144 + +2: + lw t0, 0(v0) + lw t2, 16(v0) + lw s5, 0(a2) + addiu t0, t0, 16 + sll t0, t0, 13 + mul t3, t2, t9 + lw t6, 4(v0) + lw t8, 20(v0) + lw t7, 12(v0) + addu s5, s5, a3 + addu s6, t6, t8 + mul s6, s6, s1 + addu t1, t0, t3 + subu t4, t0, t3 + subu t4, t4, t3 + lw t3, 8(v0) + mul t0, t3, s0 + addu s7, t6, t7 + sll s7, s7, 13 + addu s7, s6, s7 + subu t2, t8, t7 + sll t2, t2, 13 + addu t2, s6, t2 + subu s6, t6, t7 + subu s6, s6, t8 + sll s6, s6, 13 + addu t3, t1, t0 + subu t5, t1, t0 + addu t6, t3, s7 + subu t3, t3, s7 + addu t7, t4, s6 + subu t4, t4, s6 + addu t8, t5, t2 + subu t5, t5, t2 + shll_s.w t6, t6, 6 + shll_s.w t3, t3, 6 + shll_s.w t7, t7, 6 + shll_s.w t4, t4, 6 + shll_s.w t8, t8, 6 + shll_s.w t5, t5, 6 + sra t6, t6, 24 + addiu t6, t6, 128 + sra t3, t3, 24 + addiu t3, t3, 128 + sb t6, 0(s5) + sra t7, t7, 24 + addiu t7, t7, 128 + sb t3, 5(s5) + sra t4, t4, 24 + addiu t4, t4, 128 + sb t7, 1(s5) + sra t8, t8, 24 + addiu t8, t8, 128 + sb t4, 4(s5) + addiu v0, v0, 24 + sra t5, t5, 24 + addiu t5, t5, 128 + sb t8, 2(s5) + addiu a2, a2, 4 + bne v0, v1, 2b + sb t5, 3(s5) + + addiu sp, sp, 144 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_6x6_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2) +/* + * a0 = compptr->dct_table + * a1 = coef_block + * a2 = workspace + */ + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 8 + +1: + /* odd part */ + lh t0, 48(a1) + lh t1, 48(a0) + lh t2, 16(a1) + lh t3, 16(a0) + lh t4, 80(a1) + lh t5, 80(a0) + lh t6, 112(a1) + lh t7, 112(a0) + mul t0, t0, t1 /* z2 */ + mul t1, t2, t3 /* z1 */ + mul t2, t4, t5 /* z3 */ + mul t3, t6, t7 /* z4 */ + li t4, 10703 /* FIX(1.306562965) */ + li t5, 4433 /* FIX_0_541196100 */ + li t6, 7053 /* FIX(0.860918669) */ + mul t4, t0, t4 /* tmp11 */ + mul t5, t0, t5 /* -tmp14 */ + addu t7, t1, t2 /* tmp10 */ + addu t8, t7, t3 /* tmp10 + z4 */ 
+ mul t6, t6, t8 /* tmp15 */ + li t8, 2139 /* FIX(0.261052384) */ + mul t8, t7, t8 /* MULTIPLY(tmp10, FIX(0.261052384)) */ + li t7, 2295 /* FIX(0.280143716) */ + mul t7, t1, t7 /* MULTIPLY(z1, FIX(0.280143716)) */ + addu t9, t2, t3 /* z3 + z4 */ + li s0, 8565 /* FIX(1.045510580) */ + mul t9, t9, s0 /* -tmp13 */ + li s0, 12112 /* FIX(1.478575242) */ + mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */ + li s1, 12998 /* FIX(1.586706681) */ + mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */ + li s2, 5540 /* FIX(0.676326758) */ + mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */ + li s3, 16244 /* FIX(1.982889723) */ + mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */ + subu t1, t1, t3 /* z1-=z4 */ + subu t0, t0, t2 /* z2-=z3 */ + addu t2, t0, t1 /* z1+z2 */ + li t3, 4433 /* FIX_0_541196100 */ + mul t2, t2, t3 /* z3 */ + li t3, 6270 /* FIX_0_765366865 */ + mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */ + li t3, 15137 /* FIX_1_847759065 */ + mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */ + addu t8, t6, t8 /* tmp12 */ + addu t3, t8, t4 /* tmp12 + tmp11 */ + addu t3, t3, t7 /* tmp10 */ + subu t8, t8, t9 /* tmp12 + tmp13 */ + addu s0, t5, s0 + subu t8, t8, s0 /* tmp12 */ + subu t9, t6, t9 + subu s1, s1, t4 + addu t9, t9, s1 /* tmp13 */ + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 /* tmp15 */ + /* even part start */ + lh t4, 64(a1) + lh t5, 64(a0) + lh t7, 32(a1) + lh s0, 32(a0) + lh s1, 0(a1) + lh s2, 0(a0) + lh s3, 96(a1) + lh v0, 96(a0) + mul t4, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*4], + quantptr[DCTSIZE*4]) */ + mul t5, t7, s0 /* DEQUANTIZE(inptr[DCTSIZE*2], + quantptr[DCTSIZE*2]) */ + mul t7, s1, s2 /* DEQUANTIZE(inptr[DCTSIZE*0], + quantptr[DCTSIZE*0]) */ + mul s0, s3, v0 /* DEQUANTIZE(inptr[DCTSIZE*6], + quantptr[DCTSIZE*6]) */ + /* odd part end */ + addu t1, t2, t1 /* tmp11 */ + subu t0, t2, t0 /* tmp14 */ + /* update counter and pointers */ + addiu a3, a3, -1 + addiu a0, a0, 2 + addiu a1, a1, 2 + /* even part rest */ + li s1, 10033 /* FIX(1.224744871) */ + li s2,
11190 + mul t4, t4, s1 /* z4 */ + mul s1, t5, s2 /* z4 */ + sll t5, t5, 13 /* z1 */ + sll t7, t7, 13 + addiu t7, t7, 1024 /* z3 */ + sll s0, s0, 13 /* z2 */ + addu s2, t7, t4 /* tmp10 */ + subu t4, t7, t4 /* tmp11 */ + subu s3, t5, s0 /* tmp12 */ + addu t2, t7, s3 /* tmp21 */ + subu s3, t7, s3 /* tmp24 */ + addu t7, s1, s0 /* tmp12 */ + addu v0, s2, t7 /* tmp20 */ + subu s2, s2, t7 /* tmp25 */ + subu s1, s1, t5 /* z4 - z1 */ + subu s1, s1, s0 /* tmp12 */ + addu s0, t4, s1 /* tmp22 */ + subu t4, t4, s1 /* tmp23 */ + /* final output stage */ + addu t5, v0, t3 + subu v0, v0, t3 + addu t3, t2, t1 + subu t2, t2, t1 + addu t1, s0, t8 + subu s0, s0, t8 + addu t8, t4, t9 + subu t4, t4, t9 + addu t9, s3, t0 + subu s3, s3, t0 + addu t0, s2, t6 + subu s2, s2, t6 + sra t5, t5, 11 + sra t3, t3, 11 + sra t1, t1, 11 + sra t8, t8, 11 + sra t9, t9, 11 + sra t0, t0, 11 + sra s2, s2, 11 + sra s3, s3, 11 + sra t4, t4, 11 + sra s0, s0, 11 + sra t2, t2, 11 + sra v0, v0, 11 + sw t5, 0(a2) + sw t3, 32(a2) + sw t1, 64(a2) + sw t8, 96(a2) + sw t9, 128(a2) + sw t0, 160(a2) + sw s2, 192(a2) + sw s3, 224(a2) + sw t4, 256(a2) + sw s0, 288(a2) + sw t2, 320(a2) + sw v0, 352(a2) + bgtz a3, 1b + addiu a2, a2, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop + +END(jsimd_idct_12x12_pass1_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2) +/* + * a0 = workspace + * a1 = output + */ + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 12 + +1: + /* Odd part */ + lw t0, 12(a0) + lw t1, 4(a0) + lw t2, 20(a0) + lw t3, 28(a0) + li t4, 10703 /* FIX(1.306562965) */ + li t5, 4433 /* FIX_0_541196100 */ + mul t4, t0, t4 /* tmp11 */ + mul t5, t0, t5 /* -tmp14 */ + addu t6, t1, t2 /* tmp10 */ + li t7, 2139 /* FIX(0.261052384) */ + mul t7, t6, t7 /* MULTIPLY(tmp10, FIX(0.261052384)) */ + addu t6, t6, t3 /* tmp10 + z4 */ + li t8, 7053 /* FIX(0.860918669) */ + mul t6, t6, t8 /* tmp15 */ + li t8, 2295 /* FIX(0.280143716) 
*/ + mul t8, t1, t8 /* MULTIPLY(z1, FIX(0.280143716)) */ + addu t9, t2, t3 /* z3 + z4 */ + li s0, 8565 /* FIX(1.045510580) */ + mul t9, t9, s0 /* -tmp13 */ + li s0, 12112 /* FIX(1.478575242) */ + mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */ + li s1, 12998 /* FIX(1.586706681) */ + mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */ + li s2, 5540 /* FIX(0.676326758) */ + mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */ + li s3, 16244 /* FIX(1.982889723) */ + mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */ + subu t1, t1, t3 /* z1 -= z4 */ + subu t0, t0, t2 /* z2 -= z3 */ + addu t2, t1, t0 /* z1 + z2 */ + li t3, 4433 /* FIX_0_541196100 */ + mul t2, t2, t3 /* z3 */ + li t3, 6270 /* FIX_0_765366865 */ + mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */ + li t3, 15137 /* FIX_1_847759065 */ + mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */ + addu t3, t6, t7 /* tmp12 */ + addu t7, t3, t4 + addu t7, t7, t8 /* tmp10 */ + subu t3, t3, t9 + subu t3, t3, t5 + subu t3, t3, s0 /* tmp12 */ + subu t9, t6, t9 + subu t9, t9, t4 + addu t9, t9, s1 /* tmp13 */ + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 /* tmp15 */ + addu t1, t2, t1 /* tmp11 */ + subu t0, t2, t0 /* tmp14 */ + /* even part */ + lw t2, 16(a0) /* z4 */ + lw t4, 8(a0) /* z1 */ + lw t5, 0(a0) /* z3 */ + lw t8, 24(a0) /* z2 */ + li s0, 10033 /* FIX(1.224744871) */ + li s1, 11190 /* FIX(1.366025404) */ + mul t2, t2, s0 /* z4 */ + mul s0, t4, s1 /* z4 */ + addiu t5, t5, 0x10 + sll t5, t5, 13 /* z3 */ + sll t4, t4, 13 /* z1 */ + sll t8, t8, 13 /* z2 */ + subu s1, t4, t8 /* tmp12 */ + addu s2, t5, t2 /* tmp10 */ + subu t2, t5, t2 /* tmp11 */ + addu s3, t5, s1 /* tmp21 */ + subu s1, t5, s1 /* tmp24 */ + addu t5, s0, t8 /* tmp12 */ + addu v0, s2, t5 /* tmp20 */ + subu t5, s2, t5 /* tmp25 */ + subu t4, s0, t4 + subu t4, t4, t8 /* tmp12 */ + addu t8, t2, t4 /* tmp22 */ + subu t2, t2, t4 /* tmp23 */ + /* increment counter and pointers */ + addiu a3, a3, -1 + addiu a0, a0, 32 + /* Final stage */ + addu t4, 
v0, t7 + subu v0, v0, t7 + addu t7, s3, t1 + subu s3, s3, t1 + addu t1, t8, t3 + subu t8, t8, t3 + addu t3, t2, t9 + subu t2, t2, t9 + addu t9, s1, t0 + subu s1, s1, t0 + addu t0, t5, t6 + subu t5, t5, t6 + sll t4, t4, 4 + sll t7, t7, 4 + sll t1, t1, 4 + sll t3, t3, 4 + sll t9, t9, 4 + sll t0, t0, 4 + sll t5, t5, 4 + sll s1, s1, 4 + sll t2, t2, 4 + sll t8, t8, 4 + sll s3, s3, 4 + sll v0, v0, 4 + shll_s.w t4, t4, 2 + shll_s.w t7, t7, 2 + shll_s.w t1, t1, 2 + shll_s.w t3, t3, 2 + shll_s.w t9, t9, 2 + shll_s.w t0, t0, 2 + shll_s.w t5, t5, 2 + shll_s.w s1, s1, 2 + shll_s.w t2, t2, 2 + shll_s.w t8, t8, 2 + shll_s.w s3, s3, 2 + shll_s.w v0, v0, 2 + srl t4, t4, 24 + srl t7, t7, 24 + srl t1, t1, 24 + srl t3, t3, 24 + srl t9, t9, 24 + srl t0, t0, 24 + srl t5, t5, 24 + srl s1, s1, 24 + srl t2, t2, 24 + srl t8, t8, 24 + srl s3, s3, 24 + srl v0, v0, 24 + lw t6, 0(a1) + addiu t4, t4, 0x80 + addiu t7, t7, 0x80 + addiu t1, t1, 0x80 + addiu t3, t3, 0x80 + addiu t9, t9, 0x80 + addiu t0, t0, 0x80 + addiu t5, t5, 0x80 + addiu s1, s1, 0x80 + addiu t2, t2, 0x80 + addiu t8, t8, 0x80 + addiu s3, s3, 0x80 + addiu v0, v0, 0x80 + sb t4, 0(t6) + sb t7, 1(t6) + sb t1, 2(t6) + sb t3, 3(t6) + sb t9, 4(t6) + sb t0, 5(t6) + sb t5, 6(t6) + sb s1, 7(t6) + sb t2, 8(t6) + sb t8, 9(t6) + sb s3, 10(t6) + sb v0, 11(t6) + bgtz a3, 1b + addiu a1, a1, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + jr ra + nop + +END(jsimd_idct_12x12_pass2_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_convsamp_dspr2) +/* + * a0 = sample_data + * a1 = start_col + * a2 = workspace + */ + lw t0, 0(a0) + li t7, 0xff80ff80 + addu t0, t0, a1 + ulw t1, 0(t0) + ulw t2, 4(t0) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + lw t0, 4(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 0(a2) + usw t4, 4(a2) + preceu.ph.qbr t3, 
t1 + preceu.ph.qbl t4, t1 + usw t5, 8(a2) + usw t6, 12(a2) + + lw t0, 8(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 16(a2) + usw t4, 20(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 24(a2) + usw t6, 28(a2) + + lw t0, 12(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 32(a2) + usw t4, 36(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 40(a2) + usw t6, 44(a2) + + lw t0, 16(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 48(a2) + usw t4, 52(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 56(a2) + usw t6, 60(a2) + + lw t0, 20(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 64(a2) + usw t4, 68(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 72(a2) + usw t6, 76(a2) + + lw t0, 24(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 80(a2) + usw t4, 84(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 88(a2) + usw t6, 92(a2) + + lw t0, 28(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 96(a2) + usw t4, 100(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 104(a2) + usw t6, 108(a2) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu.ph t3, t3, t7 
+ addu.ph t4, t4, t7 + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 112(a2) + usw t4, 116(a2) + usw t5, 120(a2) + usw t6, 124(a2) + + j ra + nop + +END(jsimd_convsamp_dspr2) + + +#ifndef __mips_soft_float + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_convsamp_float_dspr2) +/* + * a0 = sample_data + * a1 = start_col + * a2 = workspace + */ + .set at + + lw t0, 0(a0) + addu t0, t0, a1 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 4(a0) + swc1 f2, 0(a2) + swc1 f4, 4(a2) + swc1 f6, 8(a2) + addu t0, t0, a1 + swc1 f8, 12(a2) + swc1 f10, 16(a2) + swc1 f12, 20(a2) + swc1 f14, 24(a2) + swc1 f16, 28(a2) + /* elemr 1 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 8(a0) + swc1 f2, 32(a2) + swc1 f4, 36(a2) + swc1 f6, 40(a2) + addu t0, t0, a1 + swc1 f8, 44(a2) + swc1 f10, 48(a2) + swc1 f12, 52(a2) + swc1 f14, 56(a2) + swc1 f16, 60(a2) + /* elemr 2 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + 
lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 12(a0) + swc1 f2, 64(a2) + swc1 f4, 68(a2) + swc1 f6, 72(a2) + addu t0, t0, a1 + swc1 f8, 76(a2) + swc1 f10, 80(a2) + swc1 f12, 84(a2) + swc1 f14, 88(a2) + swc1 f16, 92(a2) + /* elemr 3 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 16(a0) + swc1 f2, 96(a2) + swc1 f4, 100(a2) + swc1 f6, 104(a2) + addu t0, t0, a1 + swc1 f8, 108(a2) + swc1 f10, 112(a2) + swc1 f12, 116(a2) + swc1 f14, 120(a2) + swc1 f16, 124(a2) + /* elemr 4 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, 
f14 + cvt.s.w f16, f16 + lw t0, 20(a0) + swc1 f2, 128(a2) + swc1 f4, 132(a2) + swc1 f6, 136(a2) + addu t0, t0, a1 + swc1 f8, 140(a2) + swc1 f10, 144(a2) + swc1 f12, 148(a2) + swc1 f14, 152(a2) + swc1 f16, 156(a2) + /* elemr 5 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 24(a0) + swc1 f2, 160(a2) + swc1 f4, 164(a2) + swc1 f6, 168(a2) + addu t0, t0, a1 + swc1 f8, 172(a2) + swc1 f10, 176(a2) + swc1 f12, 180(a2) + swc1 f14, 184(a2) + swc1 f16, 188(a2) + /* elemr 6 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 28(a0) + swc1 f2, 192(a2) + swc1 f4, 196(a2) + swc1 f6, 200(a2) + addu t0, t0, a1 + swc1 f8, 204(a2) + swc1 f10, 208(a2) + swc1 f12, 212(a2) + swc1 f14, 216(a2) + swc1 f16, 220(a2) + /* elemr 7 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, 
t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + swc1 f2, 224(a2) + swc1 f4, 228(a2) + swc1 f6, 232(a2) + swc1 f8, 236(a2) + swc1 f10, 240(a2) + swc1 f12, 244(a2) + swc1 f14, 248(a2) + swc1 f16, 252(a2) + + j ra + nop + +END(jsimd_convsamp_float_dspr2) + +#endif + +/*****************************************************************************/ diff --git a/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2_asm.h b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2_asm.h new file mode 100644 index 0000000000..12cfda486c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2_asm.h @@ -0,0 +1,292 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013, MIPS Technologies, Inc., California. + * Copyright (C) 2018, Matthieu Darbois. + * All Rights Reserved. + * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) + * Darko Laus (darko.laus@imgtec.com) + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. 
This notice may not be removed or altered from any source distribution. + */ + +#define zero $0 +#define AT $1 +#define v0 $2 +#define v1 $3 +#define a0 $4 +#define a1 $5 +#define a2 $6 +#define a3 $7 +#define t0 $8 +#define t1 $9 +#define t2 $10 +#define t3 $11 +#define t4 $12 +#define t5 $13 +#define t6 $14 +#define t7 $15 +#define s0 $16 +#define s1 $17 +#define s2 $18 +#define s3 $19 +#define s4 $20 +#define s5 $21 +#define s6 $22 +#define s7 $23 +#define t8 $24 +#define t9 $25 +#define k0 $26 +#define k1 $27 +#define gp $28 +#define sp $29 +#define fp $30 +#define s8 $30 +#define ra $31 + +#define f0 $f0 +#define f1 $f1 +#define f2 $f2 +#define f3 $f3 +#define f4 $f4 +#define f5 $f5 +#define f6 $f6 +#define f7 $f7 +#define f8 $f8 +#define f9 $f9 +#define f10 $f10 +#define f11 $f11 +#define f12 $f12 +#define f13 $f13 +#define f14 $f14 +#define f15 $f15 +#define f16 $f16 +#define f17 $f17 +#define f18 $f18 +#define f19 $f19 +#define f20 $f20 +#define f21 $f21 +#define f22 $f22 +#define f23 $f23 +#define f24 $f24 +#define f25 $f25 +#define f26 $f26 +#define f27 $f27 +#define f28 $f28 +#define f29 $f29 +#define f30 $f30 +#define f31 $f31 + +#ifdef __ELF__ +#define HIDDEN_SYMBOL(symbol) .hidden symbol; +#else +#define HIDDEN_SYMBOL(symbol) +#endif + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 + */ +#define LEAF_MIPS32R2(symbol) \ + .globl symbol; \ + HIDDEN_SYMBOL(symbol) \ + .align 2; \ + .type symbol, @function; \ + .ent symbol, 0; \ +symbol: \ + .frame sp, 0, ra; \ + .set push; \ + .set arch = mips32r2; \ + .set noreorder; \ + .set noat; + +/* + * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2 + */ +#define LEAF_DSPR2(symbol) \ +LEAF_MIPS32R2(symbol) \ + .set dspr2; + +/* + * END - mark end of function + */ +#define END(function) \ + .set pop; \ + .end function; \ + .size function, .-function + +/* + * Checks if stack offset is big enough for storing/restoring regs_num + * number of register to/from stack. 
Stack offset must be greater than + * or equal to the number of bytes needed for storing registers (regs_num*4). + * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is + * preserved for input arguments of the functions, already stored in a0-a3), + * stack size can be further optimized by utilizing this space. + */ +.macro CHECK_STACK_OFFSET regs_num, stack_offset +.if \stack_offset < \regs_num * 4 - 16 +.error "Stack offset too small." +.endif +.endm + +/* + * Saves set of registers on stack. Maximum number of registers that + * can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are subtracted from stack pointer (sp) + * before registers are pushed in order to provide enough space on stack + * (offset must be multiple of 4, and must be big enough, as described by + * CHECK_STACK_OFFSET macro). This macro is intended to be used in + * combination with RESTORE_REGS_FROM_STACK macro. Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 +.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) + .error "Stack offset must be non-negative and a multiple of 4."
+.endif +.if \stack_offset != 0 + addiu sp, sp, -\stack_offset +.endif + sw \r1, 0(sp) +.if \r2 != 0 + sw \r2, 4(sp) +.endif +.if \r3 != 0 + sw \r3, 8(sp) +.endif +.if \r4 != 0 + sw \r4, 12(sp) +.endif +.if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + sw \r5, 16(sp) +.endif +.if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + sw \r6, 20(sp) +.endif +.if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + sw \r7, 24(sp) +.endif +.if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + sw \r8, 28(sp) +.endif +.if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + sw \r9, 32(sp) +.endif +.if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + sw \r10, 36(sp) +.endif +.if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + sw \r11, 40(sp) +.endif +.if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + sw \r12, 44(sp) +.endif +.if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + sw \r13, 48(sp) +.endif +.if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + sw \r14, 52(sp) +.endif +.endm + +/* + * Restores set of registers from stack. Maximum number of registers that + * can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are added to stack pointer (sp) + * after registers are restored (offset must be multiple of 4, and must + * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is + * intended to be used in combination with SAVE_REGS_ON_STACK macro. + * Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 +.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) + .error "Stack offset must be non-negative and a multiple of 4."
+.endif + lw \r1, 0(sp) +.if \r2 != 0 + lw \r2, 4(sp) +.endif +.if \r3 != 0 + lw \r3, 8(sp) +.endif +.if \r4 != 0 + lw \r4, 12(sp) +.endif +.if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + lw \r5, 16(sp) +.endif +.if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + lw \r6, 20(sp) +.endif +.if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + lw \r7, 24(sp) +.endif +.if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + lw \r8, 28(sp) +.endif +.if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + lw \r9, 32(sp) +.endif +.if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + lw \r10, 36(sp) +.endif +.if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + lw \r11, 40(sp) +.endif +.if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + lw \r12, 44(sp) +.endif +.if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + lw \r13, 48(sp) +.endif +.if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + lw \r14, 52(sp) +.endif +.if \stack_offset != 0 + addiu sp, sp, \stack_offset +.endif +.endm diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jccolext-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jccolext-mmi.c new file mode 100644 index 0000000000..558eb2ab10 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jccolext-mmi.c @@ -0,0 +1,455 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen + * SunZhangzhi + * CaiWanwei + * ZhangLixia + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jccolor-mmi.c */ + + +#if RGB_RED == 0 +#define mmA re +#define mmB ro +#elif RGB_GREEN == 0 +#define mmA ge +#define mmB go +#elif RGB_BLUE == 0 +#define mmA be +#define mmB bo +#else +#define mmA xe +#define mmB xo +#endif + +#if RGB_RED == 1 +#define mmC re +#define mmD ro +#elif RGB_GREEN == 1 +#define mmC ge +#define mmD go +#elif RGB_BLUE == 1 +#define mmC be +#define mmD bo +#else +#define mmC xe +#define mmD xo +#endif + +#if RGB_RED == 2 +#define mmE re +#define mmF ro +#elif RGB_GREEN == 2 +#define mmE ge +#define mmF go +#elif RGB_BLUE == 2 +#define mmE be +#define mmF bo +#else +#define mmE xe +#define mmF xo +#endif + +#if RGB_RED == 3 +#define mmG re +#define mmH ro +#elif RGB_GREEN == 3 +#define mmG ge +#define mmH go +#elif RGB_BLUE == 3 +#define mmG be +#define mmH bo +#else +#define mmG xe +#define mmH xo +#endif + + +void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + JSAMPROW inptr, outptr0, outptr1, outptr2; + int num_cols, col; + __m64 re, ro, ge, go, be, bo, xe; +#if RGB_PIXELSIZE == 4 + __m64 xo; +#endif + __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho; + __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho; + __m64 rle, halfrle, rhe, 
halfrhe, rlo, halfrlo, rho, halfrho; + __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye; + __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y; + __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb; + __m64 crle, crhe, cre, crlo, crho, cro, cr; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + for (num_cols = image_width; num_cols > 0; num_cols -= 8, + outptr0 += 8, outptr1 += 8, outptr2 += 8) { + +#if RGB_PIXELSIZE == 3 + + if (num_cols < 8) { + col = num_cols * 3; + asm(".set noreorder\r\n" + + "li $8, 1\r\n" + "move $9, %3\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 1f\r\n" + "nop \r\n" + "subu $9, $9, 1\r\n" + "xor $12, $12, $12\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $9\r\n" + "lbu $12, 0($13)\r\n" + + "1: \r\n" + "li $8, 2\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 2f\r\n" + "nop \r\n" + "subu $9, $9, 2\r\n" + "xor $11, $11, $11\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $9\r\n" + "lhu $11, 0($13)\r\n" + "sll $12, $12, 16\r\n" + "or $12, $12, $11\r\n" + + "2: \r\n" + "dmtc1 $12, %0\r\n" + "li $8, 4\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 3f\r\n" + "nop \r\n" + "subu $9, $9, 4\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $9\r\n" + "lwu $14, 0($13)\r\n" + "dmtc1 $14, %1\r\n" + "dsll32 $12, $12, 0\r\n" + "or $12, $12, $14\r\n" + "dmtc1 $12, %0\r\n" + + "3: \r\n" + "li $8, 8\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 4f\r\n" + "nop \r\n" + "mov.s %1, %0\r\n" + "ldc1 %0, 0(%5)\r\n" + "li $9, 8\r\n" + "j 5f\r\n" + "nop \r\n" + + "4: \r\n" + "li $8, 16\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 5f\r\n" + "nop \r\n" + "mov.s %2, %0\r\n" + "ldc1 %0, 0(%5)\r\n" + "ldc1 %1, 8(%5)\r\n" + + "5: \r\n" + "nop \r\n" + ".set reorder\r\n" + + : "=f" (mmA), "=f" (mmG), "=f" (mmF) + : "r" (col), "r" (num_rows), "r" (inptr) + : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13", + "$14", "memory" + ); + } else { + if 
(!(((long)inptr) & 7)) { + mmA = _mm_load_si64((__m64 *)&inptr[0]); + mmG = _mm_load_si64((__m64 *)&inptr[8]); + mmF = _mm_load_si64((__m64 *)&inptr[16]); + } else { + mmA = _mm_loadu_si64((__m64 *)&inptr[0]); + mmG = _mm_loadu_si64((__m64 *)&inptr[8]); + mmF = _mm_loadu_si64((__m64 *)&inptr[16]); + } + inptr += RGB_PIXELSIZE * 8; + } + mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT); + mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); + + mmA = _mm_unpackhi_pi8(mmA, mmG); + mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT); + + mmD = _mm_unpacklo_pi8(mmD, mmF); + mmG = _mm_unpackhi_pi8(mmG, mmF); + + mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT); + mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); + + mmA = _mm_unpackhi_pi8(mmA, mmD); + mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT); + + mmE = _mm_unpacklo_pi8(mmE, mmG); + mmD = _mm_unpackhi_pi8(mmD, mmG); + mmC = _mm_loadhi_pi8_f(mmA); + mmA = _mm_loadlo_pi8_f(mmA); + + mmB = _mm_loadhi_pi8_f(mmE); + mmE = _mm_loadlo_pi8_f(mmE); + + mmF = _mm_loadhi_pi8_f(mmD); + mmD = _mm_loadlo_pi8_f(mmD); + +#else /* RGB_PIXELSIZE == 4 */ + + if (num_cols < 8) { + col = num_cols; + asm(".set noreorder\r\n" + + "li $8, 1\r\n" + "move $9, %4\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 1f\r\n" + "nop \r\n" + "subu $9, $9, 1\r\n" + PTR_SLL "$11, $9, 2\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $11\r\n" + "lwc1 %0, 0($13)\r\n" + + "1: \r\n" + "li $8, 2\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 2f\r\n" + "nop \r\n" + "subu $9, $9, 2\r\n" + PTR_SLL "$11, $9, 2\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $11\r\n" + "mov.s %1, %0\r\n" + "ldc1 %0, 0($13)\r\n" + + "2: \r\n" + "li $8, 4\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 3f\r\n" + "nop \r\n" + "mov.s %2, %0\r\n" + "mov.s %3, %1\r\n" + "ldc1 %0, 0(%5)\r\n" + "ldc1 %1, 8(%5)\r\n" + + "3: \r\n" + "nop \r\n" + ".set reorder\r\n" + + : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC) + : "r" (col), "r" (inptr) + : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory" + ); + } else { + if (!(((long)inptr) & 7)) { + mmA = 
_mm_load_si64((__m64 *)&inptr[0]); + mmF = _mm_load_si64((__m64 *)&inptr[8]); + mmD = _mm_load_si64((__m64 *)&inptr[16]); + mmC = _mm_load_si64((__m64 *)&inptr[24]); + } else { + mmA = _mm_loadu_si64((__m64 *)&inptr[0]); + mmF = _mm_loadu_si64((__m64 *)&inptr[8]); + mmD = _mm_loadu_si64((__m64 *)&inptr[16]); + mmC = _mm_loadu_si64((__m64 *)&inptr[24]); + } + inptr += RGB_PIXELSIZE * 8; + } + mmB = _mm_unpackhi_pi8(mmA, mmF); + mmA = _mm_unpacklo_pi8(mmA, mmF); + + mmG = _mm_unpackhi_pi8(mmD, mmC); + mmD = _mm_unpacklo_pi8(mmD, mmC); + + mmE = _mm_unpackhi_pi16(mmA, mmD); + mmA = _mm_unpacklo_pi16(mmA, mmD); + + mmH = _mm_unpackhi_pi16(mmB, mmG); + mmB = _mm_unpacklo_pi16(mmB, mmG); + + mmC = _mm_loadhi_pi8_f(mmA); + mmA = _mm_loadlo_pi8_f(mmA); + + mmD = _mm_loadhi_pi8_f(mmB); + mmB = _mm_loadlo_pi8_f(mmB); + + mmG = _mm_loadhi_pi8_f(mmE); + mmE = _mm_loadlo_pi8_f(mmE); + + mmF = _mm_unpacklo_pi8(mmH, mmH); + mmH = _mm_unpackhi_pi8(mmH, mmH); + mmF = _mm_srli_pi16(mmF, BYTE_BIT); + mmH = _mm_srli_pi16(mmH, BYTE_BIT); + +#endif + + /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6) + * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7) + * + * (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + */ + + rglo = _mm_unpacklo_pi16(ro, go); + rgho = _mm_unpackhi_pi16(ro, go); + ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337); + yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337); + cblo = _mm_madd_pi16(rglo, PW_MF016_MF033); + cbho = _mm_madd_pi16(rgho, PW_MF016_MF033); + + blo = _mm_loadlo_pi16_f(bo); + bho = _mm_loadhi_pi16_f(bo); + halfblo = _mm_srli_pi32(blo, 1); + halfbho = _mm_srli_pi32(bho, 1); + + cblo = 
_mm_add_pi32(cblo, halfblo); + cbho = _mm_add_pi32(cbho, halfbho); + cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ); + cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ); + cblo = _mm_srli_pi32(cblo, SCALEBITS); + cbho = _mm_srli_pi32(cbho, SCALEBITS); + cbo = _mm_packs_pi32(cblo, cbho); + + rgle = _mm_unpacklo_pi16(re, ge); + rghe = _mm_unpackhi_pi16(re, ge); + yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337); + yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337); + cble = _mm_madd_pi16(rgle, PW_MF016_MF033); + cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033); + + ble = _mm_loadlo_pi16_f(be); + bhe = _mm_loadhi_pi16_f(be); + halfble = _mm_srli_pi32(ble, 1); + halfbhe = _mm_srli_pi32(bhe, 1); + + cble = _mm_add_pi32(cble, halfble); + cbhe = _mm_add_pi32(cbhe, halfbhe); + cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ); + cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ); + cble = _mm_srli_pi32(cble, SCALEBITS); + cbhe = _mm_srli_pi32(cbhe, SCALEBITS); + cbe = _mm_packs_pi32(cble, cbhe); + + cbo = _mm_slli_pi16(cbo, BYTE_BIT); + cb = _mm_or_si64(cbe, cbo); + + bglo = _mm_unpacklo_pi16(bo, go); + bgho = _mm_unpackhi_pi16(bo, go); + ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250); + yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250); + crlo = _mm_madd_pi16(bglo, PW_MF008_MF041); + crho = _mm_madd_pi16(bgho, PW_MF008_MF041); + + ylo = _mm_add_pi32(ylo_bg, ylo_rg); + yho = _mm_add_pi32(yho_bg, yho_rg); + ylo = _mm_add_pi32(ylo, PD_ONEHALF); + yho = _mm_add_pi32(yho, PD_ONEHALF); + ylo = _mm_srli_pi32(ylo, SCALEBITS); + yho = _mm_srli_pi32(yho, SCALEBITS); + yo = _mm_packs_pi32(ylo, yho); + + rlo = _mm_loadlo_pi16_f(ro); + rho = _mm_loadhi_pi16_f(ro); + halfrlo = _mm_srli_pi32(rlo, 1); + halfrho = _mm_srli_pi32(rho, 1); + + crlo = _mm_add_pi32(crlo, halfrlo); + crho = _mm_add_pi32(crho, halfrho); + crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ); + crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ); + crlo = _mm_srli_pi32(crlo, SCALEBITS); + crho = _mm_srli_pi32(crho, SCALEBITS); + cro = _mm_packs_pi32(crlo, crho); + + bgle = 
_mm_unpacklo_pi16(be, ge); + bghe = _mm_unpackhi_pi16(be, ge); + yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250); + yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250); + crle = _mm_madd_pi16(bgle, PW_MF008_MF041); + crhe = _mm_madd_pi16(bghe, PW_MF008_MF041); + + yle = _mm_add_pi32(yle_bg, yle_rg); + yhe = _mm_add_pi32(yhe_bg, yhe_rg); + yle = _mm_add_pi32(yle, PD_ONEHALF); + yhe = _mm_add_pi32(yhe, PD_ONEHALF); + yle = _mm_srli_pi32(yle, SCALEBITS); + yhe = _mm_srli_pi32(yhe, SCALEBITS); + ye = _mm_packs_pi32(yle, yhe); + + yo = _mm_slli_pi16(yo, BYTE_BIT); + y = _mm_or_si64(ye, yo); + + rle = _mm_loadlo_pi16_f(re); + rhe = _mm_loadhi_pi16_f(re); + halfrle = _mm_srli_pi32(rle, 1); + halfrhe = _mm_srli_pi32(rhe, 1); + + crle = _mm_add_pi32(crle, halfrle); + crhe = _mm_add_pi32(crhe, halfrhe); + crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ); + crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ); + crle = _mm_srli_pi32(crle, SCALEBITS); + crhe = _mm_srli_pi32(crhe, SCALEBITS); + cre = _mm_packs_pi32(crle, crhe); + + cro = _mm_slli_pi16(cro, BYTE_BIT); + cr = _mm_or_si64(cre, cro); + + _mm_store_si64((__m64 *)&outptr0[0], y); + _mm_store_si64((__m64 *)&outptr1[0], cb); + _mm_store_si64((__m64 *)&outptr2[0], cr); + } + } +} + +#undef mmA +#undef mmB +#undef mmC +#undef mmD +#undef mmE +#undef mmF +#undef mmG +#undef mmH diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jccolor-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jccolor-mmi.c new file mode 100644 index 0000000000..93ef5c79f7 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jccolor-mmi.c @@ -0,0 +1,148 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen + * CaiWanwei + * SunZhangzhi + * + * This software is provided 'as-is', without any express or implied + * warranty. 
In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* RGB --> YCC CONVERSION */ + +#include "jsimd_mmi.h" + + +#define F_0_081 ((short)5329) /* FIX(0.08131) */ +#define F_0_114 ((short)7471) /* FIX(0.11400) */ +#define F_0_168 ((short)11059) /* FIX(0.16874) */ +#define F_0_250 ((short)16384) /* FIX(0.25000) */ +#define F_0_299 ((short)19595) /* FIX(0.29900) */ +#define F_0_331 ((short)21709) /* FIX(0.33126) */ +#define F_0_418 ((short)27439) /* FIX(0.41869) */ +#define F_0_587 ((short)38470) /* FIX(0.58700) */ /* NOTE(review): 38470 is above 32767, so assuming a 16-bit short the cast cannot keep the value as written; in this file it is only used to derive F_0_337 on the next line */ +#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */ + /* Names for the slots of const_value[] below; the enumerator order has to match the initializer order. */ +enum const_index { + index_PD_ONEHALF, + index_PW_F0299_F0337, + index_PW_F0114_F0250, + index_PW_MF016_MF033, + index_PW_MF008_MF041, + index_PD_ONEHALFM1_CJ +}; + /* One 64-bit packed constant per const_index entry, built with the _uint64_set_pi32 / _uint64_set_pi16 helpers. */ +static uint64_t const_value[] = { + _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))), + _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299), + _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114), + _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168), + _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081), + _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)), + ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS))) +}; +
+#define get_const_value(index) (*(__m64 *)&const_value[index]) /* reads const_value[index] through an __m64 pointer */ + +#define PD_ONEHALF get_const_value(index_PD_ONEHALF) +#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337) +#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250) +#define PW_MF016_MF033 get_const_value(index_PW_MF016_MF033) +#define PW_MF008_MF041 get_const_value(index_PW_MF008_MF041) +#define PD_ONEHALFM1_CJ get_const_value(index_PD_ONEHALFM1_CJ) + + /* jccolext-mmi.c is used as a template: each inclusion below emits one converter. This first inclusion comes before any rename, so it uses whatever RGB_* values and function name are already in effect (presumably supplied via jsimd_mmi.h - not visible here). Every later stanza redefines RGB_* to one EXT_* layout, renames jsimd_rgb_ycc_convert_mmi with a #define, includes the template again and then #undefs the macros. */ +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extrgb_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extrgbx_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extbgr_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extbgrx_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define
RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extxbgr_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extxrgb_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jcgray-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jcgray-mmi.c new file mode 100644 index 0000000000..9c7b833f2e --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jcgray-mmi.c @@ -0,0 +1,132 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhangLixia + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution.
+ */ + +/* RGB --> GRAYSCALE CONVERSION */ + +#include "jsimd_mmi.h" + + +#define F_0_114 ((short)7471) /* FIX(0.11400) */ +#define F_0_250 ((short)16384) /* FIX(0.25000) */ +#define F_0_299 ((short)19595) /* FIX(0.29900) */ +#define F_0_587 ((short)38470) /* FIX(0.58700) */ /* NOTE(review): 38470 is above 32767, so assuming a 16-bit short the cast cannot keep the value as written; in this file it is only used to derive F_0_337 on the next line */ +#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */ + /* Names for the slots of const_value[] below; the enumerator order has to match the initializer order. */ +enum const_index { + index_PD_ONEHALF, + index_PW_F0299_F0337, + index_PW_F0114_F0250 +}; + /* One 64-bit packed constant per const_index entry: the rounding term and the two coefficient pairs. */ +static uint64_t const_value[] = { + _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))), + _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299), + _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114) +}; + +#define get_const_value(index) (*(__m64 *)&const_value[index]) /* reads const_value[index] through an __m64 pointer */ + +#define PD_ONEHALF get_const_value(index_PD_ONEHALF) +#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337) +#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250) + + /* jcgryext-mmi.c is used as a template: each inclusion below emits one converter. This first inclusion comes before any rename, so it uses whatever RGB_* values and function name are already in effect (presumably supplied via jsimd_mmi.h - not visible here). Every later stanza redefines RGB_* to one EXT_* layout, renames jsimd_rgb_gray_convert_mmi with a #define, includes the template again and then #undefs the macros. */ +#include "jcgryext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_gray_convert_mmi jsimd_extrgb_gray_convert_mmi +#include "jcgryext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_mmi + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_rgb_gray_convert_mmi jsimd_extrgbx_gray_convert_mmi +#include "jcgryext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_mmi + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_rgb_gray_convert_mmi jsimd_extbgr_gray_convert_mmi +#include "jcgryext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef
RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_mmi + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_rgb_gray_convert_mmi jsimd_extbgrx_gray_convert_mmi +#include "jcgryext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_mmi + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_rgb_gray_convert_mmi jsimd_extxbgr_gray_convert_mmi +#include "jcgryext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_mmi + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_rgb_gray_convert_mmi jsimd_extxrgb_gray_convert_mmi +#include "jcgryext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_mmi diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jcgryext-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jcgryext-mmi.c new file mode 100644 index 0000000000..08a83d6699 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jcgryext-mmi.c @@ -0,0 +1,374 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhangLixia + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software.
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jcgray-mmi.c */ + /* The mmA..mmH names used by the load/shuffle code are aliased below to channel variables: the pair for byte position k (mmA/mmB for 0, mmC/mmD for 1, mmE/mmF for 2, mmG/mmH for 3) becomes re/ro, ge/go or be/bo depending on which of RGB_RED, RGB_GREEN, RGB_BLUE equals k, and xe/xo when none does. */ + +#if RGB_RED == 0 +#define mmA re +#define mmB ro +#elif RGB_GREEN == 0 +#define mmA ge +#define mmB go +#elif RGB_BLUE == 0 +#define mmA be +#define mmB bo +#else +#define mmA xe +#define mmB xo +#endif + +#if RGB_RED == 1 +#define mmC re +#define mmD ro +#elif RGB_GREEN == 1 +#define mmC ge +#define mmD go +#elif RGB_BLUE == 1 +#define mmC be +#define mmD bo +#else +#define mmC xe +#define mmD xo +#endif + +#if RGB_RED == 2 +#define mmE re +#define mmF ro +#elif RGB_GREEN == 2 +#define mmE ge +#define mmF go +#elif RGB_BLUE == 2 +#define mmE be +#define mmF bo +#else +#define mmE xe +#define mmF xo +#endif + +#if RGB_RED == 3 +#define mmG re +#define mmH ro +#elif RGB_GREEN == 3 +#define mmG ge +#define mmH go +#elif RGB_BLUE == 3 +#define mmG be +#define mmH bo +#else +#define mmG xe +#define mmH xo +#endif + /* Converts num_rows rows of packed RGB_PIXELSIZE-byte pixels from input_buf into the single plane output_buf[0], starting at row output_row; every inner-loop pass stores 8 output bytes. */ + +void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + JSAMPROW inptr, outptr; + int num_cols, col; + __m64 re, ro, ge, go, be, bo, xe; +#if RGB_PIXELSIZE == 4 + __m64 xo; +#endif + __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho; + __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye; + __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+ + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr = output_buf[0][output_row]; + output_row++; + + for (num_cols = image_width; num_cols > 0; num_cols -= 8, + outptr += 8) { + +#if RGB_PIXELSIZE == 3 + + if (num_cols < 8) { /* fewer than 8 pixels left in the row: col is the remaining byte count, and the asm tests its 1, 2, 4, 8 and 16 bits in turn to pick the loads */ + col = num_cols * 3; + asm(".set noreorder\r\n" + + "li $8, 1\r\n" + "move $9, %3\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 1f\r\n" + "nop \r\n" + "subu $9, $9, 1\r\n" + "xor $12, $12, $12\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $9\r\n" + "lbu $12, 0($13)\r\n" + + "1: \r\n" + "li $8, 2\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 2f\r\n" + "nop \r\n" + "subu $9, $9, 2\r\n" + "xor $11, $11, $11\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $9\r\n" + "lhu $11, 0($13)\r\n" + "sll $12, $12, 16\r\n" + "or $12, $12, $11\r\n" + + "2: \r\n" + "dmtc1 $12, %0\r\n" + "li $8, 4\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 3f\r\n" + "nop \r\n" + "subu $9, $9, 4\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $9\r\n" + "lwu $14, 0($13)\r\n" + "dmtc1 $14, %1\r\n" + "dsll32 $12, $12, 0\r\n" + "or $12, $12, $14\r\n" + "dmtc1 $12, %0\r\n" + + "3: \r\n" + "li $8, 8\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 4f\r\n" + "nop \r\n" + "mov.s %1, %0\r\n" + "ldc1 %0, 0(%5)\r\n" + "li $9, 8\r\n" + "j 5f\r\n" + "nop \r\n" + + "4: \r\n" + "li $8, 16\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 5f\r\n" + "nop \r\n" + "mov.s %2, %0\r\n" + "ldc1 %0, 0(%5)\r\n" + "ldc1 %1, 8(%5)\r\n" + + "5: \r\n" + "nop \r\n" + ".set reorder\r\n" + + : "=f" (mmA), "=f" (mmG), "=f" (mmF) + : "r" (col), "r" (num_rows), "r" (inptr) /* NOTE(review): the template only references %0-%3 and %5; operand %4 (num_rows) is never used - confirm it is just a leftover */ + : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13", + "$14", "memory" + ); + } else { + if (!(((long)inptr) & 7)) { /* low three address bits clear: use the aligned loads, otherwise the unaligned variants */ + mmA = _mm_load_si64((__m64 *)&inptr[0]); + mmG = _mm_load_si64((__m64 *)&inptr[8]); + mmF = _mm_load_si64((__m64 *)&inptr[16]); + } else { + mmA = _mm_loadu_si64((__m64 *)&inptr[0]); + mmG = _mm_loadu_si64((__m64 *)&inptr[8]); + mmF = _mm_loadu_si64((__m64 *)&inptr[16]); + } + inptr += RGB_PIXELSIZE * 8; + } + mmD =
_mm_srli_si64(mmA, 4 * BYTE_BIT); + mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); + + mmA = _mm_unpackhi_pi8(mmA, mmG); + mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT); + + mmD = _mm_unpacklo_pi8(mmD, mmF); + mmG = _mm_unpackhi_pi8(mmG, mmF); + + mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT); + mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); + + mmA = _mm_unpackhi_pi8(mmA, mmD); + mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT); + + mmE = _mm_unpacklo_pi8(mmE, mmG); + mmD = _mm_unpackhi_pi8(mmD, mmG); + mmC = _mm_loadhi_pi8_f(mmA); + mmA = _mm_loadlo_pi8_f(mmA); + + mmB = _mm_loadhi_pi8_f(mmE); + mmE = _mm_loadlo_pi8_f(mmE); + + mmF = _mm_loadhi_pi8_f(mmD); + mmD = _mm_loadlo_pi8_f(mmD); + +#else /* RGB_PIXELSIZE == 4 */ + + if (num_cols < 8) { /* fewer than 8 pixels left: here col counts pixels, and the asm scales it to a byte offset with PTR_SLL ..., 2 */ + col = num_cols; + asm(".set noreorder\r\n" + + "li $8, 1\r\n" + "move $9, %4\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 1f\r\n" + "nop \r\n" + "subu $9, $9, 1\r\n" + PTR_SLL "$11, $9, 2\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $11\r\n" + "lwc1 %0, 0($13)\r\n" + + "1: \r\n" + "li $8, 2\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 2f\r\n" + "nop \r\n" + "subu $9, $9, 2\r\n" + PTR_SLL "$11, $9, 2\r\n" + "move $13, %5\r\n" + PTR_ADDU "$13, $13, $11\r\n" + "mov.s %1, %0\r\n" + "ldc1 %0, 0($13)\r\n" + + "2: \r\n" + "li $8, 4\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 3f\r\n" + "nop \r\n" + "mov.s %2, %0\r\n" + "mov.s %3, %1\r\n" + "ldc1 %0, 0(%5)\r\n" + "ldc1 %1, 8(%5)\r\n" + + "3: \r\n" + "nop \r\n" + ".set reorder\r\n" + + : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC) + : "r" (col), "r" (inptr) + : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory" + ); + } else { + if (!(((long)inptr) & 7)) { + mmA = _mm_load_si64((__m64 *)&inptr[0]); + mmF = _mm_load_si64((__m64 *)&inptr[8]); + mmD = _mm_load_si64((__m64 *)&inptr[16]); + mmC = _mm_load_si64((__m64 *)&inptr[24]); + } else { + mmA = _mm_loadu_si64((__m64 *)&inptr[0]); + mmF = _mm_loadu_si64((__m64 *)&inptr[8]); + mmD = _mm_loadu_si64((__m64 *)&inptr[16]); + mmC = _mm_loadu_si64((__m64 *)&inptr[24]); + }
+ inptr += RGB_PIXELSIZE * 8; + } + mmB = _mm_unpackhi_pi8(mmA, mmF); + mmA = _mm_unpacklo_pi8(mmA, mmF); + + mmG = _mm_unpackhi_pi8(mmD, mmC); + mmD = _mm_unpacklo_pi8(mmD, mmC); + + mmE = _mm_unpackhi_pi16(mmA, mmD); + mmA = _mm_unpacklo_pi16(mmA, mmD); + + mmH = _mm_unpackhi_pi16(mmB, mmG); + mmB = _mm_unpacklo_pi16(mmB, mmG); + + mmC = _mm_loadhi_pi8_f(mmA); + mmA = _mm_loadlo_pi8_f(mmA); + + mmD = _mm_loadhi_pi8_f(mmB); + mmB = _mm_loadlo_pi8_f(mmB); + + mmG = _mm_loadhi_pi8_f(mmE); + mmE = _mm_loadlo_pi8_f(mmE); + + mmF = _mm_unpacklo_pi8(mmH, mmH); + mmH = _mm_unpackhi_pi8(mmH, mmH); + mmF = _mm_srli_pi16(mmF, BYTE_BIT); + mmH = _mm_srli_pi16(mmH, BYTE_BIT); + +#endif + + /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6) + * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7) + * + * (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + */ + + rglo = _mm_unpacklo_pi16(ro, go); + rgho = _mm_unpackhi_pi16(ro, go); + ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337); + yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337); + + rgle = _mm_unpacklo_pi16(re, ge); + rghe = _mm_unpackhi_pi16(re, ge); + yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337); + yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337); + + bglo = _mm_unpacklo_pi16(bo, go); + bgho = _mm_unpackhi_pi16(bo, go); + ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250); + yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250); + + ylo = _mm_add_pi32(ylo_bg, ylo_rg); + yho = _mm_add_pi32(yho_bg, yho_rg); + ylo = _mm_add_pi32(ylo, PD_ONEHALF); + yho = _mm_add_pi32(yho, PD_ONEHALF); + ylo = _mm_srli_pi32(ylo, SCALEBITS); + yho = _mm_srli_pi32(yho, SCALEBITS); + yo = _mm_packs_pi32(ylo, yho); + + bgle = _mm_unpacklo_pi16(be, ge); + bghe = _mm_unpackhi_pi16(be, ge); + yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250); + yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250); + + yle = _mm_add_pi32(yle_bg, yle_rg); + yhe = _mm_add_pi32(yhe_bg, yhe_rg); + yle =
_mm_add_pi32(yle, PD_ONEHALF); + yhe = _mm_add_pi32(yhe, PD_ONEHALF); + yle = _mm_srli_pi32(yle, SCALEBITS); + yhe = _mm_srli_pi32(yhe, SCALEBITS); + ye = _mm_packs_pi32(yle, yhe); + + yo = _mm_slli_pi16(yo, BYTE_BIT); + y = _mm_or_si64(ye, yo); /* each 16-bit lane is now ye OR (yo shifted left by BYTE_BIT), i.e. even-pixel result in the low byte and odd-pixel result in the high byte */ + + _mm_store_si64((__m64 *)&outptr[0], y); + } + } +} + +#undef mmA +#undef mmB +#undef mmC +#undef mmD +#undef mmE +#undef mmF +#undef mmG +#undef mmH diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample-mmi.c new file mode 100644 index 0000000000..0354dac087 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample-mmi.c @@ -0,0 +1,98 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen + * CaiWanwei + * SunZhangzhi + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution.
+ */ + +/* CHROMA DOWNSAMPLING */ + +#include "jsimd_mmi.h" +#include "jcsample.h" + /* 2:1 horizontal and 2:1 vertical downsampling: each output sample is (sum of a 2x2 input block + bias) >> 2. Produces v_samp_factor output rows from 2 * v_samp_factor input rows, width_in_blocks * DCTSIZE samples per output row. */ + +void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int inrow, outrow, outcol; + JDIMENSION output_cols = width_in_blocks * DCTSIZE; + JSAMPROW inptr0, inptr1, outptr; + __m64 bias, mask = 0.0, thisavg, nextavg, avg; + __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum; + __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum; + /* pad every input row out to output_cols * 2 samples so the vector loop below can read whole 16-sample groups */ + expand_right_edge(input_data, max_v_samp_factor, image_width, + output_cols * 2); + + bias = _mm_set1_pi32((1 << 17) + 1); /* 0x00020001 (32-bit bias pattern) */ + /* bias={1, 2, 1, 2} (16-bit) */ + mask = _mm_cmpeq_pi16(mask, mask); + mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */ + + for (inrow = 0, outrow = 0; outrow < v_samp_factor; + inrow += 2, outrow++) { + + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr = output_data[outrow]; + /* each pass reads 16 samples from each of the two input rows and writes 8 output samples */ + for (outcol = output_cols; outcol > 0; + outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) { + + this0 = _mm_load_si64((__m64 *)&inptr0[0]); + this1 = _mm_load_si64((__m64 *)&inptr1[0]); + next0 = _mm_load_si64((__m64 *)&inptr0[8]); + next1 = _mm_load_si64((__m64 *)&inptr1[8]); + /* split every 16-bit lane into its low byte (AND with mask) and high byte (shift right) and add them: horizontal pair sums */ + this0o = _mm_and_si64(this0, mask); + this0e = _mm_srli_pi16(this0, BYTE_BIT); + this1o = _mm_and_si64(this1, mask); + this1e = _mm_srli_pi16(this1, BYTE_BIT); + this0sum = _mm_add_pi16(this0o, this0e); + this1sum = _mm_add_pi16(this1o, this1e); + + next0o = _mm_and_si64(next0, mask); + next0e = _mm_srli_pi16(next0, BYTE_BIT); + next1o = _mm_and_si64(next1, mask); + next1e = _mm_srli_pi16(next1, BYTE_BIT); + next0sum = _mm_add_pi16(next0o, next0e); + next1sum = _mm_add_pi16(next1o, next1e); + /* add the two rows, then bias and shift right by 2 to get the 2x2 average */ + thisavg = _mm_add_pi16(this0sum, this1sum); + nextavg = _mm_add_pi16(next0sum, next1sum); + thisavg = _mm_add_pi16(thisavg,
bias); + nextavg = _mm_add_pi16(nextavg, bias); + thisavg = _mm_srli_pi16(thisavg, 2); + nextavg = _mm_srli_pi16(nextavg, 2); + + avg = _mm_packs_pu16(thisavg, nextavg); + + _mm_store_si64((__m64 *)&outptr[0], avg); + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample.h b/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample.h new file mode 100644 index 0000000000..bd07fcc4ed --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample.h @@ -0,0 +1,28 @@ +/* + * jcsample.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + /* For each of the first num_rows rows of image_data, copies the sample at column input_cols - 1 into columns input_cols .. output_cols - 1. Does nothing when output_cols - input_cols, taken as an int, is not positive. The rows must already have room for output_cols samples; nothing here allocates. */ +LOCAL(void) +expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, + JDIMENSION output_cols) +{ + register JSAMPROW ptr; + register JSAMPLE pixval; + register int count; + int row; + int numcols = (int)(output_cols - input_cols); + + if (numcols > 0) { + for (row = 0; row < num_rows; row++) { + ptr = image_data[row] + input_cols; + pixval = ptr[-1]; /* last real sample of the row */ + for (count = numcols; count > 0; count--) + *ptr++ = pixval; + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolext-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolext-mmi.c new file mode 100644 index 0000000000..3b5b2f2030 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolext-mmi.c @@ -0,0 +1,415 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen + * SunZhangzhi + * CaiWanwei + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty.
In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdcolor-mmi.c */ + /* The mmA..mmH names used by the interleave/store code are aliased below to channel variables: the pair for byte position k (mmA/mmB for 0, mmC/mmD for 1, mmE/mmF for 2, mmG/mmH for 3) becomes re/ro, ge/go or be/bo depending on which of RGB_RED, RGB_GREEN, RGB_BLUE equals k, and xe/xo when none does. */ + +#if RGB_RED == 0 +#define mmA re +#define mmB ro +#elif RGB_GREEN == 0 +#define mmA ge +#define mmB go +#elif RGB_BLUE == 0 +#define mmA be +#define mmB bo +#else +#define mmA xe +#define mmB xo +#endif + +#if RGB_RED == 1 +#define mmC re +#define mmD ro +#elif RGB_GREEN == 1 +#define mmC ge +#define mmD go +#elif RGB_BLUE == 1 +#define mmC be +#define mmD bo +#else +#define mmC xe +#define mmD xo +#endif + +#if RGB_RED == 2 +#define mmE re +#define mmF ro +#elif RGB_GREEN == 2 +#define mmE ge +#define mmF go +#elif RGB_BLUE == 2 +#define mmE be +#define mmF bo +#else +#define mmE xe +#define mmF xo +#endif + +#if RGB_RED == 3 +#define mmG re +#define mmH ro +#elif RGB_GREEN == 3 +#define mmG ge +#define mmH go +#elif RGB_BLUE == 3 +#define mmG be +#define mmH bo +#else +#define mmG xe +#define mmH xo +#endif + /* Converts num_rows rows, starting at input_row, from the three planes input_buf[0..2] (loaded below as y, cb, cr) into packed RGB_PIXELSIZE-byte pixels written through output_buf; every inner-loop pass handles 8 pixels. */ + +void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int num_cols, col; + __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr; + __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe =
0.0, xo = 0.0; + __m64 decenter, mask; + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + + for (num_cols = out_width; num_cols > 0; num_cols -= 8, + inptr0 += 8, inptr1 += 8, inptr2 += 8) { + + cb = _mm_load_si64((__m64 *)inptr1); + cr = _mm_load_si64((__m64 *)inptr2); + y = _mm_load_si64((__m64 *)inptr0); + + mask = decenter = 0.0; + mask = _mm_cmpeq_pi16(mask, mask); + decenter = _mm_cmpeq_pi16(decenter, decenter); + mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */ + decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */ + + cbe = _mm_and_si64(mask, cb); /* Cb(0246) */ + cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */ + cre = _mm_and_si64(mask, cr); /* Cr(0246) */ + cro = _mm_srli_pi16(cr, BYTE_BIT); /* Cr(1357) */ + cbe = _mm_add_pi16(cbe, decenter); + cbo = _mm_add_pi16(cbo, decenter); + cre = _mm_add_pi16(cre, decenter); + cro = _mm_add_pi16(cro, decenter); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + + cbe2 = _mm_add_pi16(cbe, cbe); /* 2*CbE */ + cbo2 = _mm_add_pi16(cbo, cbo); /* 2*CbO */ + cre2 = _mm_add_pi16(cre, cre); /* 2*CrE */ + cro2 = _mm_add_pi16(cro, cro); /* 2*CrO */ + + be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2*CbE * -FIX(0.22800)) */ + bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2*CbO * -FIX(0.22800)) */ + re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2*CrE * FIX(0.40200)) */ + ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2*CrO * FIX(0.40200)) */ + + be = _mm_add_pi16(be, PW_ONE); + bo = _mm_add_pi16(bo, PW_ONE); + be = _mm_srai_pi16(be, 1); /* (CbE * -FIX(0.22800)) */ + bo = _mm_srai_pi16(bo, 1); /* (CbO * -FIX(0.22800)) */ + re = _mm_add_pi16(re, PW_ONE); + ro = _mm_add_pi16(ro,
PW_ONE); + re = _mm_srai_pi16(re, 1); /* (CrE * FIX(0.40200)) */ + ro = _mm_srai_pi16(ro, 1); /* (CrO * FIX(0.40200)) */ + + be = _mm_add_pi16(be, cbe); + bo = _mm_add_pi16(bo, cbo); + be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200))=(B-Y)E */ + bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200))=(B-Y)O */ + re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200))=(R-Y)E */ + ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200))=(R-Y)O */ + + gle = _mm_unpacklo_pi16(cbe, cre); + ghe = _mm_unpackhi_pi16(cbe, cre); + gle = _mm_madd_pi16(gle, PW_MF0344_F0285); + ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285); + glo = _mm_unpacklo_pi16(cbo, cro); + gho = _mm_unpackhi_pi16(cbo, cro); + glo = _mm_madd_pi16(glo, PW_MF0344_F0285); + gho = _mm_madd_pi16(gho, PW_MF0344_F0285); + + gle = _mm_add_pi32(gle, PD_ONEHALF); + ghe = _mm_add_pi32(ghe, PD_ONEHALF); + gle = _mm_srai_pi32(gle, SCALEBITS); + ghe = _mm_srai_pi32(ghe, SCALEBITS); + glo = _mm_add_pi32(glo, PD_ONEHALF); + gho = _mm_add_pi32(gho, PD_ONEHALF); + glo = _mm_srai_pi32(glo, SCALEBITS); + gho = _mm_srai_pi32(gho, SCALEBITS); + + ge = _mm_packs_pi32(gle, ghe); /* CbE*-FIX(0.344)+CrE*FIX(0.285) */ + go = _mm_packs_pi32(glo, gho); /* CbO*-FIX(0.344)+CrO*FIX(0.285) */ + ge = _mm_sub_pi16(ge, cre); /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */ + go = _mm_sub_pi16(go, cro); /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */ + + ye = _mm_and_si64(mask, y); /* Y(0246) */ + yo = _mm_srli_pi16(y, BYTE_BIT); /* Y(1357) */ + + re = _mm_add_pi16(re, ye); /* ((R-Y)E+YE)=(R0 R2 R4 R6) */ + ro = _mm_add_pi16(ro, yo); /* ((R-Y)O+YO)=(R1 R3 R5 R7) */ + re = _mm_packs_pu16(re, re); /* (R0 R2 R4 R6 ** ** ** **) */ + ro = _mm_packs_pu16(ro, ro); /* (R1 R3 R5 R7 ** ** ** **) */ + + ge = _mm_add_pi16(ge, ye); /* ((G-Y)E+YE)=(G0 G2 G4 G6) */ + go = _mm_add_pi16(go, yo); /* ((G-Y)O+YO)=(G1 G3 G5 G7) */ + ge = _mm_packs_pu16(ge, ge); /* (G0 G2 G4 G6 ** ** ** **) */ + go = _mm_packs_pu16(go, go); /* (G1 G3 G5 G7 ** ** ** **) */ + + be = _mm_add_pi16(be,
ye); /* (YE+(B-Y)E)=(B0 B2 B4 B6) */ + bo = _mm_add_pi16(bo, yo); /* (YO+(B-Y)O)=(B1 B3 B5 B7) */ + be = _mm_packs_pu16(be, be); /* (B0 B2 B4 B6 ** ** ** **) */ + bo = _mm_packs_pu16(bo, bo); /* (B1 B3 B5 B7 ** ** ** **) */ + +#if RGB_PIXELSIZE == 3 + + /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ + /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ + mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ + mmE = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */ + mmD = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */ + + mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT); + + mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */ + mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */ + + mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT); + mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */ + + mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */ + mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */ + + mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */ + mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 13 23 24 05 15 25) */ + + mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */ + mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */ + mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */ + + if (num_cols >= 8) { /* a full group of 8 pixels: three 8-byte stores, aligned or unaligned depending on the low three bits of outptr */ + if (!(((long)outptr) & 7)) { + _mm_store_si64((__m64 *)outptr, mmA); + _mm_store_si64((__m64 *)(outptr + 8), mmE); + _mm_store_si64((__m64 *)(outptr + 16), mmC); + } else { + _mm_storeu_si64((__m64 *)outptr, mmA); + _mm_storeu_si64((__m64 *)(outptr + 8), mmE); + _mm_storeu_si64((__m64 *)(outptr + 16), mmC); + } + outptr += RGB_PIXELSIZE * 8; + } else { /* fewer than 8 pixels left: col is the remaining byte count, written out by the asm in 16/8/4/2/1-byte steps */ + col = num_cols * 3; + asm(".set noreorder\r\n" + + "li $8, 16\r\n" + "move $9, %4\r\n" + "mov.s $f4, %1\r\n" + "mov.s $f6, %3\r\n" + "move $10, %5\r\n" + "bltu $9, $8, 1f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1
$f4, 0($10)\r\n" + "gssdlc1 $f6, 7+8($10)\r\n" + "gssdrc1 $f6, 8($10)\r\n" + "mov.s $f4, %2\r\n" + "subu $9, $9, 16\r\n" + PTR_ADDU "$10, $10, 16\r\n" + "b 2f\r\n" + "nop \r\n" + + "1: \r\n" + "li $8, 8\r\n" /* st8 */ + "bltu $9, $8, 2f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "mov.s $f4, %3\r\n" + "subu $9, $9, 8\r\n" + PTR_ADDU "$10, $10, 8\r\n" + + "2: \r\n" + "li $8, 4\r\n" /* st4 */ + "mfc1 $11, $f4\r\n" + "bltu $9, $8, 3f\r\n" + "nop \r\n" + "swl $11, 3($10)\r\n" + "swr $11, 0($10)\r\n" + "li $8, 32\r\n" + "mtc1 $8, $f6\r\n" + "dsrl $f4, $f4, $f6\r\n" + "mfc1 $11, $f4\r\n" + "subu $9, $9, 4\r\n" + PTR_ADDU "$10, $10, 4\r\n" + + "3: \r\n" + "li $8, 2\r\n" /* st2 */ + "bltu $9, $8, 4f\r\n" + "nop \r\n" + "ush $11, 0($10)\r\n" + "srl $11, 16\r\n" + "subu $9, $9, 2\r\n" + PTR_ADDU "$10, $10, 2\r\n" + + "4: \r\n" + "li $8, 1\r\n" /* st1 */ + "bltu $9, $8, 5f\r\n" + "nop \r\n" + "sb $11, 0($10)\r\n" + + "5: \r\n" + "nop \r\n" /* end */ /* NOTE(review): this template starts with .set noreorder and never issues a matching .set reorder - confirm that is intended */ + : "=m" (*outptr) + : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr) + : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory" + ); + } + +#else /* RGB_PIXELSIZE == 4 */ + +#ifdef RGBX_FILLER_0XFF + xe = _mm_cmpeq_pi8(xe, xe); + xo = _mm_cmpeq_pi8(xo, xo); +#else + xe = _mm_xor_si64(xe, xe); + xo = _mm_xor_si64(xo, xo); +#endif + /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ + /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ + /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */ + /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */ + + mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ + mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */ + mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */ + mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */ + + mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 06 16 26 36) */ + mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02
12 22 32) */ + mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */ + mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */ + + mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */ + mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */ + mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */ + mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */ + + if (num_cols >= 8) { /* a full group of 8 pixels: four 8-byte stores, aligned or unaligned depending on the low three bits of outptr */ + if (!(((long)outptr) & 7)) { + _mm_store_si64((__m64 *)outptr, mmA); + _mm_store_si64((__m64 *)(outptr + 8), mmD); + _mm_store_si64((__m64 *)(outptr + 16), mmC); + _mm_store_si64((__m64 *)(outptr + 24), mmH); + } else { + _mm_storeu_si64((__m64 *)outptr, mmA); + _mm_storeu_si64((__m64 *)(outptr + 8), mmD); + _mm_storeu_si64((__m64 *)(outptr + 16), mmC); + _mm_storeu_si64((__m64 *)(outptr + 24), mmH); + } + outptr += RGB_PIXELSIZE * 8; + } else { /* fewer than 8 pixels left: here col counts pixels, and the asm stores 4, 2 and then 1 pixel as needed */ + col = num_cols; + asm(".set noreorder\r\n" /* st16 */ + + "li $8, 4\r\n" + "move $9, %6\r\n" + "move $10, %7\r\n" + "mov.s $f4, %2\r\n" + "mov.s $f6, %4\r\n" + "bltu $9, $8, 1f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "gssdlc1 $f6, 7+8($10)\r\n" + "gssdrc1 $f6, 8($10)\r\n" + "mov.s $f4, %3\r\n" + "mov.s $f6, %5\r\n" + "subu $9, $9, 4\r\n" + PTR_ADDU "$10, $10, 16\r\n" + + "1: \r\n" + "li $8, 2\r\n" /* st8 */ + "bltu $9, $8, 2f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "mov.s $f4, $f6\r\n" + "subu $9, $9, 2\r\n" + PTR_ADDU "$10, $10, 8\r\n" + + "2: \r\n" + "li $8, 1\r\n" /* st4 */ + "bltu $9, $8, 3f\r\n" + "nop \r\n" + "gsswlc1 $f4, 3($10)\r\n" + "gsswrc1 $f4, 0($10)\r\n" + + "3: \r\n" + "li %1, 0\r\n" /* end */ /* NOTE(review): this template starts with .set noreorder and never issues a matching .set reorder - confirm that is intended */ + : "=m" (*outptr), "=r" (col) + : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col), + "r" (outptr) + : "$f4", "$f6", "$8", "$9", "$10", "memory" + ); + } + +#endif + + } + } +} + +#undef mmA +#undef mmB +#undef mmC +#undef mmD +#undef mmE +#undef mmF +#undef mmG +#undef mmH diff --git
a/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolor-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolor-mmi.c new file mode 100644 index 0000000000..2c58263dbd --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolor-mmi.c @@ -0,0 +1,139 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen + * CaiWanwei + * SunZhangzhi + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* YCC --> RGB CONVERSION */ + +#include "jsimd_mmi.h" + + +#define F_0_344 ((short)22554) /* FIX(0.34414) */ +#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */ + +enum const_index { /* one enumerator per const_value[] entry below, in the same order */ + index_PW_ONE, + index_PW_F0402, + index_PW_MF0228, + index_PW_MF0344_F0285, + index_PD_ONEHALF +}; + +static uint64_t const_value[] = { + _uint64_set_pi16(1, 1, 1, 1), /* index_PW_ONE */ + _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402), /* index_PW_F0402 */ + _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228), /* index_PW_MF0228 */ + _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344), /* index_PW_MF0344_F0285 */ + _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))) /* index_PD_ONEHALF */ +}; + +#define PW_ONE get_const_value(index_PW_ONE) +#define PW_F0402 get_const_value(index_PW_F0402) +#define PW_MF0228 get_const_value(index_PW_MF0228) +#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285) +#define PD_ONEHALF get_const_value(index_PD_ONEHALF) + +#define RGBX_FILLER_0XFF 1 + + +#include "jdcolext-mmi.c" /* first inclusion: RGB_* are not redefined in this file yet, so whatever the headers provide applies - NOTE(review): presumably the default pixel layout, confirm */ +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgb_convert_mmi +#include "jdcolext-mmi.c" /* re-included once per layout; the #define just before it renames the function the include defines */ +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgbx_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi
jsimd_ycc_extbgr_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgrx_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxbgr_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxrgb_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jdmerge-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdmerge-mmi.c new file mode 100644 index 0000000000..0a39bd5680 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdmerge-mmi.c @@ -0,0 +1,149 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhangLixia + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* YCC --> RGB CONVERSION */ + +#include "jsimd_mmi.h" + + +#define F_0_344 ((short)22554) /* FIX(0.34414) */ +#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */ + +enum const_index { /* one enumerator per const_value[] entry below, in the same order */ + index_PW_ONE, + index_PW_F0402, + index_PW_MF0228, + index_PW_MF0344_F0285, + index_PD_ONEHALF +}; + +static uint64_t const_value[] = { + _uint64_set_pi16(1, 1, 1, 1), /* index_PW_ONE */ + _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402), /* index_PW_F0402 */ + _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228), /* index_PW_MF0228 */ + _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344), /* index_PW_MF0344_F0285 */ + _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))) /* index_PD_ONEHALF */ +}; + +#define PW_ONE get_const_value(index_PW_ONE) +#define PW_F0402 get_const_value(index_PW_F0402) +#define PW_MF0228 get_const_value(index_PW_MF0228) +#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285) +#define PD_ONEHALF get_const_value(index_PD_ONEHALF) + +#define RGBX_FILLER_0XFF 1 + + +#include "jdmrgext-mmi.c" /* first inclusion: RGB_* are not redefined in this file yet, so whatever the headers provide applies - NOTE(review): presumably the default pixel layout, confirm */ +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_h2v1_merged_upsample_mmi
jsimd_h2v1_extrgb_merged_upsample_mmi +#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extrgb_merged_upsample_mmi +#include "jdmrgext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_mmi +#undef jsimd_h2v2_merged_upsample_mmi + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extrgbx_merged_upsample_mmi +#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extrgbx_merged_upsample_mmi +#include "jdmrgext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_mmi +#undef jsimd_h2v2_merged_upsample_mmi + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extbgr_merged_upsample_mmi +#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extbgr_merged_upsample_mmi +#include "jdmrgext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_mmi +#undef jsimd_h2v2_merged_upsample_mmi + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extbgrx_merged_upsample_mmi +#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extbgrx_merged_upsample_mmi +#include "jdmrgext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_mmi +#undef jsimd_h2v2_merged_upsample_mmi + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extxbgr_merged_upsample_mmi +#define jsimd_h2v2_merged_upsample_mmi 
jsimd_h2v2_extxbgr_merged_upsample_mmi +#include "jdmrgext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_mmi +#undef jsimd_h2v2_merged_upsample_mmi + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extxrgb_merged_upsample_mmi +#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extxrgb_merged_upsample_mmi +#include "jdmrgext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_mmi +#undef jsimd_h2v2_merged_upsample_mmi diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jdmrgext-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdmrgext-mmi.c new file mode 100644 index 0000000000..be09ff2a65 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdmrgext-mmi.c @@ -0,0 +1,615 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhangLixia + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. 
If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdmerge-mmi.c */ + + +#if RGB_RED == 0 +#define mmA re +#define mmB ro +#elif RGB_GREEN == 0 +#define mmA ge +#define mmB go +#elif RGB_BLUE == 0 +#define mmA be +#define mmB bo +#else +#define mmA xe +#define mmB xo +#endif + +#if RGB_RED == 1 +#define mmC re +#define mmD ro +#elif RGB_GREEN == 1 +#define mmC ge +#define mmD go +#elif RGB_BLUE == 1 +#define mmC be +#define mmD bo +#else +#define mmC xe +#define mmD xo +#endif + +#if RGB_RED == 2 +#define mmE re +#define mmF ro +#elif RGB_GREEN == 2 +#define mmE ge +#define mmF go +#elif RGB_BLUE == 2 +#define mmE be +#define mmF bo +#else +#define mmE xe +#define mmF xo +#endif + +#if RGB_RED == 3 +#define mmG re +#define mmH ro +#elif RGB_GREEN == 3 +#define mmG ge +#define mmH go +#elif RGB_BLUE == 3 +#define mmG be +#define mmH bo +#else +#define mmG xe +#define mmH xo +#endif + + +void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int num_cols, col; + __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y; + __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr; + __m64 rle, rlo, rl, rhe, rho, rh, re, ro; + __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go; + __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0; + __m64 decenter, mask, zero = 0.0; +#if RGB_PIXELSIZE == 4 + __m64 mm8, mm9; +#endif + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + + for (num_cols = 
output_width >> 1; num_cols > 0; num_cols -= 8, + inptr0 += 16, inptr1 += 8, inptr2 += 8) { + + cb = _mm_load_si64((__m64 *)inptr1); + cr = _mm_load_si64((__m64 *)inptr2); + ythis = _mm_load_si64((__m64 *)inptr0); + ynext = _mm_load_si64((__m64 *)inptr0 + 1); + + mask = decenter = 0.0; + mask = _mm_cmpeq_pi16(mask, mask); + decenter = _mm_cmpeq_pi16(decenter, decenter); + mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */ + decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */ + + cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */ + cbh = _mm_unpackhi_pi8(cb, zero); /* Cb(4567) */ + crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */ + crh = _mm_unpackhi_pi8(cr, zero); /* Cr(4567) */ + cbl = _mm_add_pi16(cbl, decenter); + cbh = _mm_add_pi16(cbh, decenter); + crl = _mm_add_pi16(crl, decenter); + crh = _mm_add_pi16(crh, decenter); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + + cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */ + cbh2 = _mm_add_pi16(cbh, cbh); /* 2*CbH */ + crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */ + crh2 = _mm_add_pi16(crh, crh); /* 2*CrH */ + + bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800) */ + bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2*CbH * -FIX(0.22800) */ + rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */ + rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2*CrH * FIX(0.40200)) */ + + bl = _mm_add_pi16(bl, PW_ONE); + bh = _mm_add_pi16(bh, PW_ONE); + bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */ + bh = _mm_srai_pi16(bh, 1); /* (CbH * -FIX(0.22800)) */ + rl = _mm_add_pi16(rl, PW_ONE); + rh = _mm_add_pi16(rh, PW_ONE); + rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */ + rh = _mm_srai_pi16(rh, 1); /* (CrH * FIX(0.40200)) */ + + bl = _mm_add_pi16(bl, cbl); + bh = 
_mm_add_pi16(bh, cbh); + bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */ + bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200))=(B-Y)H */ + rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */ + rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200))=(R-Y)H */ + + ga = _mm_unpacklo_pi16(cbl, crl); + gb = _mm_unpackhi_pi16(cbl, crl); + ga = _mm_madd_pi16(ga, PW_MF0344_F0285); + gb = _mm_madd_pi16(gb, PW_MF0344_F0285); + gc = _mm_unpacklo_pi16(cbh, crh); + gd = _mm_unpackhi_pi16(cbh, crh); + gc = _mm_madd_pi16(gc, PW_MF0344_F0285); + gd = _mm_madd_pi16(gd, PW_MF0344_F0285); + + ga = _mm_add_pi32(ga, PD_ONEHALF); + gb = _mm_add_pi32(gb, PD_ONEHALF); + ga = _mm_srai_pi32(ga, SCALEBITS); + gb = _mm_srai_pi32(gb, SCALEBITS); + gc = _mm_add_pi32(gc, PD_ONEHALF); + gd = _mm_add_pi32(gd, PD_ONEHALF); + gc = _mm_srai_pi32(gc, SCALEBITS); + gd = _mm_srai_pi32(gd, SCALEBITS); + + gl = _mm_packs_pi32(ga, gb); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */ + gh = _mm_packs_pi32(gc, gd); /* CbH*-FIX(0.344)+CrH*FIX(0.285) */ + gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */ + gh = _mm_sub_pi16(gh, crh); /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */ + + ythise = _mm_and_si64(mask, ythis); /* Y(0246) */ + ythiso = _mm_srli_pi16(ythis, BYTE_BIT); /* Y(1357) */ + ynexte = _mm_and_si64(mask, ynext); /* Y(8ACE) */ + ynexto = _mm_srli_pi16(ynext, BYTE_BIT); /* Y(9BDF) */ + + rle = _mm_add_pi16(rl, ythise); /* (R0 R2 R4 R6) */ + rlo = _mm_add_pi16(rl, ythiso); /* (R1 R3 R5 R7) */ + rhe = _mm_add_pi16(rh, ynexte); /* (R8 RA RC RE) */ + rho = _mm_add_pi16(rh, ynexto); /* (R9 RB RD RF) */ + re = _mm_packs_pu16(rle, rhe); /* (R0 R2 R4 R6 R8 RA RC RE) */ + ro = _mm_packs_pu16(rlo, rho); /* (R1 R3 R5 R7 R9 RB RD RF) */ + + gle = _mm_add_pi16(gl, ythise); /* (G0 G2 G4 G6) */ + glo = _mm_add_pi16(gl, ythiso); /* (G1 G3 G5 G7) */ + ghe = _mm_add_pi16(gh, ynexte); /* (G8 GA GC GE) */ + gho = _mm_add_pi16(gh, ynexto); /* (G9 GB GD GF) */ + ge = _mm_packs_pu16(gle, 
ghe); /* (G0 G2 G4 G6 G8 GA GC GE) */ + go = _mm_packs_pu16(glo, gho); /* (G1 G3 G5 G7 G9 GB GD GF) */ + + ble = _mm_add_pi16(bl, ythise); /* (B0 B2 B4 B6) */ + blo = _mm_add_pi16(bl, ythiso); /* (B1 B3 B5 B7) */ + bhe = _mm_add_pi16(bh, ynexte); /* (B8 BA BC BE) */ + bho = _mm_add_pi16(bh, ynexto); /* (B9 BB BD BF) */ + be = _mm_packs_pu16(ble, bhe); /* (B0 B2 B4 B6 B8 BA BC BE) */ + bo = _mm_packs_pu16(blo, bho); /* (B1 B3 B5 B7 B9 BB BD BF) */ + +#if RGB_PIXELSIZE == 3 + + /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */ + /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */ + /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */ + mmG = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ + mmA = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */ + mmH = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */ + mmE = _mm_unpackhi_pi8(mmE, mmB); /* (28 09 2A 0B 2C 0D 2E 0F) */ + mmC = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */ + mmD = _mm_unpackhi_pi8(mmD, mmF); /* (19 29 1B 2B 1D 2D 1F 2F) */ + + mmB = _mm_unpacklo_pi16(mmG, mmA); /* (00 10 08 18 02 12 0A 1A) */ + mmA = _mm_unpackhi_pi16(mmG, mmA); /* (04 14 0C 1C 06 16 0E 1E) */ + mmF = _mm_unpacklo_pi16(mmH, mmE); /* (20 01 28 09 22 03 2A 0B) */ + mmE = _mm_unpackhi_pi16(mmH, mmE); /* (24 05 2C 0D 26 07 2E 0F) */ + mmH = _mm_unpacklo_pi16(mmC, mmD); /* (11 21 19 29 13 23 1B 2B) */ + mmG = _mm_unpackhi_pi16(mmC, mmD); /* (15 25 1D 2D 17 27 1F 2F) */ + + mmC = _mm_unpacklo_pi16(mmB, mmF); /* (00 10 20 01 08 18 28 09) */ + mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT); + mmB = _mm_unpacklo_pi16(mmH, mmB); /* (11 21 02 12 19 29 0A 1A) */ + mmD = _mm_unpackhi_pi16(mmF, mmH); /* (22 03 13 23 2A 0B 1B 2B) */ + mmF = _mm_unpacklo_pi16(mmA, mmE); /* (04 14 24 05 0C 1C 2C 0D) */ + mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT); + mmH = _mm_unpacklo_pi16(mmG, mmA); /* (15 25 06 16 1D 2D 0E 1E) */ + mmG = _mm_unpackhi_pi16(mmE, mmG); /* (26 07 17 27 2E 0F 
1F 2F) */ + + mmA = _mm_unpacklo_pi32(mmC, mmB); /* (00 10 20 01 11 21 02 12) */ + mmE = _mm_unpackhi_pi32(mmC, mmB); /* (08 18 28 09 19 29 0A 1A) */ + mmB = _mm_unpacklo_pi32(mmD, mmF); /* (22 03 13 23 04 14 24 05) */ + mmF = _mm_unpackhi_pi32(mmD, mmF); /* (2A 0B 1B 2B 0C 1C 2C 0D) */ + mmC = _mm_unpacklo_pi32(mmH, mmG); /* (15 25 06 16 26 07 17 27) */ + mmG = _mm_unpackhi_pi32(mmH, mmG); /* (1D 2D 0E 1E 2E 0F 1F 2F) */ + + if (num_cols >= 8) { + if (!(((long)outptr) & 7)) { + _mm_store_si64((__m64 *)outptr, mmA); + _mm_store_si64((__m64 *)(outptr + 8), mmB); + _mm_store_si64((__m64 *)(outptr + 16), mmC); + _mm_store_si64((__m64 *)(outptr + 24), mmE); + _mm_store_si64((__m64 *)(outptr + 32), mmF); + _mm_store_si64((__m64 *)(outptr + 40), mmG); + } else { + _mm_storeu_si64((__m64 *)outptr, mmA); + _mm_storeu_si64((__m64 *)(outptr + 8), mmB); + _mm_storeu_si64((__m64 *)(outptr + 16), mmC); + _mm_storeu_si64((__m64 *)(outptr + 24), mmE); + _mm_storeu_si64((__m64 *)(outptr + 32), mmF); + _mm_storeu_si64((__m64 *)(outptr + 40), mmG); + } + outptr += RGB_PIXELSIZE * 16; + } else { + if (output_width & 1) + col = num_cols * 6 + 3; + else + col = num_cols * 6; + + asm(".set noreorder\r\n" /* st24 */ + + "li $8, 24\r\n" + "move $9, %7\r\n" + "mov.s $f4, %1\r\n" + "mov.s $f6, %2\r\n" + "mov.s $f8, %3\r\n" + "move $10, %8\r\n" + "bltu $9, $8, 1f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "gssdlc1 $f6, 7+8($10)\r\n" + "gssdrc1 $f6, 8($10)\r\n" + "gssdlc1 $f8, 7+16($10)\r\n" + "gssdrc1 $f8, 16($10)\r\n" + "mov.s $f4, %4\r\n" + "mov.s $f6, %5\r\n" + "mov.s $f8, %6\r\n" + "subu $9, $9, 24\r\n" + PTR_ADDU "$10, $10, 24\r\n" + + "1: \r\n" + "li $8, 16\r\n" /* st16 */ + "bltu $9, $8, 2f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "gssdlc1 $f6, 7+8($10)\r\n" + "gssdrc1 $f6, 8($10)\r\n" + "mov.s $f4, $f8\r\n" + "subu $9, $9, 16\r\n" + PTR_ADDU "$10, $10, 16\r\n" + + "2: \r\n" + "li $8, 8\r\n" /* st8 */ + "bltu $9, $8, 
3f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "mov.s $f4, $f6\r\n" + "subu $9, $9, 8\r\n" + PTR_ADDU "$10, $10, 8\r\n" + + "3: \r\n" + "li $8, 4\r\n" /* st4 */ + "mfc1 $11, $f4\r\n" + "bltu $9, $8, 4f\r\n" + "nop \r\n" + "swl $11, 3($10)\r\n" + "swr $11, 0($10)\r\n" + "li $8, 32\r\n" + "mtc1 $8, $f6\r\n" + "dsrl $f4, $f4, $f6\r\n" + "mfc1 $11, $f4\r\n" + "subu $9, $9, 4\r\n" + PTR_ADDU "$10, $10, 4\r\n" + + "4: \r\n" + "li $8, 2\r\n" /* st2 */ + "bltu $9, $8, 5f\r\n" + "nop \r\n" + "ush $11, 0($10)\r\n" + "srl $11, 16\r\n" + "subu $9, $9, 2\r\n" + PTR_ADDU "$10, $10, 2\r\n" + + "5: \r\n" + "li $8, 1\r\n" /* st1 */ + "bltu $9, $8, 6f\r\n" + "nop \r\n" + "sb $11, 0($10)\r\n" + + "6: \r\n" + "nop \r\n" /* end */ + : "=m" (*outptr) + : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF), + "f" (mmG), "r" (col), "r" (outptr) + : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory" + ); + } + +#else /* RGB_PIXELSIZE == 4 */ + +#ifdef RGBX_FILLER_0XFF + xe = _mm_cmpeq_pi8(xe, xe); + xo = _mm_cmpeq_pi8(xo, xo); +#else + xe = _mm_xor_si64(xe, xe); + xo = _mm_xor_si64(xo, xo); +#endif + /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */ + /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */ + /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */ + /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */ + + mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ + mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */ + mmA = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */ + mmE = _mm_unpackhi_pi8(mmE, mmG); /* (28 38 2A 3A 2C 3C 2E 3E) */ + + mmG = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */ + mmB = _mm_unpackhi_pi8(mmB, mmD); /* (09 19 0B 1B 0D 1D 0F 1F) */ + mmD = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */ + mmF = _mm_unpackhi_pi8(mmF, mmH); /* (29 39 2B 3B 2D 3D 2F 3F) */ + + mmH = _mm_unpacklo_pi16(mm8, mmA); /* (00 
10 20 30 02 12 22 32) */ + mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (04 14 24 34 06 16 26 36) */ + mmA = _mm_unpacklo_pi16(mmG, mmD); /* (01 11 21 31 03 13 23 33) */ + mmD = _mm_unpackhi_pi16(mmG, mmD); /* (05 15 25 35 07 17 27 37) */ + + mmG = _mm_unpackhi_pi16(mm9, mmE); /* (0C 1C 2C 3C 0E 1E 2E 3E) */ + mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (08 18 28 38 0A 1A 2A 3A) */ + mmE = _mm_unpacklo_pi16(mmB, mmF); /* (09 19 29 39 0B 1B 2B 3B) */ + mmF = _mm_unpackhi_pi16(mmB, mmF); /* (0D 1D 2D 3D 0F 1F 2F 3F) */ + + mmB = _mm_unpackhi_pi32(mmH, mmA); /* (02 12 22 32 03 13 23 33) */ + mmA = _mm_unpacklo_pi32(mmH, mmA); /* (00 10 20 30 01 11 21 31) */ + mmC = _mm_unpacklo_pi32(mm8, mmD); /* (04 14 24 34 05 15 25 35) */ + mmD = _mm_unpackhi_pi32(mm8, mmD); /* (06 16 26 36 07 17 27 37) */ + + mmH = _mm_unpackhi_pi32(mmG, mmF); /* (0E 1E 2E 3E 0F 1F 2F 3F) */ + mmG = _mm_unpacklo_pi32(mmG, mmF); /* (0C 1C 2C 3C 0D 1D 2D 3D) */ + mmF = _mm_unpackhi_pi32(mm9, mmE); /* (0A 1A 2A 3A 0B 1B 2B 3B) */ + mmE = _mm_unpacklo_pi32(mm9, mmE); /* (08 18 28 38 09 19 29 39) */ + + if (num_cols >= 8) { + if (!(((long)outptr) & 7)) { + _mm_store_si64((__m64 *)outptr, mmA); + _mm_store_si64((__m64 *)(outptr + 8), mmB); + _mm_store_si64((__m64 *)(outptr + 16), mmC); + _mm_store_si64((__m64 *)(outptr + 24), mmD); + _mm_store_si64((__m64 *)(outptr + 32), mmE); + _mm_store_si64((__m64 *)(outptr + 40), mmF); + _mm_store_si64((__m64 *)(outptr + 48), mmG); + _mm_store_si64((__m64 *)(outptr + 56), mmH); + } else { + _mm_storeu_si64((__m64 *)outptr, mmA); + _mm_storeu_si64((__m64 *)(outptr + 8), mmB); + _mm_storeu_si64((__m64 *)(outptr + 16), mmC); + _mm_storeu_si64((__m64 *)(outptr + 24), mmD); + _mm_storeu_si64((__m64 *)(outptr + 32), mmE); + _mm_storeu_si64((__m64 *)(outptr + 40), mmF); + _mm_storeu_si64((__m64 *)(outptr + 48), mmG); + _mm_storeu_si64((__m64 *)(outptr + 56), mmH); + } + outptr += RGB_PIXELSIZE * 16; + } else { + if (output_width & 1) + col = num_cols * 2 + 1; + else + col = num_cols * 
2; + asm(".set noreorder\r\n" /* st32 */ + + "li $8, 8\r\n" + "move $9, %10\r\n" /* $9 = col (operand %10): pixels still to store, 4 bytes each in this branch */ + "move $10, %11\r\n" /* $10 = outptr (operand %11), advanced after every partial store */ + "mov.s $f4, %2\r\n" + "mov.s $f6, %3\r\n" + "mov.s $f8, %4\r\n" + "mov.s $f10, %5\r\n" + "bltu $9, $8, 1f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "gssdlc1 $f6, 7+8($10)\r\n" + "gssdrc1 $f6, 8($10)\r\n" + "gssdlc1 $f8, 7+16($10)\r\n" + "gssdrc1 $f8, 16($10)\r\n" + "gssdlc1 $f10, 7+24($10)\r\n" + "gssdrc1 $f10, 24($10)\r\n" + "mov.s $f4, %6\r\n" + "mov.s $f6, %7\r\n" + "mov.s $f8, %8\r\n" + "mov.s $f10, %9\r\n" + "subu $9, $9, 8\r\n" + PTR_ADDU "$10, $10, 32\r\n" + + "1: \r\n" + "li $8, 4\r\n" /* st16 */ + "bltu $9, $8, 2f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "gssdlc1 $f6, 7+8($10)\r\n" + "gssdrc1 $f6, 8($10)\r\n" + "mov.s $f4, $f8\r\n" + "mov.s $f6, $f10\r\n" + "subu $9, $9, 4\r\n" + PTR_ADDU "$10, $10, 16\r\n" + + "2: \r\n" + "li $8, 2\r\n" /* st8 */ + "bltu $9, $8, 3f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "mov.s $f4, $f6\r\n" + "subu $9, $9, 2\r\n" + PTR_ADDU "$10, $10, 8\r\n" + + "3: \r\n" + "li $8, 1\r\n" /* st4 */ + "bltu $9, $8, 4f\r\n" + "nop \r\n" + "gsswlc1 $f4, 3($10)\r\n" + "gsswrc1 $f4, 0($10)\r\n" + + "4: \r\n" + "li %1, 0\r\n" /* end */ + : "=m" (*outptr), "=r" (col) + : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF), + "f" (mmG), "f" (mmH), "r" (col), "r" (outptr) + : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory" + ); + } + +#endif + + } + + if (!((output_width >> 1) & 7)) { /* pair count is 0 or a multiple of 8: the loop above never reached its num_cols < 8 branch, the only place that adds the odd pixel to col */ + if (output_width & 1) { /* ...and the width is odd, so one last pixel is still unconverted; handle it here */ + cb = _mm_load_si64((__m64 *)inptr1); + cr = _mm_load_si64((__m64 *)inptr2); + y = _mm_load_si64((__m64 *)inptr0); + + decenter = 0.0; + decenter = _mm_cmpeq_pi16(decenter, decenter); + decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */ + + cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */ + crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */ + cbl = _mm_add_pi16(cbl, decenter); + crl =
_mm_add_pi16(crl, decenter); + + cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */ + crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */ + bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800) */ + rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */ + + bl = _mm_add_pi16(bl, PW_ONE); + bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */ + rl = _mm_add_pi16(rl, PW_ONE); + rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */ + + bl = _mm_add_pi16(bl, cbl); + bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */ + rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */ + + gl = _mm_unpacklo_pi16(cbl, crl); + gl = _mm_madd_pi16(gl, PW_MF0344_F0285); + gl = _mm_add_pi32(gl, PD_ONEHALF); + gl = _mm_srai_pi32(gl, SCALEBITS); + gl = _mm_packs_pi32(gl, zero); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */ + gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */ + + yl = _mm_unpacklo_pi8(y, zero); /* Y(0123) */ + rl = _mm_add_pi16(rl, yl); /* (R0 R1 R2 R3) */ + gl = _mm_add_pi16(gl, yl); /* (G0 G1 G2 G3) */ + bl = _mm_add_pi16(bl, yl); /* (B0 B1 B2 B3) */ + re = _mm_packs_pu16(rl, rl); + ge = _mm_packs_pu16(gl, gl); + be = _mm_packs_pu16(bl, bl); +#if RGB_PIXELSIZE == 3 + mmA = _mm_unpacklo_pi8(mmA, mmC); + mmA = _mm_unpacklo_pi16(mmA, mmE); + asm(".set noreorder\r\n" + + "move $8, %2\r\n" + "mov.s $f4, %1\r\n" + "mfc1 $9, $f4\r\n" + "ush $9, 0($8)\r\n" + "srl $9, 16\r\n" + "sb $9, 2($8)\r\n" + : "=m" (*outptr) + : "f" (mmA), "r" (outptr) + : "$f4", "$8", "$9", "memory" + ); +#else /* RGB_PIXELSIZE == 4 */ + +#ifdef RGBX_FILLER_0XFF + xe = _mm_cmpeq_pi8(xe, xe); +#else + xe = _mm_xor_si64(xe, xe); +#endif + mmA = _mm_unpacklo_pi8(mmA, mmC); + mmE = _mm_unpacklo_pi8(mmE, mmG); + mmA = _mm_unpacklo_pi16(mmA, mmE); + asm(".set noreorder\r\n" + + "move $8, %2\r\n" + "mov.s $f4, %1\r\n" + "gsswlc1 $f4, 3($8)\r\n" + "gsswrc1 $f4, 0($8)\r\n" + : "=m" (*outptr) + : "f" (mmA), "r" (outptr) + : "$f4", "$8", "memory" + ); +#endif + } + } +} + + +void 
jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW inptr, outptr; + + inptr = input_buf[0][in_row_group_ctr]; + outptr = output_buf[0]; + + input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2]; + jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1]; + output_buf[0] = output_buf[1]; + jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = inptr; + output_buf[0] = outptr; +} + + +#undef mmA +#undef mmB +#undef mmC +#undef mmD +#undef mmE +#undef mmF +#undef mmG +#undef mmH diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jdsample-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdsample-mmi.c new file mode 100644 index 0000000000..8ae94e7dcf --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdsample-mmi.c @@ -0,0 +1,304 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen + * CaiWanwei + * SunZhangzhi + * ZhangLixia + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. 
If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* CHROMA UPSAMPLING */ + +#include "jsimd_mmi.h" + + +enum const_index { + index_PW_ONE, + index_PW_TWO, + index_PW_THREE, + index_PW_SEVEN, + index_PW_EIGHT, +}; + +static uint64_t const_value[] = { + _uint64_set_pi16(1, 1, 1, 1), + _uint64_set_pi16(2, 2, 2, 2), + _uint64_set_pi16(3, 3, 3, 3), + _uint64_set_pi16(7, 7, 7, 7), + _uint64_set_pi16(8, 8, 8, 8), +}; + +#define PW_ONE get_const_value(index_PW_ONE) +#define PW_TWO get_const_value(index_PW_TWO) +#define PW_THREE get_const_value(index_PW_THREE) +#define PW_SEVEN get_const_value(index_PW_SEVEN) +#define PW_EIGHT get_const_value(index_PW_EIGHT) + + +#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \ + __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \ + __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \ + __m64 outle, outhe, outlo, outho, outl, outh; \ + \ + samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT); /* ( 1 2 3 -) */ \ + sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 4) */ \ + samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 3 - - -) */ \ + sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT); /* ( - 4 5 6) */ \ + \ + samp1234 = _mm_or_si64(samp123X, sampXXX4); /* ( 1 2 3 4) */ \ + samp3456 = _mm_or_si64(samp3XXX, sampX456); /* ( 3 4 5 6) */ \ + \ + sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT); /* ( - 0 1 2) */ \ + samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT); /* ( 5 6 7 -) */ \ + samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 7 - - -) */ \ + \ + samp_1012 = _mm_or_si64(sampX012, wk[row]); /* (-1 0 1 2) */ \ + samp5678 = 
_mm_or_si64(samp567X, wk[row + wkoffset]); /* ( 5 6 7 8) */ \ + \ + wk[row] = samp7XXX; \ + \ + samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \ + samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \ + samp_1012 = _mm_add_pi16(samp_1012, bias1); \ + samp3456 = _mm_add_pi16(samp3456, bias1); \ + samp1234 = _mm_add_pi16(samp1234, bias2); \ + samp5678 = _mm_add_pi16(samp5678, bias2); \ + \ + outle = _mm_add_pi16(samp_1012, samp0123); \ + outhe = _mm_add_pi16(samp3456, samp4567); \ + outle = _mm_srli_pi16(outle, shift); /* ( 0 2 4 6) */ \ + outhe = _mm_srli_pi16(outhe, shift); /* ( 8 10 12 14) */ \ + outlo = _mm_add_pi16(samp1234, samp0123); \ + outho = _mm_add_pi16(samp5678, samp4567); \ + outlo = _mm_srli_pi16(outlo, shift); /* ( 1 3 5 7) */ \ + outho = _mm_srli_pi16(outho, shift); /* ( 9 11 13 15) */ \ + \ + outlo = _mm_slli_pi16(outlo, BYTE_BIT); \ + outho = _mm_slli_pi16(outho, BYTE_BIT); \ + outl = _mm_or_si64(outle, outlo); /* ( 0 1 2 3 4 5 6 7) */ \ + outh = _mm_or_si64(outhe, outho); /* ( 8 9 10 11 12 13 14 15) */ \ + \ + _mm_store_si64((__m64 *)outptr##row, outl); \ + _mm_store_si64((__m64 *)outptr##row + 1, outh); \ +} + +void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; + int inrow, outrow, incol, tmp, tmp1; + __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h; + __m64 this0l, this0h, this0; + __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h; + __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h; + __m64 next0l, next0h, next0; + __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h; + __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0; + + mask0 = _mm_cmpeq_pi8(mask0, mask0); + masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT); + mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT); + + for (inrow = 0, 
outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr_1 = input_data[inrow - 1]; + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + if (downsampled_width & 7) { + tmp = (downsampled_width - 1) * sizeof(JSAMPLE); + tmp1 = downsampled_width * sizeof(JSAMPLE); + asm(PTR_ADDU "$8, %3, %6\r\n" + "lb $9, ($8)\r\n" + PTR_ADDU "$8, %3, %7\r\n" + "sb $9, ($8)\r\n" + PTR_ADDU "$8, %4, %6\r\n" + "lb $9, ($8)\r\n" + PTR_ADDU "$8, %4, %7\r\n" + "sb $9, ($8)\r\n" + PTR_ADDU "$8, %5, %6\r\n" + "lb $9, ($8)\r\n" + PTR_ADDU "$8, %5, %7\r\n" + "sb $9, ($8)\r\n" + : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1) + : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1) + : "$8", "$9" + ); + } + + /* process the first column block */ + this0 = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */ + this_1 = _mm_load_si64((__m64 *)inptr_1); /* row[-1][0] */ + this1 = _mm_load_si64((__m64 *)inptr1); /* row[ 1][0] */ + + this0l = _mm_unpacklo_pi8(this0, zero); /* row[ 0][0]( 0 1 2 3) */ + this0h = _mm_unpackhi_pi8(this0, zero); /* row[ 0][0]( 4 5 6 7) */ + this_1l = _mm_unpacklo_pi8(this_1, zero); /* row[-1][0]( 0 1 2 3) */ + this_1h = _mm_unpackhi_pi8(this_1, zero); /* row[-1][0]( 4 5 6 7) */ + this1l = _mm_unpacklo_pi8(this1, zero); /* row[+1][0]( 0 1 2 3) */ + this1h = _mm_unpackhi_pi8(this1, zero); /* row[+1][0]( 4 5 6 7) */ + + this0l = _mm_mullo_pi16(this0l, PW_THREE); + this0h = _mm_mullo_pi16(this0h, PW_THREE); + + thiscolsum_1l = _mm_add_pi16(this_1l, this0l); /* ( 0 1 2 3) */ + thiscolsum_1h = _mm_add_pi16(this_1h, this0h); /* ( 4 5 6 7) */ + thiscolsum1l = _mm_add_pi16(this0l, this1l); /* ( 0 1 2 3) */ + thiscolsum1h = _mm_add_pi16(this0h, this1h); /* ( 4 5 6 7) */ + + /* temporarily save the intermediate data */ + _mm_store_si64((__m64 *)outptr0, thiscolsum_1l); + _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h); + _mm_store_si64((__m64 *)outptr1, thiscolsum1l); + 
_mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h); + + wk[0] = _mm_and_si64(thiscolsum_1l, mask0); /* ( 0 - - -) */ + wk[1] = _mm_and_si64(thiscolsum1l, mask0); /* ( 0 - - -) */ + + for (incol = downsampled_width; incol > 0; + incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8, + outptr0 += 16, outptr1 += 16) { + + if (incol > 8) { + /* process the next column block */ + next0 = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */ + next_1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* row[-1][1] */ + next1 = _mm_load_si64((__m64 *)inptr1 + 1); /* row[+1][1] */ + + next0l = _mm_unpacklo_pi8(next0, zero); /* row[ 0][1]( 0 1 2 3) */ + next0h = _mm_unpackhi_pi8(next0, zero); /* row[ 0][1]( 4 5 6 7) */ + next_1l = _mm_unpacklo_pi8(next_1, zero); /* row[-1][1]( 0 1 2 3) */ + next_1h = _mm_unpackhi_pi8(next_1, zero); /* row[-1][1]( 4 5 6 7) */ + next1l = _mm_unpacklo_pi8(next1, zero); /* row[+1][1]( 0 1 2 3) */ + next1h = _mm_unpackhi_pi8(next1, zero); /* row[+1][1]( 4 5 6 7) */ + + next0l = _mm_mullo_pi16(next0l, PW_THREE); + next0h = _mm_mullo_pi16(next0h, PW_THREE); + + nextcolsum_1l = _mm_add_pi16(next_1l, next0l); /* ( 0 1 2 3) */ + nextcolsum_1h = _mm_add_pi16(next_1h, next0h); /* ( 4 5 6 7) */ + nextcolsum1l = _mm_add_pi16(next0l, next1l); /* ( 0 1 2 3) */ + nextcolsum1h = _mm_add_pi16(next0h, next1h); /* ( 4 5 6 7) */ + + /* temporarily save the intermediate data */ + _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l); + _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h); + _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l); + _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h); + + wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */ + wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */ + } else { + __m64 tmp; + + /* process the last column block */ + tmp = _mm_load_si64((__m64 *)outptr0 + 1); + wk[2] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */ + tmp = _mm_load_si64((__m64 *)outptr1 + 1); + 
wk[3] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */ + } + + /* process the upper row */ + samp0123 = _mm_load_si64((__m64 *)outptr0); /* ( 0 1 2 3) */ \ + samp4567 = _mm_load_si64((__m64 *)outptr0 + 1); /* ( 4 5 6 7) */ \ + PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4) + + /* process the lower row */ + samp0123 = _mm_load_si64((__m64 *)outptr1); /* ( 0 1 2 3) */ \ + samp4567 = _mm_load_si64((__m64 *)outptr1 + 1); /* ( 4 5 6 7) */ \ + PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4) + } + } +} + + +void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr0, outptr0; + int inrow, incol, tmp, tmp1; + __m64 thisl, this, nextl, next; + __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0; + + mask0 = _mm_cmpeq_pi8(mask0, mask0); + masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT); + mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT); + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + + inptr0 = input_data[inrow]; + outptr0 = output_data[inrow]; + + if (downsampled_width & 7) { + tmp = (downsampled_width - 1) * sizeof(JSAMPLE); + tmp1 = downsampled_width * sizeof(JSAMPLE); + asm(PTR_ADDU "$8, %1, %2\r\n" + "lb $9, ($8)\r\n" + PTR_ADDU "$8, %1, %3\r\n" + "sb $9, ($8)\r\n" + : "=m" (*inptr0) + : "r" (inptr0), "r" (tmp), "r" (tmp1) + : "$8", "$9" + ); + } + + /* process the first column block */ + this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */ + thisl = _mm_unpacklo_pi8(this, zero); /* row[ 0][0]( 0 1 2 3) */ + wk[0] = _mm_and_si64(thisl, mask0); /* ( 0 - - -) */ + + for (incol = downsampled_width; incol > 0; + incol -= 8, inptr0 += 8, outptr0 += 16) { + + if (incol > 8) { + /* process the next column block */ + next = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */ + nextl = _mm_unpacklo_pi8(next, zero); /* row[ 0][1]( 0 1 2 3) */ + wk[1] = _mm_slli_si64(nextl, 
(SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */ + } else { + __m64 thish; + + /* process the last column block */ + this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */ + thish = _mm_unpackhi_pi8(this, zero); /* row[ 0][1]( 4 5 6 7) */ + wk[1] = _mm_and_si64(masklast, thish); /* ( - - - 7) */ + } + + /* process the row */ + this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */ + samp0123 = _mm_unpacklo_pi8(this, zero); /* ( 0 1 2 3) */ + samp4567 = _mm_unpackhi_pi8(this, zero); /* ( 4 5 6 7) */ + PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2) + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctfst-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctfst-mmi.c new file mode 100644 index 0000000000..f7caf09a88 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctfst-mmi.c @@ -0,0 +1,255 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2014, 2018-2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: LiuQingfa + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. 
This notice may not be removed or altered from any source distribution. + */ + +/* FAST INTEGER FORWARD DCT */ + +#include "jsimd_mmi.h" + + +#define CONST_BITS 8 + +#define F_0_382 ((short)98) /* FIX(0.382683433) */ +#define F_0_541 ((short)139) /* FIX(0.541196100) */ +#define F_0_707 ((short)181) /* FIX(0.707106781) */ +#define F_1_306 ((short)334) /* FIX(1.306562965) */ + +#define PRE_MULTIPLY_SCALE_BITS 2 +#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + +enum const_index { + index_PW_F0707, + index_PW_F0382, + index_PW_F0541, + index_PW_F1306 +}; + +static uint64_t const_value[] = { + _uint64_set1_pi16(F_0_707), + _uint64_set1_pi16(F_0_382), + _uint64_set1_pi16(F_0_541), + _uint64_set1_pi16(F_1_306) +}; + +#define PW_F0707 get_const_value(index_PW_F0707) +#define PW_F0382 get_const_value(index_PW_F0382) +#define PW_F0541 get_const_value(index_PW_F0541) +#define PW_F1306 get_const_value(index_PW_F1306) + + +#define DO_FDCT_MULTIPLY(out, in, multiplier) { \ + __m64 mulhi, mullo, mul12, mul34; \ + \ + mullo = _mm_mullo_pi16(in, multiplier); \ + mulhi = _mm_mulhi_pi16(in, multiplier); \ + mul12 = _mm_unpacklo_pi16(mullo, mulhi); \ + mul34 = _mm_unpackhi_pi16(mullo, mulhi); \ + mul12 = _mm_srai_pi32(mul12, CONST_BITS); \ + mul34 = _mm_srai_pi32(mul34, CONST_BITS); \ + out = _mm_packs_pi32(mul12, mul34); \ +} + +#define DO_FDCT_COMMON() { \ + \ + /* Even part */ \ + \ + tmp10 = _mm_add_pi16(tmp0, tmp3); \ + tmp13 = _mm_sub_pi16(tmp0, tmp3); \ + tmp11 = _mm_add_pi16(tmp1, tmp2); \ + tmp12 = _mm_sub_pi16(tmp1, tmp2); \ + \ + out0 = _mm_add_pi16(tmp10, tmp11); \ + out4 = _mm_sub_pi16(tmp10, tmp11); \ + \ + z1 = _mm_add_pi16(tmp12, tmp13); \ + DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \ + \ + out2 = _mm_add_pi16(tmp13, z1); \ + out6 = _mm_sub_pi16(tmp13, z1); \ + \ + /* Odd part */ \ + \ + tmp10 = _mm_add_pi16(tmp4, tmp5); \ + tmp11 = _mm_add_pi16(tmp5, tmp6); \ + tmp12 = _mm_add_pi16(tmp6, tmp7); \ + \ + z5 = _mm_sub_pi16(tmp10, tmp12); \ + 
DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \ + \ + DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \ + z2 = _mm_add_pi16(z2, z5); \ + \ + DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \ + z4 = _mm_add_pi16(z4, z5); \ + \ + DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \ + \ + z11 = _mm_add_pi16(tmp7, z3); \ + z13 = _mm_sub_pi16(tmp7, z3); \ + \ + out5 = _mm_add_pi16(z13, z2); \ + out3 = _mm_sub_pi16(z13, z2); \ + out1 = _mm_add_pi16(z11, z4); \ + out7 = _mm_sub_pi16(z11, z4); \ +} + +#define DO_FDCT_PASS1() { \ + __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ + __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ + __m64 col0, col1, col2, col3, col4, col5, col6, col7; \ + \ + row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ + row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \ + row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ + row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \ + row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ + row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \ + row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ + row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \ + \ + /* Transpose coefficients */ \ + \ + row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \ + row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \ + row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \ + row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \ + \ + row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \ + row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \ + row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \ + row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \ + \ + col0 = 
_mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \ + col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \ + col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \ + col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \ + \ + tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \ + tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \ + tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \ + tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \ + \ + col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \ + col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \ + col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \ + col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \ + \ + tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \ + tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \ + tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \ + tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \ + \ + DO_FDCT_COMMON() \ + \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \ +} + +#define DO_FDCT_PASS2() { \ + __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \ + __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \ + __m64 row0, row1, row2, row3, row4, row5, row6, row7; \ + \ + col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ + col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ + col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* 
(02 12 22 32) */ \ + col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ + col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \ + col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \ + col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \ + col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \ + \ + /* Transpose coefficients */ \ + \ + col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \ + col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \ + col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \ + col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \ + \ + col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \ + col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \ + col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \ + col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \ + \ + row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \ + row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \ + row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \ + row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \ + \ + tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \ + tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \ + tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \ + tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \ + \ + row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \ + row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \ + row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \ + row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \ + \ + tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \ + tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \ + tmp4 = 
_mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \ + tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \ + \ + DO_FDCT_COMMON() \ + \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \ +} + +void jsimd_fdct_ifast_mmi(DCTELEM *data) +{ + __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m64 out0, out1, out2, out3, out4, out5, out6, out7; + __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13; + DCTELEM *dataptr = data; + + /* Pass 1: process rows. */ + + DO_FDCT_PASS1() + dataptr += DCTSIZE * 4; + DO_FDCT_PASS1() + + /* Pass 2: process columns. */ + + dataptr = data; + DO_FDCT_PASS2() + dataptr += 4; + DO_FDCT_PASS2() +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctint-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctint-mmi.c new file mode 100644 index 0000000000..7f4dfe9123 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctint-mmi.c @@ -0,0 +1,398 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2014, 2018, 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen + * CaiWanwei + * SunZhangzhi + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* ACCURATE INTEGER FORWARD DCT */ + +#include "jsimd_mmi.h" + + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */ +#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */ +#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */ +#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */ +#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */ +#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */ +#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */ +#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */ +#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */ +#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */ +#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */ +#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */ + +enum const_index { + index_PW_F130_F054, + index_PW_F054_MF130, + index_PW_MF078_F117, + index_PW_F117_F078, + index_PW_MF060_MF089, + index_PW_MF089_F060, + index_PW_MF050_MF256, + index_PW_MF256_F050, + index_PD_DESCALE_P1, + index_PD_DESCALE_P2, + index_PW_DESCALE_P2X +}; + +static uint64_t const_value[] = { + _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765), + FIX_0_541, (FIX_0_541 + FIX_0_765)), + 
_uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541, + (FIX_0_541 - FIX_1_847), FIX_0_541), + _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961), + FIX_1_175, (FIX_1_175 - FIX_1_961)), + _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175, + (FIX_1_175 - FIX_0_390), FIX_1_175), + _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899), + -FIX_0_899, (FIX_0_298 - FIX_0_899)), + _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899, + (FIX_1_501 - FIX_0_899), -FIX_0_899), + _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562), + -FIX_2_562, (FIX_2_053 - FIX_2_562)), + _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562, + (FIX_3_072 - FIX_2_562), -FIX_2_562), + _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))), + _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))), + _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)), + (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1))) +}; + +#define PW_F130_F054 get_const_value(index_PW_F130_F054) +#define PW_F054_MF130 get_const_value(index_PW_F054_MF130) +#define PW_MF078_F117 get_const_value(index_PW_MF078_F117) +#define PW_F117_F078 get_const_value(index_PW_F117_F078) +#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089) +#define PW_MF089_F060 get_const_value(index_PW_MF089_F060) +#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256) +#define PW_MF256_F050 get_const_value(index_PW_MF256_F050) +#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1) +#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2) +#define PW_DESCALE_P2X get_const_value(index_PW_DESCALE_P2X) + + +#define DO_FDCT_COMMON(PASS) { \ + __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \ + __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \ + __m64 out1l, out1h, out2l, out2h, out3l, out3h; \ + __m64 out5l, out5h, out6l, out6h, out7l, out7h; \ + __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \ + \ + /* (Original) \ + * z1 = (tmp12 + tmp13) * 0.541196100; \ + * out2 = z1 + tmp13 
* 0.765366865; \ + * out6 = z1 + tmp12 * -1.847759065; \ + * \ + * (This implementation) \ + * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ + * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ + */ \ + \ + tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \ + tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \ + \ + out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \ + out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \ + out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \ + out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \ + \ + out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \ + out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \ + out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \ + out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \ + \ + out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \ + out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \ + out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \ + out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \ + \ + out2 = _mm_packs_pi32(out2l, out2h); \ + out6 = _mm_packs_pi32(out6l, out6h); \ + \ + /* Odd part */ \ + \ + z3 = _mm_add_pi16(tmp4, tmp6); \ + z4 = _mm_add_pi16(tmp5, tmp7); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = _mm_unpacklo_pi16(z3, z4); \ + z34h = _mm_unpackhi_pi16(z3, z4); \ + z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \ + z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \ + z4l = _mm_madd_pi16(z34l, PW_F117_F078); \ + z4h = _mm_madd_pi16(z34h, PW_F117_F078); \ + \ + /* (Original) \ + * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ + * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ + * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * out7 = tmp4 + z1 + z3; out5 = 
tmp5 + z2 + z4; \ + * out3 = tmp6 + z2 + z3; out1 = tmp7 + z1 + z4; \ + * \ + * (This implementation) \ + * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ + * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ + * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ + * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ + * out7 = tmp4 + z3; out5 = tmp5 + z4; \ + * out3 = tmp6 + z3; out1 = tmp7 + z4; \ + */ \ + \ + tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \ + tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \ + \ + tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \ + tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \ + tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \ + tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \ + \ + out7l = _mm_add_pi32(tmp4l, z3l); \ + out7h = _mm_add_pi32(tmp4h, z3h); \ + out1l = _mm_add_pi32(tmp7l, z4l); \ + out1h = _mm_add_pi32(tmp7h, z4h); \ + \ + out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \ + out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \ + out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \ + out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \ + \ + out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \ + out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \ + out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \ + out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \ + \ + out7 = _mm_packs_pi32(out7l, out7h); \ + out1 = _mm_packs_pi32(out1l, out1h); \ + \ + tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \ + tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \ + \ + tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \ + tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \ + tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \ + tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \ + \ + out5l = _mm_add_pi32(tmp5l, z4l); \ + out5h = _mm_add_pi32(tmp5h, z4h); \ + out3l = _mm_add_pi32(tmp6l, z3l); \ + out3h = _mm_add_pi32(tmp6h, z3h); \ + \ + out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \ + out5h = _mm_add_pi32(out5h, 
PD_DESCALE_P##PASS); \ + out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \ + out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \ + \ + out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \ + out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \ + out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \ + out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \ + \ + out5 = _mm_packs_pi32(out5l, out5h); \ + out3 = _mm_packs_pi32(out3l, out3h); \ +} + +#define DO_FDCT_PASS1() { \ + __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ + __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ + __m64 col0, col1, col2, col3, col4, col5, col6, col7; \ + __m64 tmp10, tmp11; \ + \ + row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ + row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \ + row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ + row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \ + row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ + row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \ + row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ + row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \ + \ + /* Transpose coefficients */ \ + \ + row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \ + row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \ + row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \ + row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \ + \ + row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \ + row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \ + row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \ + row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \ + \ + col0 = _mm_unpacklo_pi32(row01a, 
row23a); /* col0=(00 10 20 30) */ \ + col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \ + col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \ + col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \ + \ + tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \ + tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \ + tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \ + tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \ + \ + col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \ + col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \ + col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \ + col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \ + \ + tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \ + tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \ + tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \ + tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \ + \ + /* Even part */ \ + \ + tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \ + tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \ + tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \ + tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \ + \ + out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \ + out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ + out0 = _mm_slli_pi16(out0, PASS1_BITS); \ + out4 = _mm_slli_pi16(out4, PASS1_BITS); \ + \ + DO_FDCT_COMMON(1) \ + \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], 
out7); \ +} + +#define DO_FDCT_PASS2() { \ + __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \ + __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \ + __m64 row0, row1, row2, row3, row4, row5, row6, row7; \ + __m64 tmp10, tmp11; \ + \ + col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ + col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ + col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \ + col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ + col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \ + col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \ + col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \ + col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \ + \ + /* Transpose coefficients */ \ + \ + col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \ + col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \ + col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \ + col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \ + \ + col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \ + col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \ + col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \ + col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \ + \ + row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \ + row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \ + row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \ + row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \ + \ + tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \ + tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \ + tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \ + 
tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \ + \ + row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \ + row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \ + row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \ + row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \ + \ + tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \ + tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \ + tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \ + tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \ + \ + /* Even part */ \ + \ + tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \ + tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \ + tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \ + tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \ + \ + out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \ + out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ + \ + out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \ + out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \ + out0 = _mm_srai_pi16(out0, PASS1_BITS); \ + out4 = _mm_srai_pi16(out4, PASS1_BITS); \ + \ + DO_FDCT_COMMON(2) \ + \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \ +} + +void jsimd_fdct_islow_mmi(DCTELEM *data) +{ + __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m64 out0, out1, out2, out3, out4, out5, out6, out7; + __m64 tmp12, tmp13; + DCTELEM *dataptr = data; + + /* Pass 1: process rows. */ + + DO_FDCT_PASS1() + dataptr += DCTSIZE * 4; + DO_FDCT_PASS1() + + /* Pass 2: process columns. 
*/ + + dataptr = data; + DO_FDCT_PASS2() + dataptr += 4; + DO_FDCT_PASS2() +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jidctfst-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jidctfst-mmi.c new file mode 100644 index 0000000000..503bb35a3c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jidctfst-mmi.c @@ -0,0 +1,395 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, 2018-2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: LiuQingfa + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* FAST INTEGER INVERSE DCT */ + +#include "jsimd_mmi.h" + + +#define CONST_BITS 8 +#define PASS1_BITS 2 + +#define FIX_1_082 ((short)277) /* FIX(1.082392200) */ +#define FIX_1_414 ((short)362) /* FIX(1.414213562) */ +#define FIX_1_847 ((short)473) /* FIX(1.847759065) */ +#define FIX_2_613 ((short)669) /* FIX(2.613125930) */ +#define FIX_1_613 ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(1) */ + +#define PRE_MULTIPLY_SCALE_BITS 2 +#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + +enum const_index { + index_PW_F1082, + index_PW_F1414, + index_PW_F1847, + index_PW_MF1613, + index_PB_CENTERJSAMP +}; + +static uint64_t const_value[] = { + _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT), + _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT), + _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT), + _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT), + _uint64_set1_pi8(CENTERJSAMPLE) +}; + +#define PW_F1414 get_const_value(index_PW_F1414) +#define PW_F1847 get_const_value(index_PW_F1847) +#define PW_MF1613 get_const_value(index_PW_MF1613) +#define PW_F1082 get_const_value(index_PW_F1082) +#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP) + + +#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32)) +#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64)) + + +#define DO_IDCT_COMMON() { \ + tmp7 = _mm_add_pi16(z11, z13); \ + \ + tmp11 = _mm_sub_pi16(z11, z13); \ + tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ + tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \ + \ + tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \ + tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \ + \ + /* To avoid overflow... 
\ + * \ + * (Original) \ + * tmp12 = -2.613125930 * z10 + z5; \ + * \ + * (This implementation) \ + * tmp12 = (-1.613125930 - 1) * z10 + z5; \ + * = -1.613125930 * z10 - z10 + z5; \ + */ \ + \ + z5 = _mm_add_pi16(tmp10, tmp12); \ + z5 = _mm_mulhi_pi16(z5, PW_F1847); \ + \ + tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \ + tmp10 = _mm_sub_pi16(tmp10, z5); \ + tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \ + tmp12 = _mm_sub_pi16(tmp12, z10); \ + tmp12 = _mm_sub_pi16(tmp12, z10); \ + tmp12 = _mm_sub_pi16(tmp12, z10); \ + tmp12 = _mm_add_pi16(tmp12, z5); \ + \ + /* Final output stage */ \ + \ + tmp6 = _mm_sub_pi16(tmp12, tmp7); \ + tmp5 = _mm_sub_pi16(tmp11, tmp6); \ + tmp4 = _mm_add_pi16(tmp10, tmp5); \ + \ + out0 = _mm_add_pi16(tmp0, tmp7); \ + out7 = _mm_sub_pi16(tmp0, tmp7); \ + out1 = _mm_add_pi16(tmp1, tmp6); \ + out6 = _mm_sub_pi16(tmp1, tmp6); \ + \ + out2 = _mm_add_pi16(tmp2, tmp5); \ + out5 = _mm_sub_pi16(tmp2, tmp5); \ + out4 = _mm_add_pi16(tmp3, tmp4); \ + out3 = _mm_sub_pi16(tmp3, tmp4); \ +} + +#define DO_IDCT_PASS1(iter) { \ + __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \ + __m64 quant0l, quant1l, quant2l, quant3l; \ + __m64 quant4l, quant5l, quant6l, quant7l; \ + __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ + __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ + __m32 col0a, col1a, mm0; \ + \ + col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \ + col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \ + mm0 = _mm_or_si32(col0a, col1a); \ + \ + if (test_m32_zero(mm0)) { \ + __m64 mm1, mm2; \ + \ + col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \ + col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \ + col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \ + col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \ + col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \ + col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \ + col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \ + col7l = 
_mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \ + \ + mm1 = _mm_or_si64(col1l, col3l); \ + mm2 = _mm_or_si64(col2l, col4l); \ + mm1 = _mm_or_si64(mm1, col5l); \ + mm2 = _mm_or_si64(mm2, col6l); \ + mm1 = _mm_or_si64(mm1, col7l); \ + mm1 = _mm_or_si64(mm1, mm2); \ + \ + if (test_m64_zero(mm1)) { \ + __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \ + \ + /* AC terms all zero */ \ + \ + quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ + \ + dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval=(00 10 20 30) */ \ + \ + dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \ + dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \ + \ + row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \ + row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \ + row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \ + row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \ + \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \ + \ + goto nextcolumn##iter; \ + } \ + } \ + \ + /* Even part */ \ + \ + col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ + col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \ + col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \ + col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \ + \ + quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ + quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \ + quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); 
\ + quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \ + \ + tmp0 = _mm_mullo_pi16(col0l, quant0l); \ + tmp1 = _mm_mullo_pi16(col2l, quant2l); \ + tmp2 = _mm_mullo_pi16(col4l, quant4l); \ + tmp3 = _mm_mullo_pi16(col6l, quant6l); \ + \ + tmp10 = _mm_add_pi16(tmp0, tmp2); \ + tmp11 = _mm_sub_pi16(tmp0, tmp2); \ + tmp13 = _mm_add_pi16(tmp1, tmp3); \ + \ + tmp12 = _mm_sub_pi16(tmp1, tmp3); \ + tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ + tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \ + tmp12 = _mm_sub_pi16(tmp12, tmp13); \ + \ + tmp0 = _mm_add_pi16(tmp10, tmp13); \ + tmp3 = _mm_sub_pi16(tmp10, tmp13); \ + tmp1 = _mm_add_pi16(tmp11, tmp12); \ + tmp2 = _mm_sub_pi16(tmp11, tmp12); \ + \ + /* Odd part */ \ + \ + col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ + col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ + col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \ + col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \ + \ + quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \ + quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \ + quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \ + quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \ + \ + tmp4 = _mm_mullo_pi16(col1l, quant1l); \ + tmp5 = _mm_mullo_pi16(col3l, quant3l); \ + tmp6 = _mm_mullo_pi16(col5l, quant5l); \ + tmp7 = _mm_mullo_pi16(col7l, quant7l); \ + \ + z13 = _mm_add_pi16(tmp6, tmp5); \ + z10 = _mm_sub_pi16(tmp6, tmp5); \ + z11 = _mm_add_pi16(tmp4, tmp7); \ + z12 = _mm_sub_pi16(tmp4, tmp7); \ + \ + DO_IDCT_COMMON() \ + \ + /* out0=(00 10 20 30), out1=(01 11 21 31) */ \ + /* out2=(02 12 22 32), out3=(03 13 23 33) */ \ + /* out4=(04 14 24 34), out5=(05 15 25 35) */ \ + /* out6=(06 16 26 36), out7=(07 17 27 37) */ \ + \ + /* Transpose coefficients */ \ + \ + row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \ + row23a = _mm_unpackhi_pi16(out0, out1); /* 
row23a=(20 21 30 31) */ \ + row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \ + row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \ + \ + row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \ + row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \ + row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \ + row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \ + \ + row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \ + row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \ + row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \ + row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \ + \ + row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \ + row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \ + row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \ + row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \ + \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \ +} + +#define DO_IDCT_PASS2(ctr) { \ + __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \ + __m64 col0123a, col0123b, col0123c, col0123d; \ + __m64 col01l, col01h, col23l, col23h; \ + __m64 col0, col1, col2, col3; \ + __m64 row06, row17, row24, row35; \ + \ + row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ + row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ + row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 
2]); /* (20 21 22 23) */ \ + row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ + row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \ + row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \ + row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \ + row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \ + \ + /* Even part */ \ + \ + tmp10 = _mm_add_pi16(row0l, row4l); \ + tmp11 = _mm_sub_pi16(row0l, row4l); \ + tmp13 = _mm_add_pi16(row2l, row6l); \ + \ + tmp12 = _mm_sub_pi16(row2l, row6l); \ + tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ + tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \ + tmp12 = _mm_sub_pi16(tmp12, tmp13); \ + \ + tmp0 = _mm_add_pi16(tmp10, tmp13); \ + tmp3 = _mm_sub_pi16(tmp10, tmp13); \ + tmp1 = _mm_add_pi16(tmp11, tmp12); \ + tmp2 = _mm_sub_pi16(tmp11, tmp12); \ + \ + /* Odd part */ \ + \ + z13 = _mm_add_pi16(row5l, row3l); \ + z10 = _mm_sub_pi16(row5l, row3l); \ + z11 = _mm_add_pi16(row1l, row7l); \ + z12 = _mm_sub_pi16(row1l, row7l); \ + \ + DO_IDCT_COMMON() \ + \ + /* out0=(00 01 02 03), out1=(10 11 12 13) */ \ + /* out2=(20 21 22 23), out3=(30 31 32 33) */ \ + /* out4=(40 41 42 43), out5=(50 51 52 53) */ \ + /* out6=(60 61 62 63), out7=(70 71 72 73) */ \ + \ + out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \ + out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \ + out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \ + out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \ + out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \ + out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \ + out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \ + out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \ + \ + row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \ + row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \ + row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \ + row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 
50 51 52 53) */ \ + \ + row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \ + row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \ + row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \ + row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \ + \ + /* Transpose coefficients */ \ + \ + col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \ + col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \ + col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \ + col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \ + \ + col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \ + col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \ + col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \ + col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \ + \ + col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \ + col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \ + col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \ + col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \ + \ + _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \ + _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \ + _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \ + _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \ +} + +void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m64 tmp10, tmp11, tmp12, tmp13; + __m64 out0, out1, out2, out3, out4, out5, out6, out7; + __m64 z5, z10, z11, z12, z13; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + JCOEF *wsptr; + JCOEF workspace[DCTSIZE2]; /* 
buffers data between passes */ + + /* Pass 1: process columns. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *)dct_table; + wsptr = workspace; + + DO_IDCT_PASS1(1) +nextcolumn1: + inptr += 4; + quantptr += 4; + wsptr += DCTSIZE * 4; + DO_IDCT_PASS1(2) +nextcolumn2: + + /* Pass 2: process rows. */ + + wsptr = workspace; + + DO_IDCT_PASS2(0) + wsptr += 4; + DO_IDCT_PASS2(4) +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jidctint-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jidctint-mmi.c new file mode 100644 index 0000000000..cd3db980c5 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jidctint-mmi.c @@ -0,0 +1,571 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen + * CaiWanwei + * SunZhangzhi + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* ACCURATE INTEGER INVERSE DCT */ + +#include "jsimd_mmi.h" + + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) +#define CENTERJSAMPLE 128 + +#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */ +#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */ +#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */ +#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */ +#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */ +#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */ +#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */ +#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */ +#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */ +#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */ +#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */ +#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */ + +enum const_index { + index_PW_F130_F054, + index_PW_F054_MF130, + index_PW_MF078_F117, + index_PW_F117_F078, + index_PW_MF060_MF089, + index_PW_MF089_F060, + index_PW_MF050_MF256, + index_PW_MF256_F050, + index_PD_DESCALE_P1, + index_PD_DESCALE_P2, + index_PB_CENTERJSAMP +}; + +static uint64_t const_value[] = { + _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765), + FIX_0_541, (FIX_0_541 + FIX_0_765)), + _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541, + (FIX_0_541 - FIX_1_847), FIX_0_541), + _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961), + FIX_1_175, (FIX_1_175 - FIX_1_961)), + _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175, + (FIX_1_175 - FIX_0_390), FIX_1_175), + _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899), + -FIX_0_899, (FIX_0_298 - FIX_0_899)), + _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899, + (FIX_1_501 - FIX_0_899), -FIX_0_899), + _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562), + -FIX_2_562, (FIX_2_053 - FIX_2_562)), + _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562, + (FIX_3_072 - FIX_2_562), -FIX_2_562), + 
_uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))), + _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))), + _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, + CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE) +}; + +#define PW_F130_F054 get_const_value(index_PW_F130_F054) +#define PW_F054_MF130 get_const_value(index_PW_F054_MF130) +#define PW_MF078_F117 get_const_value(index_PW_MF078_F117) +#define PW_F117_F078 get_const_value(index_PW_F117_F078) +#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089) +#define PW_MF089_F060 get_const_value(index_PW_MF089_F060) +#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256) +#define PW_MF256_F050 get_const_value(index_PW_MF256_F050) +#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1) +#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2) +#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP) + + +#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32)) +#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64)) + + +#define DO_IDCT_COMMON(PASS) { \ + __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \ + __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ + __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \ + __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \ + __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \ + \ + z3 = _mm_add_pi16(tmp0, tmp2); \ + z4 = _mm_add_pi16(tmp1, tmp3); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = _mm_unpacklo_pi16(z3, z4); \ + z34h = _mm_unpackhi_pi16(z3, z4); \ + z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \ + z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \ + z4l = _mm_madd_pi16(z34l, PW_F117_F078); \ + z4h = 
_mm_madd_pi16(z34h, PW_F117_F078); \ + \ + /* (Original) \ + * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ + * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ + * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * tmp0 += z1 + z3; tmp1 += z2 + z4; \ + * tmp2 += z2 + z3; tmp3 += z1 + z4; \ + * \ + * (This implementation) \ + * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ + * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ + * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ + * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ + * tmp0 += z3; tmp1 += z4; \ + * tmp2 += z3; tmp3 += z4; \ + */ \ + \ + tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \ + tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \ + \ + tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \ + tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \ + tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \ + tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \ + \ + tmp0l = _mm_add_pi32(tmp0l, z3l); \ + tmp0h = _mm_add_pi32(tmp0h, z3h); \ + tmp3l = _mm_add_pi32(tmp3l, z4l); \ + tmp3h = _mm_add_pi32(tmp3h, z4h); \ + \ + tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \ + tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \ + \ + tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \ + tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \ + tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \ + tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \ + \ + tmp1l = _mm_add_pi32(tmp1l, z4l); \ + tmp1h = _mm_add_pi32(tmp1h, z4h); \ + tmp2l = _mm_add_pi32(tmp2l, z3l); \ + tmp2h = _mm_add_pi32(tmp2h, z3h); \ + \ + /* Final output stage */ \ + \ + out0l = _mm_add_pi32(tmp10l, tmp3l); \ + out0h = _mm_add_pi32(tmp10h, tmp3h); \ + out7l = _mm_sub_pi32(tmp10l, tmp3l); \ + out7h = _mm_sub_pi32(tmp10h, tmp3h); \ + \ + out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \ + out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \ + out0l = 
_mm_srai_pi32(out0l, DESCALE_P##PASS); \ + out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \ + \ + out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \ + out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \ + out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \ + out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \ + \ + out0 = _mm_packs_pi32(out0l, out0h); \ + out7 = _mm_packs_pi32(out7l, out7h); \ + \ + out1l = _mm_add_pi32(tmp11l, tmp2l); \ + out1h = _mm_add_pi32(tmp11h, tmp2h); \ + out6l = _mm_sub_pi32(tmp11l, tmp2l); \ + out6h = _mm_sub_pi32(tmp11h, tmp2h); \ + \ + out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \ + out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \ + out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \ + out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \ + \ + out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \ + out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \ + out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \ + out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \ + \ + out1 = _mm_packs_pi32(out1l, out1h); \ + out6 = _mm_packs_pi32(out6l, out6h); \ + \ + out2l = _mm_add_pi32(tmp12l, tmp1l); \ + out2h = _mm_add_pi32(tmp12h, tmp1h); \ + out5l = _mm_sub_pi32(tmp12l, tmp1l); \ + out5h = _mm_sub_pi32(tmp12h, tmp1h); \ + \ + out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \ + out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \ + out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \ + out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \ + \ + out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \ + out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \ + out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \ + out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \ + \ + out2 = _mm_packs_pi32(out2l, out2h); \ + out5 = _mm_packs_pi32(out5l, out5h); \ + \ + out3l = _mm_add_pi32(tmp13l, tmp0l); \ + out3h = _mm_add_pi32(tmp13h, tmp0h); \ + \ + out4l = _mm_sub_pi32(tmp13l, tmp0l); \ + out4h = _mm_sub_pi32(tmp13h, tmp0h); \ + \ + out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \ + out3h = 
_mm_add_pi32(out3h, PD_DESCALE_P##PASS); \ + out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \ + out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \ + \ + out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \ + out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \ + out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \ + out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \ + \ + out3 = _mm_packs_pi32(out3l, out3h); \ + out4 = _mm_packs_pi32(out4l, out4h); \ +} + +#define DO_IDCT_PASS1(iter) { \ + __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \ + __m64 quant0l, quant1l, quant2l, quant3l; \ + __m64 quant4l, quant5l, quant6l, quant7l; \ + __m64 z23, z2, z3, z23l, z23h; \ + __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ + __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ + __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ + __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \ + __m32 col0a, col1a, mm0; \ + \ + col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \ + col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \ + mm0 = _mm_or_si32(col0a, col1a); \ + \ + if (test_m32_zero(mm0)) { \ + __m64 mm1, mm2; \ + \ + col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \ + col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \ + col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \ + col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \ + col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \ + col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \ + col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \ + col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \ + \ + mm1 = _mm_or_si64(col1l, col3l); \ + mm2 = _mm_or_si64(col2l, col4l); \ + mm1 = _mm_or_si64(mm1, col5l); \ + mm2 = _mm_or_si64(mm2, col6l); \ + mm1 = _mm_or_si64(mm1, col7l); \ + mm1 = _mm_or_si64(mm1, mm2); \ + \ + if (test_m64_zero(mm1)) { \ + __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \ + \ + /* AC terms all zero */ \ + \ + 
quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ + \ + dcval = _mm_mullo_pi16(col0l, quant0l); \ + dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \ + \ + dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \ + dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \ + \ + row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \ + row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \ + row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \ + row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \ + \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \ + \ + goto nextcolumn##iter; \ + } \ + } \ + \ + /* Even part \ + * \ + * (Original) \ + * z1 = (z2 + z3) * 0.541196100; \ + * tmp2 = z1 + z3 * -1.847759065; \ + * tmp3 = z1 + z2 * 0.765366865; \ + * \ + * (This implementation) \ + * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ + * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ + */ \ + \ + col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ + col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \ + col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \ + col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \ + \ + quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ + quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \ + quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \ + quant6l = _mm_load_si64((__m64 
*)&quantptr[DCTSIZE * 6]); \ + \ + z2 = _mm_mullo_pi16(col2l, quant2l); \ + z3 = _mm_mullo_pi16(col6l, quant6l); \ + \ + z23l = _mm_unpacklo_pi16(z2, z3); \ + z23h = _mm_unpackhi_pi16(z2, z3); \ + tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \ + tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \ + tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \ + tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \ + \ + z2 = _mm_mullo_pi16(col0l, quant0l); \ + z3 = _mm_mullo_pi16(col4l, quant4l); \ + \ + z23 = _mm_add_pi16(z2, z3); \ + tmp0l = _mm_loadlo_pi16_f(z23); \ + tmp0h = _mm_loadhi_pi16_f(z23); \ + tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \ + tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \ + \ + tmp10l = _mm_add_pi32(tmp0l, tmp3l); \ + tmp10h = _mm_add_pi32(tmp0h, tmp3h); \ + tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \ + tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \ + \ + z23 = _mm_sub_pi16(z2, z3); \ + tmp1l = _mm_loadlo_pi16_f(z23); \ + tmp1h = _mm_loadhi_pi16_f(z23); \ + tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \ + tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \ + \ + tmp11l = _mm_add_pi32(tmp1l, tmp2l); \ + tmp11h = _mm_add_pi32(tmp1h, tmp2h); \ + tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \ + tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \ + \ + /* Odd part */ \ + \ + col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ + col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ + col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \ + col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \ + \ + quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \ + quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \ + quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \ + quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \ + \ + tmp0 = _mm_mullo_pi16(col7l, quant7l); \ + tmp1 = _mm_mullo_pi16(col5l, quant5l); \ + tmp2 = _mm_mullo_pi16(col3l, quant3l); \ + tmp3 = _mm_mullo_pi16(col1l, quant1l); \ + 
\ + DO_IDCT_COMMON(1) \ + \ + /* out0=(00 10 20 30), out1=(01 11 21 31) */ \ + /* out2=(02 12 22 32), out3=(03 13 23 33) */ \ + /* out4=(04 14 24 34), out5=(05 15 25 35) */ \ + /* out6=(06 16 26 36), out7=(07 17 27 37) */ \ + \ + /* Transpose coefficients */ \ + \ + row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \ + row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \ + row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \ + row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \ + \ + row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \ + row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \ + row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \ + row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \ + \ + row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \ + row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \ + row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \ + row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \ + \ + row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \ + row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \ + row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \ + row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \ + \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \ +} + +#define DO_IDCT_PASS2(ctr) { \ + __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, 
row7l; \ + __m64 z23, z23l, z23h; \ + __m64 col0123a, col0123b, col0123c, col0123d; \ + __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \ + __m64 col0, col1, col2, col3; \ + __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ + __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \ + \ + row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ + row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ + row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ + row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ + row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \ + row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \ + row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \ + row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \ + \ + /* Even part \ + * \ + * (Original) \ + * z1 = (z2 + z3) * 0.541196100; \ + * tmp2 = z1 + z3 * -1.847759065; \ + * tmp3 = z1 + z2 * 0.765366865; \ + * \ + * (This implementation) \ + * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ + * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ + */ \ + \ + z23l = _mm_unpacklo_pi16(row2l, row6l); \ + z23h = _mm_unpackhi_pi16(row2l, row6l); \ + \ + tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \ + tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \ + tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \ + tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \ + \ + z23 = _mm_add_pi16(row0l, row4l); \ + tmp0l = _mm_loadlo_pi16_f(z23); \ + tmp0h = _mm_loadhi_pi16_f(z23); \ + tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \ + tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \ + \ + tmp10l = _mm_add_pi32(tmp0l, tmp3l); \ + tmp10h = _mm_add_pi32(tmp0h, tmp3h); \ + tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \ + tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \ + \ + z23 = _mm_sub_pi16(row0l, row4l); \ + tmp1l 
= _mm_loadlo_pi16_f(z23); \ + tmp1h = _mm_loadhi_pi16_f(z23); \ + tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \ + tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \ + \ + tmp11l = _mm_add_pi32(tmp1l, tmp2l); \ + tmp11h = _mm_add_pi32(tmp1h, tmp2h); \ + tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \ + tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \ + \ + /* Odd part */ \ + \ + tmp0 = row7l; \ + tmp1 = row5l; \ + tmp2 = row3l; \ + tmp3 = row1l; \ + \ + DO_IDCT_COMMON(2) \ + \ + /* out0=(00 01 02 03), out1=(10 11 12 13) */ \ + /* out2=(20 21 22 23), out3=(30 31 32 33) */ \ + /* out4=(40 41 42 43), out5=(50 51 52 53) */ \ + /* out6=(60 61 62 63), out7=(70 71 72 73) */ \ + \ + row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \ + row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \ + row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \ + row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \ + \ + row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \ + row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \ + row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \ + row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \ + \ + /* Transpose coefficients */ \ + \ + col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \ + col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \ + col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \ + col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \ + \ + col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \ + col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \ + col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \ + col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \ + \ + col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 
20 30 40 50 60 70) */ \ + col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \ + col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \ + col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \ + \ + _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \ + _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \ + _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \ + _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \ +} + +void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + __m64 tmp0, tmp1, tmp2, tmp3; + __m64 out0, out1, out2, out3, out4, out5, out6, out7; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + JCOEF *wsptr; + JCOEF workspace[DCTSIZE2]; /* buffers data between passes */ + + /* Pass 1: process columns. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *)dct_table; + wsptr = workspace; + + DO_IDCT_PASS1(1) +nextcolumn1: + inptr += 4; + quantptr += 4; + wsptr += DCTSIZE * 4; + DO_IDCT_PASS1(2) +nextcolumn2: + + /* Pass 2: process rows. */ + + wsptr = workspace; + + DO_IDCT_PASS2(0) + wsptr += 4; + DO_IDCT_PASS2(4) +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jquanti-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jquanti-mmi.c new file mode 100644 index 0000000000..339002fd80 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jquanti-mmi.c @@ -0,0 +1,124 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen + * CaiWanwei + * SunZhangzhi + * Copyright (C) 2018-2019, D. R. Commander. All Rights Reserved. + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. 
In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ + +#include "jsimd_mmi.h" + + +#define DO_QUANT() { \ + __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \ + __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \ + \ + rowl = _mm_load_si64((__m64 *)&workspace[0]); \ + rowh = _mm_load_si64((__m64 *)&workspace[4]); \ + \ + /* Branch-less absolute value */ \ + rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1)); /* -1 if value < 0, */ \ + /* 0 otherwise */ \ + rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \ + \ + rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \ + rowh = _mm_xor_si64(rowh, rowhs); \ + rowl = _mm_sub_pi16(rowl, rowls); \ + rowh = _mm_sub_pi16(rowh, rowhs); \ + \ + corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \ + corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \ + \ + rowlsave = rowl = _mm_add_pi16(rowl, corrl); /* correction + roundfactor */ \ + rowhsave = rowh = _mm_add_pi16(rowh, corrh); \ + \ + recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \ + reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \ + \ + rowl = _mm_mulhi_pi16(rowl, recipl); \ + rowh = _mm_mulhi_pi16(rowh, reciph); \ + \ + /* reciprocal 
is always negative (MSB=1), so we always need to add the */ \ + /* initial value (input value is never negative as we inverted it at the */ \ + /* start of this routine) */ \ + rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \ + rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \ + \ + scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \ + scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \ + \ + rowl = _mm_mulhi_pi16(rowl, scalel); \ + rowh = _mm_mulhi_pi16(rowh, scaleh); \ + \ + /* determine if scale is negative */ \ + scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \ + scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \ + \ + /* and add input if it is */ \ + scalel = _mm_and_si64(scalel, rowlsave); \ + scaleh = _mm_and_si64(scaleh, rowhsave); \ + rowl = _mm_add_pi16(rowl, scalel); \ + rowh = _mm_add_pi16(rowh, scaleh); \ + \ + /* then check if negative input */ \ + rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \ + rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \ + \ + /* and add scale if it is */ \ + rowlsave = _mm_and_si64(rowlsave, scalel); \ + rowhsave = _mm_and_si64(rowhsave, scaleh); \ + rowl = _mm_add_pi16(rowl, rowlsave); \ + rowh = _mm_add_pi16(rowh, rowhsave); \ + \ + rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \ + rowh = _mm_xor_si64(rowh, rowhs); \ + rowl = _mm_sub_pi16(rowl, rowls); \ + rowh = _mm_sub_pi16(rowh, rowhs); \ + \ + _mm_store_si64((__m64 *)&output_ptr[0], rowl); \ + _mm_store_si64((__m64 *)&output_ptr[4], rowh); \ + \ + workspace += DCTSIZE; \ + divisors += DCTSIZE; \ + output_ptr += DCTSIZE; \ +} + + +void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + JCOEFPTR output_ptr = coef_block; + + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c new file mode 100644 index 0000000000..e8f1af562b --- 
/dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c
@@ -0,0 +1,870 @@
+/*
+ * jsimd_mips64.c
+ *
+ * Copyright 2009 Pierre Ossman for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, 2018, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/* Return 1 if 'buffer' starts with "ASEs implemented" and lists 'feature'
+ * after it as a whitespace-delimited word; return 0 otherwise. */
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *start, *p;
+  size_t len = strlen(feature);
+
+  if (len == 0)
+    return 0;
+  if (strncmp(buffer, "ASEs implemented", 16) != 0)
+    return 0;
+  buffer += 16;
+  /* isspace() is undefined for negative char values, hence the casts */
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+  start = buffer;
+
+  /* Accept a match only if whitespace or the ends of the text delimit it.
+   * 'start' stays fixed so that the left-hand check can never be skipped. */
+  while ((p = strstr(buffer, feature))) {
+    if ((p == start || isspace((unsigned char)p[-1])) &&
+        (p[len] == 0 || isspace((unsigned char)p[len])))
+      return 1;
+    buffer = p + 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer!
*/ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "loongson-mmi")) + simd_support |= JSIMD_MMI; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char *env = NULL; +#endif +#if defined(__linux__) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__linux__) + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#elif defined(__mips_loongson_vector_rev) + /* Only enable MMI by default on non-Linux platforms when the compiler flags + * support it. */ + simd_support |= JSIMD_MMI; +#endif + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCEMMI"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_MMI; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + 
if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_c_can_null_convert(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + mmifct = jsimd_extrgb_ycc_convert_mmi; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mmifct = jsimd_extrgbx_ycc_convert_mmi; + break; + case JCS_EXT_BGR: + mmifct = jsimd_extbgr_ycc_convert_mmi; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mmifct = jsimd_extbgrx_ycc_convert_mmi; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mmifct = jsimd_extxbgr_ycc_convert_mmi; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mmifct = jsimd_extxrgb_ycc_convert_mmi; + break; + default: + mmifct = jsimd_rgb_ycc_convert_mmi; + break; + } + + mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + mmifct = jsimd_extrgb_gray_convert_mmi; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mmifct = jsimd_extrgbx_gray_convert_mmi; + break; + case JCS_EXT_BGR: + mmifct = jsimd_extbgr_gray_convert_mmi; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mmifct = jsimd_extbgrx_gray_convert_mmi; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mmifct = jsimd_extxbgr_gray_convert_mmi; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mmifct = jsimd_extxrgb_gray_convert_mmi; + break; + default: + 
mmifct = jsimd_rgb_gray_convert_mmi; + break; + } + + mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + mmifct = jsimd_ycc_extrgb_convert_mmi; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mmifct = jsimd_ycc_extrgbx_convert_mmi; + break; + case JCS_EXT_BGR: + mmifct = jsimd_ycc_extbgr_convert_mmi; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mmifct = jsimd_ycc_extbgrx_convert_mmi; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mmifct = jsimd_ycc_extxbgr_convert_mmi; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mmifct = jsimd_ycc_extxrgb_convert_mmi; + break; + default: + mmifct = jsimd_ycc_rgb_convert_mmi; + break; + } + + mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + +GLOBAL(void) +jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v2_smooth_downsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor, + 
compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_int_upsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + 
jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi; + break; + case JCS_EXT_BGR: + mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi; + break; + default: + mmifct = jsimd_h2v2_merged_upsample_mmi; + break; + } + + mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + 
mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi; + break; + case JCS_EXT_BGR: + mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi; + break; + default: + mmifct = jsimd_h2v1_merged_upsample_mmi; + break; + } + + mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + jsimd_fdct_islow_mmi(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_mmi(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + 
return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + jsimd_quantize_mmi(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_6x6(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_12x12(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if (simd_support & 
JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd_mmi.h b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd_mmi.h new file mode 100644 index 0000000000..5e4261c9d9 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd_mmi.h @@ -0,0 +1,69 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. 
+ * Authors: ZhuChen + * CaiWanwei + * SunZhangzhi + * QingfaLiu + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jdct.h" +#include "loongson-mmintrin.h" + + +/* Common code */ +#if defined(_ABI64) && _MIPS_SIM == _ABI64 +# define PTR_ADDU "daddu " +# define PTR_SLL "dsll " +#else +# define PTR_ADDU "addu " +# define PTR_SLL "sll " +#endif + +#define SIZEOF_MMWORD 8 +#define BYTE_BIT 8 +#define WORD_BIT 16 +#define SCALEBITS 16 + +#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \ + (((uint64_t)(uint8_t)(a) << 56) | \ + ((uint64_t)(uint8_t)(b) << 48) | \ + ((uint64_t)(uint8_t)(c) << 40) | \ + ((uint64_t)(uint8_t)(d) << 32) | \ + ((uint64_t)(uint8_t)(e) << 24) | \ + ((uint64_t)(uint8_t)(f) << 16) | \ + ((uint64_t)(uint8_t)(g) << 8) | \ + ((uint64_t)(uint8_t)(h))) +#define _uint64_set1_pi8(a) _uint64_set_pi8(a, a, a, a, a, a, a, a) +#define _uint64_set_pi16(a, b, c, d) \ + (((uint64_t)(uint16_t)(a) << 48) | \ + ((uint64_t)(uint16_t)(b) << 32) | \ + ((uint64_t)(uint16_t)(c) << 16) | \ + ((uint64_t)(uint16_t)(d))) +#define _uint64_set1_pi16(a) _uint64_set_pi16(a, a, a, a) +#define
_uint64_set_pi32(a, b) \ + (((uint64_t)(uint32_t)(a) << 32) | \ + ((uint64_t)(uint32_t)(b))) + +#define get_const_value(index) (*(__m64 *)&const_value[index]) diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/loongson-mmintrin.h b/3rdparty/libjpeg-turbo/src/simd/mips64/loongson-mmintrin.h new file mode 100644 index 0000000000..db9b35ab60 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/mips64/loongson-mmintrin.h @@ -0,0 +1,1334 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Copyright (C) 2019, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#ifndef __LOONGSON_MMINTRIN_H__ +#define __LOONGSON_MMINTRIN_H__ + +#include <stdint.h> + + +#define FUNCTION_ATTRIBS \ + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + + +/* Vectors are stored in 64-bit floating-point registers. */ +typedef double __m64; + +/* Having a 32-bit datatype allows us to use 32-bit loads in places like + load8888.
*/ +typedef float __m32; + + +/********** Set Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_setzero_si64(void) +{ + return 0.0; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4, + uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0) +{ + __m64 ret; + uint32_t lo = ((uint32_t)__b6 << 24) | + ((uint32_t)__b4 << 16) | + ((uint32_t)__b2 << 8) | + (uint32_t)__b0; + uint32_t hi = ((uint32_t)__b7 << 24) | + ((uint32_t)__b5 << 16) | + ((uint32_t)__b3 << 8) | + (uint32_t)__b1; + + asm("mtc1 %1, %0\n\t" + "mtc1 %2, $f0\n\t" + "punpcklbh %0, %0, $f0\n\t" + : "=f" (ret) + : "r" (lo), "r" (hi) + : "$f0" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0) +{ + __m64 ret; + uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0; + uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1; + + asm("mtc1 %1, %0\n\t" + "mtc1 %2, $f0\n\t" + "punpcklhw %0, %0, $f0\n\t" + : "=f" (ret) + : "r" (lo), "r" (hi) + : "$f0" + ); + + return ret; +} + +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set_pi32(uint32_t __i1, uint32_t __i0) +{ + if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) { + uint64_t val = ((uint64_t)__i1 << 32) | + ((uint64_t)__i0 << 0); + + return *(__m64 *)&val; + } else if (__i1 == __i0) { + uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0); + __m64 ret; + + asm("pshufh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm) + ); + + return ret; + } else { + uint64_t val = ((uint64_t)__i1 << 32) | + ((uint64_t)__i0 << 0); + + return *(__m64 *)&val; + } +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set1_pi8(uint8_t __b0) +{ + __m64 ret; + + asm("sll $8, %1, 8\n\t" + "or %1, %1, $8\n\t" + "mtc1 %1, %0\n\t" + "mtc1 $0, $f0\n\t" + "pshufh %0, %0, $f0\n\t" + : "=f" (ret) + : "r" 
(__b0) + : "$8", "$f0" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set1_pi16(uint16_t __h0) +{ + __m64 ret; + + asm("mtc1 %1, %0\n\t" + "mtc1 $0, $f0\n\t" + "pshufh %0, %0, $f0\n\t" + : "=f" (ret) + : "r" (__h0) + : "$8", "$f0" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set1_pi32(unsigned __i0) +{ + return _mm_set_pi32(__i0, __i0); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3, + uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7) +{ + return _mm_set_pi8(__h7, __h6, __h5, __h4, + __h3, __h2, __h1, __h0); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3) +{ + return _mm_set_pi16(__w3, __w2, __w1, __w0); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_setr_pi32(uint32_t __i0, uint32_t __i1) +{ + return _mm_set_pi32(__i1, __i0); +} + + +/********** Arithmetic Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_add_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_add_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_add_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_add_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_adds_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddsb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS 
+_mm_adds_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddsh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_adds_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddusb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_adds_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddush %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_avg_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pavgb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_avg_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pavgh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_madd_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmaddhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_max_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmaxsh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_max_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmaxub %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_min_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pminsh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_min_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pminub %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline int FUNCTION_ATTRIBS +_mm_movemask_pi8(__m64 __m1) +{ + int ret; + + 
asm("pmovmskb %0, %1\n\t" + : "=r" (ret) + : "y" (__m1) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_mulhi_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmulhh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_mulhi_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmulhuh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_mullo_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmullh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_mul_pu32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmuluw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sad_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psadbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_asub_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pasubub %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_biadd_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("biadd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sub_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sub_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sub_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" 
(__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sub_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_subs_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubsb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_subs_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubsh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_subs_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubusb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_subs_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubush %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +/********** Logical Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_and_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("and %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_andnot_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("andn %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_or_si32(__m32 __m1, __m32 __m2) +{ + __m32 ret; + + asm("or %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_or_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("or %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_xor_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("xor %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + 
return ret; +} + + +/********** Shift Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_slli_pi16(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psllh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_slli_pi32(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psllw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_slli_si64(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("dsll %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srli_pi16(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psrlh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srli_pi32(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psrlw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srli_si64(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("dsrl %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srai_pi16(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psrah %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srai_pi32(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psraw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srai_si64(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("dsra %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + + +/********** Conversion Intrinsics 
**********/ + +extern __inline __m64 FUNCTION_ATTRIBS +to_m64(uint64_t x) +{ + return *(__m64 *)&x; +} + +extern __inline uint64_t FUNCTION_ATTRIBS +to_uint64(__m64 x) +{ + return *(uint64_t *)&x; +} + + +/********** Comparison Intrinsics **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpeq_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpeqb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpeq_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpeqh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpeq_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpeqw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpgt_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpgtb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpgt_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpgth %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpgt_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpgtw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmplt_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpltb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmplt_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmplth %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmplt_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpltw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + 
); + + return ret; +} + + +/********** Miscellaneous Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_packs_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("packsshb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_packs_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("packsswh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_packs_pi32_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("packsswh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_packs_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("packushb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_extract_pi16(__m64 __m, int64_t __pos) +{ + __m64 ret; + + asm("pextrh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__pos) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos) +{ + __m64 ret; + + switch (__pos) { + case 0: + + asm("pinsrh_0 %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2), "i" (__pos) + ); + + break; + + case 1: + + asm("pinsrh_1 %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2), "i" (__pos) + ); + + break; + case 2: + + asm("pinsrh_2 %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2), "i" (__pos) + ); + + break; + + case 3: + + asm("pinsrh_3 %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2), "i" (__pos) + ); + + break; + } + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_shuffle_pi16(__m64 __m, int64_t __n) +{ + __m64 ret; + + asm("pshufh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__n) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) +{ + 
__m64 ret; + + asm("punpckhbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpckhbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpckhhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpckhhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpckhwd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype, + which preserves the data. */ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32, + datatype, which allows load8888 to use 32-bit loads. 
*/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklwd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklwd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline void FUNCTION_ATTRIBS +_mm_store_pi32(__m32 *dest, __m64 src) +{ + src = _mm_packs_pu16(src, _mm_setzero_si64()); + + asm("swc1 %1, %0\n\t" + : "=m" (*dest) + : "f" (src) + : "memory" + ); +} + +extern __inline void FUNCTION_ATTRIBS +_mm_store_si64(__m64 *dest, __m64 src) +{ + asm("sdc1 %1, %0 \n\t" + : "=m" (*dest) + : "f" (src) + : "memory" + ); +} + +extern __inline void FUNCTION_ATTRIBS +_mm_storeu_si64(__m64 *dest, __m64 src) +{ + asm("gssdlc1 %1, 7(%0) \n\t" + "gssdrc1 %1, 0(%0) \n\t" + : + : "r" (dest), "f" (src) + : "memory" + ); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_load_si32(const __m32 *src) +{ + __m32 ret; + + asm("lwc1 %0, %1\n\t" + : "=f" (ret) + : "m" (*src) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_load_si64(const __m64 *src) +{ + __m64 ret; + + asm("ldc1 %0, %1\n\t" + : "=f" (ret) + : "m" (*src) + : "memory" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS 
+_mm_loadu_si64(const __m64 *src) +{ + __m64 ret; + + asm("gsldlc1 %0, 7(%1)\n\t" + "gsldrc1 %0, 0(%1)\n\t" + : "=f" (ret) + : "r" (src) + : "memory" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadlo_pi8(const uint32_t *src) +{ + return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadlo_pi8_f(__m64 src) +{ + return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadhi_pi8_f(__m64 src) +{ + return _mm_unpackhi_pi8_f(src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadlo_pi16(__m64 src) +{ + return _mm_unpacklo_pi16(src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadlo_pi16_f(__m64 src) +{ + return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadhi_pi16(__m64 src) +{ + return _mm_unpackhi_pi16(src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadhi_pi16_f(__m64 src) +{ + return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_expand_alpha(__m64 pixel) +{ + return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3)); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_expand_alpha_rev(__m64 pixel) +{ + return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0)); +} + +#endif /* __LOONGSON_MMINTRIN_H__ */ diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jcolsamp.inc b/3rdparty/libjpeg-turbo/src/simd/nasm/jcolsamp.inc new file mode 100644 index 0000000000..6f6d7f29d1 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jcolsamp.inc @@ -0,0 +1,135 @@ +; +; jcolsamp.inc - private declarations for color conversion & up/downsampling +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc + +; -------------------------------------------------------------------------- + +; pseudo-registers to make ordering of RGB configurable +; +%if RGB_RED == 0 +%define mmA mm0 +%define mmB mm1 +%define xmmA xmm0 +%define xmmB xmm1 +%define ymmA ymm0 +%define ymmB ymm1 +%elif RGB_GREEN == 0 +%define mmA mm2 +%define mmB mm3 +%define xmmA xmm2 +%define xmmB xmm3 +%define ymmA ymm2 +%define ymmB ymm3 +%elif RGB_BLUE == 0 +%define mmA mm4 +%define mmB mm5 +%define xmmA xmm4 +%define xmmB xmm5 +%define ymmA ymm4 +%define ymmB ymm5 +%else +%define mmA mm6 +%define mmB mm7 +%define xmmA xmm6 +%define xmmB xmm7 +%define ymmA ymm6 +%define ymmB ymm7 +%endif + +%if RGB_RED == 1 +%define mmC mm0 +%define mmD mm1 +%define xmmC xmm0 +%define xmmD xmm1 +%define ymmC ymm0 +%define ymmD ymm1 +%elif RGB_GREEN == 1 +%define mmC mm2 +%define mmD mm3 +%define xmmC xmm2 +%define xmmD xmm3 +%define ymmC ymm2 +%define ymmD ymm3 +%elif RGB_BLUE == 1 +%define mmC mm4 +%define mmD mm5 +%define xmmC xmm4 +%define xmmD xmm5 +%define ymmC ymm4 +%define ymmD ymm5 +%else +%define mmC mm6 +%define mmD mm7 +%define xmmC xmm6 +%define xmmD xmm7 +%define ymmC ymm6 +%define ymmD ymm7 +%endif + +%if RGB_RED == 2 +%define mmE mm0 +%define mmF mm1 +%define xmmE xmm0 +%define xmmF xmm1 +%define ymmE ymm0 +%define ymmF ymm1 +%elif RGB_GREEN == 2 +%define mmE mm2 +%define mmF mm3 +%define xmmE xmm2 +%define xmmF xmm3 +%define ymmE ymm2 +%define ymmF ymm3 +%elif RGB_BLUE == 2 +%define mmE mm4 +%define mmF mm5 +%define xmmE xmm4 +%define xmmF xmm5 +%define ymmE ymm4 +%define ymmF ymm5 +%else +%define mmE mm6 +%define mmF mm7 +%define xmmE xmm6 +%define xmmF xmm7 +%define ymmE ymm6 +%define ymmF ymm7 +%endif + +%if RGB_RED == 3 +%define mmG mm0 +%define mmH mm1 +%define xmmG xmm0 +%define xmmH xmm1 +%define ymmG ymm0 +%define ymmH ymm1 +%elif RGB_GREEN == 3 +%define mmG mm2 +%define mmH mm3 +%define xmmG xmm2 +%define xmmH xmm3
+%define ymmG ymm2 +%define ymmH ymm3 +%elif RGB_BLUE == 3 +%define mmG mm4 +%define mmH mm5 +%define xmmG xmm4 +%define xmmH xmm5 +%define ymmG ymm4 +%define ymmH ymm5 +%else +%define mmG mm6 +%define mmH mm7 +%define xmmG xmm6 +%define xmmH xmm7 +%define ymmG ymm6 +%define ymmH ymm7 +%endif + +; -------------------------------------------------------------------------- diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jdct.inc b/3rdparty/libjpeg-turbo/src/simd/nasm/jdct.inc new file mode 100644 index 0000000000..9192f66f0c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jdct.inc @@ -0,0 +1,31 @@ +; +; jdct.inc - private declarations for forward & reverse DCT subsystems +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2018, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc + +; Each IDCT routine is responsible for range-limiting its results and +; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could +; be quite far out of range if the input data is corrupt, so a bulletproof +; range-limiting step is required. We use a mask-and-table-lookup method +; to do the combined operations quickly. 
+; +%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples + +%define ROW(n, b, s) ((b) + (n) * (s)) +%define COL(n, b, s) ((b) + (n) * (s) * DCTSIZE) + +%define DWBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD) +%define MMBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD) +%define XMMBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD) +%define YMMBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD) + +; -------------------------------------------------------------------------- diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc new file mode 100644 index 0000000000..667024a5f9 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc @@ -0,0 +1,93 @@ +; +; Automatically generated include file from jsimdcfg.inc.h +; +; +; -- jpeglib.h +; +%define DCTSIZE 8 +%define DCTSIZE2 64 +; +; -- jmorecfg.h +; +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define EXT_RGB_RED 0 +%define EXT_RGB_GREEN 1 +%define EXT_RGB_BLUE 2 +%define EXT_RGB_PIXELSIZE 3 +%define EXT_RGBX_RED 0 +%define EXT_RGBX_GREEN 1 +%define EXT_RGBX_BLUE 2 +%define EXT_RGBX_PIXELSIZE 4 +%define EXT_BGR_RED 2 +%define EXT_BGR_GREEN 1 +%define EXT_BGR_BLUE 0 +%define EXT_BGR_PIXELSIZE 3 +%define EXT_BGRX_RED 2 +%define EXT_BGRX_GREEN 1 +%define EXT_BGRX_BLUE 0 +%define EXT_BGRX_PIXELSIZE 4 +%define EXT_XBGR_RED 3 +%define EXT_XBGR_GREEN 2 +%define EXT_XBGR_BLUE 1 +%define EXT_XBGR_PIXELSIZE 4 +%define EXT_XRGB_RED 1 +%define EXT_XRGB_GREEN 2 +%define EXT_XRGB_BLUE 3 +%define EXT_XRGB_PIXELSIZE 4 +%define RGBX_FILLER_0XFF 1 +; Representation of a single sample (pixel element value). +; On this SIMD implementation, this must be 'unsigned char'. 
+; +%define JSAMPLE byte ; unsigned char +%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) +%define CENTERJSAMPLE 128 +; Representation of a DCT frequency coefficient. +; On this SIMD implementation, this must be 'short'. +; +%define JCOEF word ; short +%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) +; Datatype used for image dimensions. +; On this SIMD implementation, this must be 'unsigned int'. +; +%define JDIMENSION dword ; unsigned int +%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) +%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) +%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) +%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) +%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) +%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) +%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) +%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) +%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) +; +; -- jdct.h +; +; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; +; the DCT is to be performed in-place in that buffer. +; To maximize parallelism, Type DCTELEM is changed to short (originally, int). +; +%define DCTELEM word ; short +%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) +%define FAST_FLOAT FP32 ; float +%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) +; To maximize parallelism, Type MULTIPLIER is changed to short.
+; +%define ISLOW_MULT_TYPE word ; must be short +%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) +%define IFAST_MULT_TYPE word ; must be short +%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) +%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors +%define FLOAT_MULT_TYPE FP32 ; must be float +%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) +; +; -- jsimd.h +; +%define JSIMD_NONE 0x00 +%define JSIMD_MMX 0x01 +%define JSIMD_3DNOW 0x02 +%define JSIMD_SSE 0x04 +%define JSIMD_SSE2 0x08 +%define JSIMD_AVX2 0x80 diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc.h b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc.h new file mode 100644 index 0000000000..bf2a45ad50 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc.h @@ -0,0 +1,133 @@ +/* + * This file generates the include file for the assembly + * implementations by abusing the C preprocessor. + * + * Note: Some things are manually defined as they need to + * be mapped to NASM types. 
+ */ + +; +; Automatically generated include file from jsimdcfg.inc.h +; + +#define JPEG_INTERNALS + +#include "../jpeglib.h" +#include "../jconfig.h" +#include "../jmorecfg.h" +#include "jsimd.h" + +; +; -- jpeglib.h +; + +%define _cpp_protection_DCTSIZE DCTSIZE +%define _cpp_protection_DCTSIZE2 DCTSIZE2 + +; +; -- jmorecfg.h +; + +%define _cpp_protection_RGB_RED RGB_RED +%define _cpp_protection_RGB_GREEN RGB_GREEN +%define _cpp_protection_RGB_BLUE RGB_BLUE +%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE + +%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED +%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN +%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE +%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE + +%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED +%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN +%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE +%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE + +%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED +%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN +%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE +%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE + +%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED +%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN +%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE +%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE + +%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED +%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN +%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE +%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE + +%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED +%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN +%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE +%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE + +%define RGBX_FILLER_0XFF 1 + +; Representation of a single sample (pixel element value). +; On this SIMD implementation, this must be 'unsigned char'. 
+; + +%define JSAMPLE byte ; unsigned char +%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) + +%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE + +; Representation of a DCT frequency coefficient. +; On this SIMD implementation, this must be 'short'. +; +%define JCOEF word ; short +%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) + +; Datatype used for image dimensions. +; On this SIMD implementation, this must be 'unsigned int'. +; +%define JDIMENSION dword ; unsigned int +%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) + +%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) +%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) +%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) +%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) +%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) +%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) +%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) +%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) + +; +; -- jdct.h +; + +; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; +; the DCT is to be performed in-place in that buffer. +; To maximize parallelism, Type DCTELEM is changed to short (originally, int). +; +%define DCTELEM word ; short +%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) + +%define FAST_FLOAT FP32 ; float +%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) + +; To maximize parallelism, Type MULTIPLIER is changed to short. 
+; +%define ISLOW_MULT_TYPE word ; must be short +%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) + +%define IFAST_MULT_TYPE word ; must be short +%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) +%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors + +%define FLOAT_MULT_TYPE FP32 ; must be float +%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) + +; +; -- jsimd.h +; + +%define _cpp_protection_JSIMD_NONE JSIMD_NONE +%define _cpp_protection_JSIMD_MMX JSIMD_MMX +%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW +%define _cpp_protection_JSIMD_SSE JSIMD_SSE +%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2 +%define _cpp_protection_JSIMD_AVX2 JSIMD_AVX2 diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc new file mode 100644 index 0000000000..e8d50b0349 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc @@ -0,0 +1,520 @@ +; +; jsimdext.inc - common declarations +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander. +; Copyright (C) 2018, Matthieu Darbois. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library - version 1.02 +; +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. 
Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + +; ========================================================================== +; System-dependent configurations + +%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)-------- +; * Microsoft Visual C++ +; * MinGW (Minimalist GNU for Windows) +; * CygWin +; * LCC-Win32 + +; -- segment definition -- +; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=32 +%define SEG_CONST .rdata align=32 +%else +%define SEG_TEXT .text align=32 public use32 class=CODE +%define SEG_CONST .rdata align=32 public use32 class=CONST +%endif + +%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- +; * Microsoft Visual C++ + +; -- segment definition -- +; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=32 +%define SEG_CONST .rdata align=32 +%else +%define SEG_TEXT .text align=32 public use64 class=CODE +%define SEG_CONST .rdata align=32 public use64 class=CONST +%endif +%define EXTN(name) name ; foo() -> foo + +%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- +; * Borland C++ (Win32) + +; -- segment definition -- +; +%define SEG_TEXT _text align=32 public use32 class=CODE +%define SEG_CONST _data align=32 public use32 class=DATA + +%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ +; * Linux +; * *BSD family Unix using elf format +; * Unix System V, including Solaris x86, UnixWare and SCO Unix + +; mark stack as non-executable +section .note.GNU-stack noalloc noexec nowrite progbits + +; -- segment definition -- +; +%ifdef __x86_64__ +%define SEG_TEXT .text progbits align=32 +%define SEG_CONST .rodata progbits align=32 +%else +%define SEG_TEXT .text progbits alloc exec nowrite align=32 +%define SEG_CONST .rodata progbits alloc noexec nowrite align=32 +%endif + +; To make the code position-independent, append -DPIC to the commandline +; +%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ 
; ELF supports PIC +%define EXTN(name) name ; foo() -> foo + +%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- +; * Older Linux using a.out format (nasm -f aout -DAOUT ...) +; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...) + +; -- segment definition -- +; +%define SEG_TEXT .text +%define SEG_CONST .data + +; To make the code position-independent, append -DPIC to the commandline +; +%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC + +%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- +; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format) + +; -- segment definition -- +; +%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why? +%define SEG_CONST .rodata align=32 + +; The generation of position-independent code (PIC) is the default on Darwin. +; +%define PIC +%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing + +%else ; ----(Other case)---------------------- + +; -- segment definition -- +; +%define SEG_TEXT .text +%define SEG_CONST .data + +%endif ; ---------------------------------------------- + +; ========================================================================== + +; -------------------------------------------------------------------------- +; Common types +; +%ifdef __x86_64__ +%ifnidn __OUTPUT_FORMAT__, elfx32 +%define POINTER qword ; general pointer type +%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER) +%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%define resp resq +%define dp dq +%define raxp rax +%define rbxp rbx +%define rcxp rcx +%define rdxp rdx +%define rsip rsi +%define rdip rdi +%define rbpp rbp +%define rspp rsp +%define r8p r8 +%define r9p r9 +%define r10p r10 +%define r11p r11 +%define r12p r12 +%define r13p r13 +%define r14p r14 +%define r15p r15 +%endif +%endif +%ifndef raxp +%define POINTER dword ; general pointer type +%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) +%define POINTER_BIT DWORD_BIT ; 
sizeof(POINTER)*BYTE_BIT +%define resp resd +%define dp dd +; x86_64 ILP32 ABI (x32) +%define raxp eax +%define rbxp ebx +%define rcxp ecx +%define rdxp edx +%define rsip esi +%define rdip edi +%define rbpp ebp +%define rspp esp +%define r8p r8d +%define r9p r9d +%define r10p r10d +%define r11p r11d +%define r12p r12d +%define r13p r13d +%define r14p r14d +%define r15p r15d +%endif + +%define INT dword ; signed integer type +%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) +%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT + +%define FP32 dword ; IEEE754 single +%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) +%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT + +%define MMWORD qword ; int64 (MMX register) +%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) +%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT + +; NASM is buggy and doesn't properly handle operand sizes for SSE +; instructions, so for now we have to define XMMWORD as blank. +%define XMMWORD ; int128 (SSE register) +%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) +%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT + +%define YMMWORD ; int256 (AVX register) +%define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD) +%define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT + +; Similar hacks for when we load a dword or MMWORD into an xmm# register +%define XMM_DWORD +%define XMM_MMWORD + +%define SIZEOF_BYTE 1 ; sizeof(byte) +%define SIZEOF_WORD 2 ; sizeof(word) +%define SIZEOF_DWORD 4 ; sizeof(dword) +%define SIZEOF_QWORD 8 ; sizeof(qword) +%define SIZEOF_OWORD 16 ; sizeof(oword) +%define SIZEOF_YWORD 32 ; sizeof(yword) + +%define BYTE_BIT 8 ; CHAR_BIT in C +%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT +%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT +%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT +%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT +%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT + +; -------------------------------------------------------------------------- +; External Symbol Name +; 
+%ifndef EXTN +%define EXTN(name) _ %+ name ; foo() -> _foo +%endif + +; -------------------------------------------------------------------------- +; Hidden symbols +; +%ifdef ELF ; ----(nasm -felf[64] -DELF ...)-------- +%define GLOBAL_FUNCTION(name) global EXTN(name):function hidden +%define GLOBAL_DATA(name) global EXTN(name):data hidden +%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- +%ifdef __YASM_VER__ +%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern +%define GLOBAL_DATA(name) global EXTN(name):private_extern +%else +%if __NASM_VERSION_ID__ >= 0x020E0000 +%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern +%define GLOBAL_DATA(name) global EXTN(name):private_extern +%endif +%endif +%endif + +%ifndef GLOBAL_FUNCTION +%define GLOBAL_FUNCTION(name) global EXTN(name) +%endif +%ifndef GLOBAL_DATA +%define GLOBAL_DATA(name) global EXTN(name) +%endif + +; -------------------------------------------------------------------------- +; Macros for position-independent code (PIC) support +; +%ifndef GOT_SYMBOL +%undef PIC +%endif + +%ifdef PIC ; ------------------------------------------- + +%ifidn GOT_SYMBOL, _MACHO_PIC_ ; -------------------- + +; At present, nasm doesn't seem to support PIC generation for Mach-O. +; The PIC support code below is a little tricky. + + SECTION SEG_CONST +const_base: + +%define GOTOFF(got, sym) (got) + (sym) - const_base + +%imacro get_GOT 1 + ; NOTE: this macro destroys ecx resister. 
+ call %%geteip + add ecx, byte (%%ref - $) + jmp short %%adjust +%%geteip: + mov ecx, POINTER [esp] + ret +%%adjust: + push ebp + xor ebp, ebp ; ebp = 0 +%ifidni %1, ebx ; (%1 == ebx) + ; db 0x8D,0x9C + jmp near const_base = + ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) + db 0x8D, 0x9C ; 8D,9C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: +%else ; (%1 != ebx) + ; db 0x8D,0x8C + jmp near const_base = + ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) + db 0x8D, 0x8C ; 8D,8C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: + mov %1, ecx +%endif ; (%1 == ebx) + pop ebp +%endmacro + +%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- + +%define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff + +%imacro get_GOT 1 + extern GOT_SYMBOL + call %%geteip + add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc + jmp short %%done +%%geteip: + mov %1, POINTER [esp] + ret +%%done: +%endmacro + +%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- + +%imacro pushpic 1.nolist + push %1 +%endmacro +%imacro poppic 1.nolist + pop %1 +%endmacro +%imacro movpic 2.nolist + mov %1, %2 +%endmacro + +%else ; !PIC ----------------------------------------- + +%define GOTOFF(got, sym) (sym) + +%imacro get_GOT 1.nolist +%endmacro +%imacro pushpic 1.nolist +%endmacro +%imacro poppic 1.nolist +%endmacro +%imacro movpic 2.nolist +%endmacro + +%endif ; PIC ----------------------------------------- + +; -------------------------------------------------------------------------- +; Align the next instruction on {2,4,8,16,..}-byte boundary. 
+; ".balign n,,m" in GNU as +; +%define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16) +%define FILLB(b, n) (($$-(b)) & ((n)-1)) + +%imacro alignx 1-2.nolist 0xFFFF +%%bs: \ + times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \ + db 0x90 ; nop + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \ + db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \ + db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \ + db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \ + db 0x8D, 0x6C, 0x25, 0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \ + db 0x8D, 0x6D, 0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \ + db 0x8B, 0xED ; mov ebp,ebp + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \ + db 0x90 ; nop +%endmacro + +; Align the next data on {2,4,8,16,..}-byte boundary. 
+; +%imacro alignz 1.nolist + align %1, db 0 ; filling zeros +%endmacro + +%ifdef __x86_64__ + +%ifdef WIN64 + +%imacro collect_args 1 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm6 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm7 + mov r10, rcx +%if %1 > 1 + mov r11, rdx +%endif +%if %1 > 2 + push r12 + mov r12, r8 +%endif +%if %1 > 3 + push r13 + mov r13, r9 +%endif +%if %1 > 4 + push r14 + mov r14, [rax+48] +%endif +%if %1 > 5 + push r15 + mov r15, [rax+56] +%endif + push rsi + push rdi +%endmacro + +%imacro uncollect_args 1 + pop rdi + pop rsi +%if %1 > 5 + pop r15 +%endif +%if %1 > 4 + pop r14 +%endif +%if %1 > 3 + pop r13 +%endif +%if %1 > 2 + pop r12 +%endif + movaps xmm7, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + movaps xmm6, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD +%endmacro + +%imacro push_xmm 1 + sub rsp, %1 * SIZEOF_XMMWORD + movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8 +%if %1 > 1 + movaps XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9 +%endif +%if %1 > 2 + movaps XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10 +%endif +%if %1 > 3 + movaps XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11 +%endif +%endmacro + +%imacro pop_xmm 1 + movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD] +%if %1 > 1 + movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD] +%endif +%if %1 > 2 + movaps xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD] +%endif +%if %1 > 3 + movaps xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD] +%endif + add rsp, %1 * SIZEOF_XMMWORD +%endmacro + +%else + +%imacro collect_args 1 + push r10 + mov r10, rdi +%if %1 > 1 + push r11 + mov r11, rsi +%endif +%if %1 > 2 + push r12 + mov r12, rdx +%endif +%if %1 > 3 + push r13 + mov r13, rcx +%endif +%if %1 > 4 + push r14 + mov r14, r8 +%endif +%if %1 > 5 + push r15 + mov r15, r9 +%endif +%endmacro + +%imacro uncollect_args 1 +%if %1 > 5 + pop r15 +%endif +%if %1 > 4 + pop r14 +%endif +%if %1 > 3 + pop r13 +%endif +%if %1 > 2 + pop r12 +%endif +%if %1 > 1 + pop r11 +%endif + pop r10 +%endmacro + +%imacro push_xmm 1 +%endmacro + +%imacro pop_xmm 1 +%endmacro + 
+%endif + +%endif + +; -------------------------------------------------------------------------- +; Defines picked up from the C headers +; +%include "jsimdcfg.inc" + +; -------------------------------------------------------------------------- diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolext-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolext-altivec.c new file mode 100644 index 0000000000..170f90ff80 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolext-altivec.c @@ -0,0 +1,269 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2014, Jay Foad. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* This file is included by jccolor-altivec.c */ + + +void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + JSAMPROW inptr, outptr0, outptr1, outptr2; + int pitch = img_width * RGB_PIXELSIZE, num_cols; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 }, + rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr; +#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 + __vector unsigned char rgb3 = { 0 }; +#endif +#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 + __vector unsigned char rgb4 = { 0 }; +#endif + __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; + __vector unsigned short yl, yh, crl, crh, cbl, cbh; + __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; + + /* Constants */ + __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, + pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }, + pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) }, + pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) }; + __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }, + pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +#else + shift_pack_index = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; +#endif + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, + outptr0 += 16, outptr1 += 16, outptr2 += 16) { + +#if __BIG_ENDIAN__ + /* Load 16 pixels == 48 or 64 bytes */ + offset = (size_t)inptr & 15; + if 
(offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overread. Since there is no way to + * read a partial AltiVec register, overread would occur on the last + * chunk of the last image row if the right edge is not on a 16-byte + * boundary. It could also occur on other rows if the bytes per row + * is low enough. Since we can't determine whether we're on the last + * image row, we have to assume every row is the last. + */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (bytes > 16) + rgb1 = vec_ld(16, inptr); + if (bytes > 32) + rgb2 = vec_ld(32, inptr); + if (bytes > 48) + rgb3 = vec_ld(48, inptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + rgb4 = vec_ld(64, inptr); +#endif + unaligned_shift_index = vec_lvsl(0, inptr); + rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); + rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); + rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = VEC_LD(0, tmpbuf); + rgb1 = VEC_LD(16, tmpbuf); + rgb2 = VEC_LD(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = VEC_LD(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = VEC_LD(0, inptr); + if (num_cols > 16) + rgb1 = VEC_LD(16, inptr); + if (num_cols > 32) + rgb2 = VEC_LD(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = VEC_LD(48, inptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + +#if RGB_PIXELSIZE == 3 + /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 
G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); + rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); + rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); + rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); +#else + /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); + rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); + rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); + rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); +#endif + + /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 + * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 + * ... + * + * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. 
+ */ + rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); + bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); + rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); + bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); + rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); + bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); + rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); + bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); + + /* (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + */ + + /* Calculate Y values */ + + y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); + y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); + y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); + y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); + y0 = vec_msums(bg0, pw_f0114_f0250, y0); + y1 = vec_msums(bg1, pw_f0114_f0250, y1); + y2 = vec_msums(bg2, pw_f0114_f0250, y2); + y3 = vec_msums(bg3, pw_f0114_f0250, y3); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. 
+ */ + yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, + shift_pack_index); + yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, + shift_pack_index); + y = vec_pack(yl, yh); + vec_st(y, 0, outptr0); + + /* Calculate Cb values */ + cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj); + cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj); + cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj); + cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj); + cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000, + (__vector unsigned int)cb0); + cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000, + (__vector unsigned int)cb1); + cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000, + (__vector unsigned int)cb2); + cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000, + (__vector unsigned int)cb3); + cbl = vec_perm((__vector unsigned short)cb0, + (__vector unsigned short)cb1, shift_pack_index); + cbh = vec_perm((__vector unsigned short)cb2, + (__vector unsigned short)cb3, shift_pack_index); + cb = vec_pack(cbl, cbh); + vec_st(cb, 0, outptr1); + + /* Calculate Cr values */ + cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj); + cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj); + cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj); + cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj); + cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000, + (__vector unsigned int)cr0); + cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000, + (__vector unsigned int)cr1); + cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000, + (__vector unsigned int)cr2); + cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000, + (__vector unsigned int)cr3); + crl = vec_perm((__vector unsigned short)cr0, + (__vector unsigned short)cr1, shift_pack_index); + crh = vec_perm((__vector unsigned 
short)cr2, + (__vector unsigned short)cr3, shift_pack_index); + cr = vec_pack(crl, crh); + vec_st(cr, 0, outptr2); + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolor-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolor-altivec.c new file mode 100644 index 0000000000..d670dbcda3 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolor-altivec.c @@ -0,0 +1,116 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* RGB --> YCC CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_081 5329 /* FIX(0.08131) */ +#define F_0_114 7471 /* FIX(0.11400) */ +#define F_0_168 11059 /* FIX(0.16874) */ +#define F_0_250 16384 /* FIX(0.25000) */ +#define F_0_299 19595 /* FIX(0.29900) */ +#define F_0_331 21709 /* FIX(0.33126) */ +#define F_0_418 27439 /* FIX(0.41869) */ +#define F_0_500 32768 /* FIX(0.50000) */ +#define F_0_587 38470 /* FIX(0.58700) */ +#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + + +#define RGBG_INDEX0 \ + { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 } +#define RGBG_INDEX1 \ + { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 } +#define RGBG_INDEX2 \ + { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 } +#define RGBG_INDEX3 \ + { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 } +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGBG_INDEX \ + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 } +#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGBG_INDEX0 \ + { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 } +#define RGBG_INDEX1 \ + { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 } +#define RGBG_INDEX2 \ + { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 } +#define RGBG_INDEX3 \ + { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 } +#define 
jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGBG_INDEX \ + { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 } +#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGBG_INDEX \ + { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 } +#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGBG_INDEX \ + { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 } +#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgray-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgray-altivec.c new file mode 100644 index 0000000000..a11a7e7021 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgray-altivec.c @@ -0,0 +1,111 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. 
The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114  7471                 /* FIX(0.11400) */
+#define F_0_250  16384                /* FIX(0.25000) */
+#define F_0_299  19595                /* FIX(0.29900) */
+#define F_0_587  38470                /* FIX(0.58700) */
+#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000); the G weight is split because 38470 exceeds the signed 16-bit range of the __vector short constants that hold these weights in jcgryext-altivec.c */
+
+#define SCALEBITS  16  /* fractional bits of the FIX() constants above: F_0_250 == 16384 == 1 << 14 */
+#define ONE_HALF  (1 << (SCALEBITS - 1))  /* 0.5 at SCALEBITS precision; jcgryext-altivec.c seeds its Y accumulators with it */
+
+
+#define RGBG_INDEX0 \
+  { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
+#define RGBG_INDEX1 \
+  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+  { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+  { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
+#include "jcgryext-altivec.c"  /* first inclusion: defines jsimd_rgb_gray_convert_altivec() itself; this file has not defined RGB_PIXELSIZE yet, so whatever is in effect from the header applies */
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec  jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"  /* re-included once per pixel layout; the #define above renames the function it defines */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c"  /* the included file uses RGBG_INDEX0..3 when RGB_PIXELSIZE == 3 and the single RGBG_INDEX otherwise */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+  { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
+#define RGBG_INDEX1 \
+  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+  { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+  { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+  { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+  { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+  { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgryext-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgryext-altivec.c
new file mode 100644
index 0000000000..b280cbbded
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgryext-altivec.c
@@ -0,0 +1,228 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2014, Jay Foad. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.
In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-altivec.c, once per pixel layout, with RGB_PIXELSIZE, RGBG_INDEX* and the function name redefined before each inclusion */
+
+
+void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;  /* pitch = bytes per input row; num_cols counts the bytes still to convert in the current row */
+#if __BIG_ENDIAN__
+  int offset;
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];  /* 16-byte-aligned staging copy of one 16-pixel chunk, used by the slow paths below */
+#endif
+
+  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
+    rgbg0, rgbg1, rgbg2, rgbg3, y;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = { 0 };
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = { 0 };
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh;
+  __vector int y0, y1, y2, y3;
+
+  /* Constants */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },  /* not referenced by name below; presumably consumed by the VEC_UNPACK*U macros -- confirm in jsimd_altivec.h */
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+    shift_pack_index =
+      { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];  /* only plane 0 of output_buf is written */
+    output_row++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr += 16) {
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;  /* nonzero when inptr is not 16-byte aligned */
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread. Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary. It could also occur on other rows if the bytes per row
+           * is low enough. Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);  /* permute control that splices each adjacent vector pair below */
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (num_cols > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (num_cols > 32)
+            rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = vec_ld(48, inptr);
+#endif
+        }
+      }
+#else
+      /* Little endian */
+      rgb0 = vec_vsx_ld(0, inptr);
+      if (num_cols > 16)
+        rgb1 = vec_vsx_ld(16, inptr);
+      if (num_cols > 32)
+        rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+      if (num_cols > 48)
+        rgb3 = vec_vsx_ld(48, inptr);
+#endif
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);  /* first pass: R and G terms, accumulated on top of pd_onehalf */
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);  /* second pass: B term and the remaining 0.25 * G, accumulated onto the first pass */
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr);  /* NOTE(review): a whole vector is stored even when fewer than 16 pixels remain in the row -- presumably output rows are padded and 16-byte aligned; confirm with the caller */
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample-altivec.c
new file mode 100644
index 0000000000..6e25b8db90
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample-altivec.c
@@ -0,0 +1,159 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R.
Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_altivec.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
+                                   int max_v_samp_factor,
+                                   JDIMENSION v_samp_factor,
+                                   JDIMENSION width_in_blocks,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY output_data)
+{
+  int outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;  /* samples produced per output row */
+  JSAMPROW inptr, outptr;
+
+  __vector unsigned char this0, next0, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;
+
+  /* Constants */
+  __vector unsigned short pw_bias = { __4X2(0, 1) },  /* presumably 0,1 repeated across the lanes, i.e. an alternating rounding bias -- confirm __4X2 in jsimd_altivec.h */
+    pw_one = { __8X(1) };
+  __vector unsigned char even_odd_index =
+    { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },  /* vec_perm selector: even-numbered bytes first, then odd-numbered bytes */
+    pb_zero = { __16X(0) };  /* not referenced by name below; presumably consumed by the VEC_UNPACK*U macros -- confirm in jsimd_altivec.h */
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);  /* pad every input row out to output_cols * 2 samples */
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr += 32, outptr += 16) {  /* 32 input samples become 16 output samples per iteration */
+
+      this0 = vec_ld(0, inptr);
+      this0 = vec_perm(this0, this0, even_odd_index);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+      outl = vec_add(this0e, this0o);
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_one);  /* (even + odd + bias) >> 1 */
+
+      if (outcol > 8) {
+        next0 = vec_ld(16, inptr);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        outh = vec_add(next0e, next0o);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_one);
+      } else
+        outh = vec_splat_u16(0);  /* 8 or fewer output samples remain: skip the second load and zero the upper half */
+
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);
+    }
+  }
+}
+
+
+void
+jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
+                              JDIMENSION v_samp_factor,
+                              JDIMENSION width_in_blocks,
+                              JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;  /* samples produced per output row */
+  JSAMPROW inptr0, inptr1, outptr;
+
+  __vector unsigned char this0, next0, this1, next1, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
+    next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
+
+  /* Constants */
+  __vector unsigned short pw_bias = { __4X2(1, 2) },  /* presumably 1,2 repeated across the lanes -- confirm __4X2 in jsimd_altivec.h */
+    pw_two = { __8X(2) };
+  __vector unsigned char even_odd_index =
+    { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },  /* vec_perm selector: even-numbered bytes first, then odd-numbered bytes */
+    pb_zero = { __16X(0) };  /* not referenced by name below; presumably consumed by the VEC_UNPACK*U macros -- confirm in jsimd_altivec.h */
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);  /* pad every input row out to output_cols * 2 samples */
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];  /* each output row consumes two consecutive input rows */
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) {
+
+      this0 = vec_ld(0, inptr0);
+      this0 = vec_perm(this0, this0, even_odd_index);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+      out0l = vec_add(this0e, this0o);
+
+      this1 = vec_ld(0, inptr1);
+      this1 = vec_perm(this1, this1, even_odd_index);
+      this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+      this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
+      out1l = vec_add(this1e, this1o);
+
+      outl = vec_add(out0l, out1l);
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_two);  /* (sum of the four samples + bias) >> 2 */
+
+      if (outcol > 8) {
+        next0 = vec_ld(16, inptr0);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        out0h = vec_add(next0e, next0o);
+
+        next1 = vec_ld(16, inptr1);
+        next1 = vec_perm(next1, next1, even_odd_index);
+        next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+        next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
+        out1h = vec_add(next1e, next1o);
+
+        outh = vec_add(out0h, out1h);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_two);
+      } else
+        outh = vec_splat_u16(0);  /* 8 or fewer output samples remain: skip the second pair of loads and zero the upper half */
+
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample.h b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)  /* widens each of the first num_rows rows from input_cols to output_cols samples by repeating the row's last sample */
+{
+  register JSAMPROW ptr;
+  register JSAMPLE pixval;
+  register int count;
+  int row;
+  int numcols = (int)(output_cols - input_cols);  /* padding samples needed per row */
+
+  if (numcols > 0) {  /* nothing is written when the rows are already wide enough */
+    for (row = 0; row < num_rows; row++) {
+      ptr = image_data[row] + input_cols;  /* first sample past the real data */
+      pixval = ptr[-1];  /* last real sample; NOTE(review): this reads before the row start if input_cols == 0 -- presumably callers never pass 0, confirm */
+      for (count = numcols; count > 0; count--)
+        *ptr++ = pixval;
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolext-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolext-altivec.c
new file mode 100644
index 0000000000..68d52bd8a2
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolext-altivec.c
@@ -0,0 +1,276 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-altivec.c, once per pixel layout, with RGB_PIXELSIZE, RGB_INDEX* and the function name redefined before each inclusion */
+
+
+void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                                   JDIMENSION input_row, JSAMPARRAY output_buf,
+                                   int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int pitch = out_width * RGB_PIXELSIZE, num_cols;  /* pitch = bytes per output row; num_cols counts the bytes still to write in the current row */
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];  /* 16-byte-aligned staging area for one 16-pixel chunk, used by the slow-path stores below */
+
+  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
+    crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
+  __vector int g0, g1, g2, g3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+    pw_mf0228 = { __8X(-F_0_228 >> 1) },
+    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+    pw_cj = { __8X(CENTERJSAMPLE) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },  /* not referenced by name below; presumably consumed by the VEC_UNPACK*U macros -- confirm in jsimd_altivec.h */
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+    shift_pack_index =
+      { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];  /* plane 0 is loaded as Y, plane 1 as Cb, plane 2 as Cr below */
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
+         inptr0 += 16, inptr1 += 16, inptr2 += 16) {
+
+      y = vec_ld(0, inptr0);
+      /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      yl = (__vector signed short)VEC_UNPACKHU(y);
+      yh = (__vector signed short)VEC_UNPACKLU(y);
+
+      cb = vec_ld(0, inptr1);
+      cbl = (__vector signed short)VEC_UNPACKHU(cb);
+      cbh = (__vector signed short)VEC_UNPACKLU(cb);
+      cbl = vec_sub(cbl, pw_cj);  /* subtract CENTERJSAMPLE so chroma is centered on zero */
+      cbh = vec_sub(cbh, pw_cj);
+
+      cr = vec_ld(0, inptr2);
+      crl = (__vector signed short)VEC_UNPACKHU(cr);
+      crh = (__vector signed short)VEC_UNPACKLU(cr);
+      crl = vec_sub(crl, pw_cj);
+      crh = vec_sub(crh, pw_cj);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+      bl = vec_add(cbl, cbl);
+      bh = vec_add(cbh, cbh);
+      bl = vec_madds(bl, pw_mf0228, pw_one);
+      bh = vec_madds(bh, pw_mf0228, pw_one);
+      bl = vec_sra(bl, (__vector unsigned short)pw_one);
+      bh = vec_sra(bh, (__vector unsigned short)pw_one);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, yl);
+      bh = vec_add(bh, yh);
+
+      rl = vec_add(crl, crl);
+      rh = vec_add(crh, crh);
+      rl = vec_madds(rl, pw_f0402, pw_one);
+      rh = vec_madds(rh, pw_f0402, pw_one);
+      rl = vec_sra(rl, (__vector unsigned short)pw_one);
+      rh = vec_sra(rh, (__vector unsigned short)pw_one);
+      rl = vec_add(rl, crl);
+      rh = vec_add(rh, crh);
+      rl = vec_add(rl, yl);
+      rh = vec_add(rh, yh);
+
+      g0w = vec_mergeh(cbl, crl);
+      g1w = vec_mergel(cbl, crl);
+      g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
+      g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
+      g2w = vec_mergeh(cbh, crh);
+      g3w = vec_mergel(cbh, crh);
+      g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
+      g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
+      /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
+      gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
+      gl = vec_sub(gl, crl);
+      gh = vec_sub(gh, crh);
+      gl = vec_add(gl, yl);
+      gh = vec_add(gh, yh);
+
+      rg0 = vec_mergeh(rl, gl);
+      bx0 = vec_mergeh(bl, pw_255);  /* the X (filler) lane is paired with B and set to 255 */
+      rg1 = vec_mergel(rl, gl);
+      bx1 = vec_mergel(bl, pw_255);
+      rg2 = vec_mergeh(rh, gh);
+      bx2 = vec_mergeh(bh, pw_255);
+      rg3 = vec_mergel(rh, gh);
+      bx3 = vec_mergel(bh, pw_255);
+
+      rgbx0 = vec_packsu(rg0, bx0);  /* narrow the 16-bit lanes to unsigned bytes */
+      rgbx1 = vec_packsu(rg1, bx1);
+      rgbx2 = vec_packsu(rg2, bx2);
+      rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+      offset = (size_t)outptr & 15;  /* nonzero when outptr is not 16-byte aligned */
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overwrite. Since there is no way to
+           * write a partial AltiVec register, overwrite would occur on the
+           * last chunk of the last image row if the right edge is not on a
+           * 16-byte boundary. It could also occur on other rows if the bytes
+           * per row is low enough. Since we can't determine whether we're on
+           * the last image row, we have to assume every row is the last.
+           */
+          vec_st(rgb0, 0, tmpbuf);
+          vec_st(rgb1, 16, tmpbuf);
+          vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          vec_st(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));  /* copy out only the bytes that belong to this row */
+        } else {
+          /* Fast path */
+          unaligned_shift_index = vec_lvsl(0, outptr);
+          edgel = vec_ld(0, outptr);  /* edgel/edgeh read what is already in the destination; presumably so the partial first and last vectors keep the neighbouring bytes -- confirm */
+          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+          unaligned_shift_index = vec_lvsr(0, outptr);
+          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+          vec_st(out0, 0, outptr);
+          if (bytes > 16)
+            vec_st(out1, 16, outptr);
+          if (bytes > 32)
+            vec_st(out2, 32, outptr);
+          if (bytes > 48)
+            vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            vec_st(out4, 64, outptr);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          VEC_ST(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));  /* copy out only the bytes that belong to this row */
+        } else {
+          /* Fast path */
+          VEC_ST(rgb0, 0, outptr);
+          if (num_cols > 16)
+            VEC_ST(rgb1, 16, outptr);
+          if (num_cols > 32)
+            VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            VEC_ST(rgb3, 48, outptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolor-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolor-altivec.c
new file mode 100644
index 0000000000..eb35b67176
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolor-altivec.c
@@ -0,0 +1,106 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344  22554              /* FIX(0.34414) */
+#define F_0_714  46802              /* FIX(0.71414) */
+#define F_1_402  91881              /* FIX(1.40200) */
+#define F_1_772  116130             /* FIX(1.77200) */
+#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS  16  /* fractional bits of the FIX() constants above: FIX(1) == 65536 == 1 << 16 */
+#define ONE_HALF  (1 << (SCALEBITS - 1))  /* 0.5 at SCALEBITS precision; jdcolext-altivec.c seeds its G accumulators with it */
+
+#define RGB_INDEX0 \
+  { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+  { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+  { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdcolext-altivec.c"  /* first inclusion: defines jsimd_ycc_rgb_convert_altivec() itself; this file has not defined RGB_PIXELSIZE yet, so whatever is in effect from the header applies */
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c"  /* re-included once per pixel layout; the #define above renames the function it defines */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+  { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c"  /* the included file uses RGB_INDEX0..2 when RGB_PIXELSIZE == 3 and the single RGB_INDEX otherwise */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+  { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+  { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+  { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+  { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+  { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+  { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmerge-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmerge-altivec.c
new file mode 100644
index 0000000000..79c577f141
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmerge-altivec.c
@@ -0,0 +1,130 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3.
This notice may not be removed or altered from any source distribution. + */ + +/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */ + +#include "jsimd_altivec.h" + + +#define F_0_344 22554 /* FIX(0.34414) */ +#define F_0_714 46802 /* FIX(0.71414) */ +#define F_1_402 91881 /* FIX(1.40200) */ +#define F_1_772 116130 /* FIX(1.77200) */ +#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + +#define RGB_INDEX0 \ + { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 } +#define RGB_INDEX1 \ + { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 } +#define RGB_INDEX2 \ + { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 } +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extrgb_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extrgb_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGB_INDEX \ + { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extrgbx_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extrgbx_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGB_INDEX0 \ + { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 } +#define RGB_INDEX1 \ + { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 } 
+#define RGB_INDEX2 \ + { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extbgr_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extbgr_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGB_INDEX \ + { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extbgrx_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extbgrx_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGB_INDEX \ + { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extxbgr_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extxbgr_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGB_INDEX \ + { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extxrgb_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extxrgb_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmrgext-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmrgext-altivec.c new file mode 
100644 index 0000000000..40f02c33ea --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmrgext-altivec.c @@ -0,0 +1,329 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* This file is included by jdmerge-altivec.c */ + + +void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3, + y, cb, cr; +#if __BIG_ENDIAN__ + __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3; +#if RGB_PIXELSIZE == 4 + __vector unsigned char out4; +#endif +#endif +#if RGB_PIXELSIZE == 4 + __vector unsigned char rgb3; +#endif + __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh, + crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, + rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo; + __vector int g_y0, g_y1, g_y2, g_y3; + + /* Constants + * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 + * high-order bits, not 16. 
+ */ + __vector short pw_f0402 = { __8X(F_0_402 >> 1) }, + pw_mf0228 = { __8X(-F_0_228 >> 1) }, + pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, + pw_one = { __8X(1) }, pw_255 = { __8X(255) }, + pw_cj = { __8X(CENTERJSAMPLE) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, + even_index = + { 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30 }, + odd_index = + { 0, 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31 }; +#else + shift_pack_index = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, + even_index = + { 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30, 0 }, + odd_index = + { 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31, 0 }; +#endif + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + + for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) { + + cb = vec_ld(0, inptr1); + /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. 
+ */ + cbl = (__vector signed short)VEC_UNPACKHU(cb); + cbh = (__vector signed short)VEC_UNPACKLU(cb); + cbl = vec_sub(cbl, pw_cj); + cbh = vec_sub(cbh, pw_cj); + + cr = vec_ld(0, inptr2); + crl = (__vector signed short)VEC_UNPACKHU(cr); + crh = (__vector signed short)VEC_UNPACKLU(cr); + crl = vec_sub(crl, pw_cj); + crh = vec_sub(crh, pw_cj); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + b_yl = vec_add(cbl, cbl); + b_yh = vec_add(cbh, cbh); + b_yl = vec_madds(b_yl, pw_mf0228, pw_one); + b_yh = vec_madds(b_yh, pw_mf0228, pw_one); + b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one); + b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one); + b_yl = vec_add(b_yl, cbl); + b_yh = vec_add(b_yh, cbh); + b_yl = vec_add(b_yl, cbl); + b_yh = vec_add(b_yh, cbh); + + r_yl = vec_add(crl, crl); + r_yh = vec_add(crh, crh); + r_yl = vec_madds(r_yl, pw_f0402, pw_one); + r_yh = vec_madds(r_yh, pw_f0402, pw_one); + r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one); + r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one); + r_yl = vec_add(r_yl, crl); + r_yh = vec_add(r_yh, crh); + + g_y0w = vec_mergeh(cbl, crl); + g_y1w = vec_mergel(cbl, crl); + g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf); + g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf); + g_y2w = vec_mergeh(cbh, crh); + g_y3w = vec_mergel(cbh, crh); + g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf); + g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. 
+ */ + g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1, + shift_pack_index); + g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3, + shift_pack_index); + g_yl = vec_sub(g_yl, crl); + g_yh = vec_sub(g_yh, crh); + + for (yloop = 0; yloop < 2 && num_cols > 0; yloop++, + num_cols -= RGB_PIXELSIZE * 16, + outptr += RGB_PIXELSIZE * 16, inptr0 += 16) { + + y = vec_ld(0, inptr0); + ye = (__vector signed short)vec_perm(pb_zero, y, even_index); + yo = (__vector signed short)vec_perm(pb_zero, y, odd_index); + + if (yloop == 0) { + be = vec_add(b_yl, ye); + bo = vec_add(b_yl, yo); + re = vec_add(r_yl, ye); + ro = vec_add(r_yl, yo); + ge = vec_add(g_yl, ye); + go = vec_add(g_yl, yo); + } else { + be = vec_add(b_yh, ye); + bo = vec_add(b_yh, yo); + re = vec_add(r_yh, ye); + ro = vec_add(r_yh, yo); + ge = vec_add(g_yh, ye); + go = vec_add(g_yh, yo); + } + + rl = vec_mergeh(re, ro); + rh = vec_mergel(re, ro); + gl = vec_mergeh(ge, go); + gh = vec_mergel(ge, go); + bl = vec_mergeh(be, bo); + bh = vec_mergel(be, bo); + + rg0 = vec_mergeh(rl, gl); + bx0 = vec_mergeh(bl, pw_255); + rg1 = vec_mergel(rl, gl); + bx1 = vec_mergel(bl, pw_255); + rg2 = vec_mergeh(rh, gh); + bx2 = vec_mergeh(bh, pw_255); + rg3 = vec_mergel(rh, gh); + bx3 = vec_mergel(bh, pw_255); + + rgbx0 = vec_packsu(rg0, bx0); + rgbx1 = vec_packsu(rg1, bx1); + rgbx2 = vec_packsu(rg2, bx2); + rgbx3 = vec_packsu(rg3, bx3); + +#if RGB_PIXELSIZE == 3 + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + */ + rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0); + rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1); + 
rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2); +#else + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + */ + rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX); + rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX); + rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX); + rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX); +#endif + +#if __BIG_ENDIAN__ + offset = (size_t)outptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overwrite. Since there is no way to + * write a partial AltiVec register, overwrite would occur on the + * last chunk of the last image row if the right edge is not on a + * 16-byte boundary. It could also occur on other rows if the bytes + * per row is low enough. Since we can't determine whether we're on + * the last image row, we have to assume every row is the last. 
+ */ + vec_st(rgb0, 0, tmpbuf); + vec_st(rgb1, 16, tmpbuf); + vec_st(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + vec_st(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + unaligned_shift_index = vec_lvsl(0, outptr); + edgel = vec_ld(0, outptr); + edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); + edges = vec_perm(edgeh, edgel, unaligned_shift_index); + unaligned_shift_index = vec_lvsr(0, outptr); + out0 = vec_perm(edges, rgb0, unaligned_shift_index); + out1 = vec_perm(rgb0, rgb1, unaligned_shift_index); + out2 = vec_perm(rgb1, rgb2, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + out3 = vec_perm(rgb2, rgb3, unaligned_shift_index); + out4 = vec_perm(rgb3, edges, unaligned_shift_index); +#else + out3 = vec_perm(rgb2, edges, unaligned_shift_index); +#endif + vec_st(out0, 0, outptr); + if (bytes > 16) + vec_st(out1, 16, outptr); + if (bytes > 32) + vec_st(out2, 32, outptr); + if (bytes > 48) + vec_st(out3, 48, outptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + vec_st(out4, 64, outptr); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + VEC_ST(rgb0, 0, tmpbuf); + VEC_ST(rgb1, 16, tmpbuf); + VEC_ST(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + VEC_ST(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + VEC_ST(rgb0, 0, outptr); + if (num_cols > 16) + VEC_ST(rgb1, 16, outptr); + if (num_cols > 32) + VEC_ST(rgb2, 32, outptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + VEC_ST(rgb3, 48, outptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + } + } +} + + +void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW inptr, outptr; + + inptr = input_buf[0][in_row_group_ctr]; + outptr = output_buf[0]; + + input_buf[0][in_row_group_ctr] = 
input_buf[0][in_row_group_ctr * 2]; + jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1]; + output_buf[0] = output_buf[1]; + jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = inptr; + output_buf[0] = outptr; +} diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdsample-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdsample-altivec.c new file mode 100644 index 0000000000..04df0cf108 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdsample-altivec.c @@ -0,0 +1,400 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* CHROMA UPSAMPLING */ + +#include "jsimd_altivec.h" + + +void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow, incol; + + __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0, + out; + __vector short this0e, this0o, this0l, this0h, last0l, last0h, + next0l, next0h, outle, outhe, outlo, outho; + + /* Constants */ + __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) }, + last_index_col0 = + { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + last_index = + { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 }, + next_index = + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }, + next_index_lastcol = + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 }, +#if __BIG_ENDIAN__ + merge_pack_index = + { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; +#else + merge_pack_index = + { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 }; +#endif + __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) }; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + + if (downsampled_width & 15) + inptr[downsampled_width] = inptr[downsampled_width - 1]; + + this0 = vec_ld(0, inptr); + p_last0 = vec_perm(this0, this0, last_index_col0); + last0 = this0; + + for (incol = downsampled_width; incol > 0; + incol -= 16, inptr += 16, outptr += 32) { + + if (downsampled_width - incol > 0) { + p_last0 = vec_perm(last0, this0, last_index); + last0 = this0; + } + + if (incol <= 16) + p_next0 = vec_perm(this0, this0, next_index_lastcol); + else { + next0 = vec_ld(16, inptr); + p_next0 = vec_perm(this0, next0, next_index); + } + + this0e = (__vector short)vec_mule(this0, pb_three); + this0o = (__vector short)vec_mulo(this0, pb_three); + this0l = 
vec_mergeh(this0e, this0o); + this0h = vec_mergel(this0e, this0o); + + last0l = (__vector short)VEC_UNPACKHU(p_last0); + last0h = (__vector short)VEC_UNPACKLU(p_last0); + last0l = vec_add(last0l, pw_one); + + next0l = (__vector short)VEC_UNPACKHU(p_next0); + next0h = (__vector short)VEC_UNPACKLU(p_next0); + next0l = vec_add(next0l, pw_two); + + outle = vec_add(this0l, last0l); + outlo = vec_add(this0l, next0l); + outle = vec_sr(outle, (__vector unsigned short)pw_two); + outlo = vec_sr(outlo, (__vector unsigned short)pw_two); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr); + + if (incol > 8) { + last0h = vec_add(last0h, pw_one); + next0h = vec_add(next0h, pw_two); + + outhe = vec_add(this0h, last0h); + outho = vec_add(this0h, next0h); + outhe = vec_sr(outhe, (__vector unsigned short)pw_two); + outho = vec_sr(outho, (__vector unsigned short)pw_two); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr); + } + + this0 = next0; + } + } +} + + +void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; + int inrow, outrow, incol; + + __vector unsigned char this_1, this0, this1, out; + __vector short this_1l, this_1h, this0l, this0h, this1l, this1h, + lastcolsum_1h, lastcolsum1h, + p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, + thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, + nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 }, + nextcolsum1l = { 0 }, nextcolsum1h = { 0 }, + p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h, + tmpl, tmph, outle, outhe, outlo, outho; + + /* Constants */ + __vector unsigned char pb_zero = { __16X(0) }, + last_index_col0 = + { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13 }, + last_index = + { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 }, + next_index = + { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 }, + next_index_lastcol = + { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 }, +#if __BIG_ENDIAN__ + merge_pack_index = + { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; +#else + merge_pack_index = + { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 }; +#endif + __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) }, + pw_seven = { __8X(7) }, pw_eight = { __8X(8) }; + __vector unsigned short pw_four = { __8X(4) }; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr_1 = input_data[inrow - 1]; + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + if (downsampled_width & 15) { + inptr_1[downsampled_width] = inptr_1[downsampled_width - 1]; + inptr0[downsampled_width] = inptr0[downsampled_width - 1]; + inptr1[downsampled_width] = inptr1[downsampled_width - 1]; + } + + this0 = vec_ld(0, inptr0); + this0l = (__vector short)VEC_UNPACKHU(this0); + this0h = (__vector short)VEC_UNPACKLU(this0); + this0l = vec_mladd(this0l, pw_three, pw_zero); + this0h = vec_mladd(this0h, pw_three, pw_zero); + + this_1 = vec_ld(0, inptr_1); + this_1l = (__vector short)VEC_UNPACKHU(this_1); + this_1h = (__vector short)VEC_UNPACKLU(this_1); + thiscolsum_1l = vec_add(this0l, this_1l); + thiscolsum_1h = vec_add(this0h, this_1h); + lastcolsum_1h = thiscolsum_1h; + p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0); + p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); + + this1 = vec_ld(0, inptr1); + this1l = (__vector short)VEC_UNPACKHU(this1); + this1h = (__vector short)VEC_UNPACKLU(this1); + thiscolsum1l = vec_add(this0l, this1l); + thiscolsum1h = vec_add(this0h, this1h); + lastcolsum1h = thiscolsum1h; + p_lastcolsum1l = 
vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0); + p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); + + for (incol = downsampled_width; incol > 0; + incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16, + outptr0 += 32, outptr1 += 32) { + + if (downsampled_width - incol > 0) { + p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index); + p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); + p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index); + p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); + lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h; + } + + if (incol <= 16) { + p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); + p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h, + next_index_lastcol); + p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); + p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h, + next_index_lastcol); + } else { + this0 = vec_ld(16, inptr0); + this0l = (__vector short)VEC_UNPACKHU(this0); + this0h = (__vector short)VEC_UNPACKLU(this0); + this0l = vec_mladd(this0l, pw_three, pw_zero); + this0h = vec_mladd(this0h, pw_three, pw_zero); + + this_1 = vec_ld(16, inptr_1); + this_1l = (__vector short)VEC_UNPACKHU(this_1); + this_1h = (__vector short)VEC_UNPACKLU(this_1); + nextcolsum_1l = vec_add(this0l, this_1l); + nextcolsum_1h = vec_add(this0h, this_1h); + p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); + p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index); + + this1 = vec_ld(16, inptr1); + this1l = (__vector short)VEC_UNPACKHU(this1); + this1h = (__vector short)VEC_UNPACKLU(this1); + nextcolsum1l = vec_add(this0l, this1l); + nextcolsum1h = vec_add(this0h, this1h); + p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); + p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index); + } + + /* Process the upper row */ + + tmpl = vec_mladd(thiscolsum_1l, 
pw_three, pw_zero); + outle = vec_add(tmpl, p_lastcolsum_1l); + outle = vec_add(outle, pw_eight); + outle = vec_sr(outle, pw_four); + + outlo = vec_add(tmpl, p_nextcolsum_1l); + outlo = vec_add(outlo, pw_seven); + outlo = vec_sr(outlo, pw_four); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr0); + + if (incol > 8) { + tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero); + outhe = vec_add(tmph, p_lastcolsum_1h); + outhe = vec_add(outhe, pw_eight); + outhe = vec_sr(outhe, pw_four); + + outho = vec_add(tmph, p_nextcolsum_1h); + outho = vec_add(outho, pw_seven); + outho = vec_sr(outho, pw_four); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr0); + } + + /* Process the lower row */ + + tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero); + outle = vec_add(tmpl, p_lastcolsum1l); + outle = vec_add(outle, pw_eight); + outle = vec_sr(outle, pw_four); + + outlo = vec_add(tmpl, p_nextcolsum1l); + outlo = vec_add(outlo, pw_seven); + outlo = vec_sr(outlo, pw_four); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr1); + + if (incol > 8) { + tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero); + outhe = vec_add(tmph, p_lastcolsum1h); + outhe = vec_add(outhe, pw_eight); + outhe = vec_sr(outhe, pw_four); + + outho = vec_add(tmph, p_nextcolsum1h); + outho = vec_add(outho, pw_seven); + outho = vec_sr(outho, pw_four); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr1); + } + + thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; + thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; + } + } +} + + +/* These are rarely used (mainly just for decompressing YCCK images) */ + +void jsimd_h2v1_upsample_altivec(int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY 
input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr); + vec_st(inh, 16, outptr); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr); + vec_st(inh, 48, outptr); + } + } + } +} + + +void jsimd_h2v2_upsample_altivec(int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr0, outptr1; + int inrow, outrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr = input_data[inrow]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr0); + vec_st(inl, 0, outptr1); + + vec_st(inh, 16, outptr0); + vec_st(inh, 16, outptr1); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr0); + vec_st(inl, 32, outptr1); + + vec_st(inh, 48, outptr0); + vec_st(inh, 48, outptr1); + } + } + } +} diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctfst-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctfst-altivec.c new file mode 100644 index 0000000000..ad9af81e0c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctfst-altivec.c @@ -0,0 
+1,154 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* FAST INTEGER FORWARD DCT + * + * This is similar to the SSE2 implementation, except that we left-shift the + * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because + * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: + * the elements in arg3 + the most significant 17 bits of + * (the elements in arg1 * the elements in arg2). 
+ */ + +#include "jsimd_altivec.h" + + +#define F_0_382 98 /* FIX(0.382683433) */ +#define F_0_541 139 /* FIX(0.541196100) */ +#define F_0_707 181 /* FIX(0.707106781) */ +#define F_1_306 334 /* FIX(1.306562965) */ + +#define CONST_BITS 8 +#define PRE_MULTIPLY_SCALE_BITS 2 +#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) + + +#define DO_FDCT() { \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out4 = vec_sub(tmp10, tmp11); \ + \ + z1 = vec_add(tmp12, tmp13); \ + z1 = vec_sl(z1, pre_multiply_scale_bits); \ + z1 = vec_madds(z1, pw_0707, pw_zero); \ + \ + out2 = vec_add(tmp13, z1); \ + out6 = vec_sub(tmp13, z1); \ + \ + /* Odd part */ \ + \ + tmp10 = vec_add(tmp4, tmp5); \ + tmp11 = vec_add(tmp5, tmp6); \ + tmp12 = vec_add(tmp6, tmp7); \ + \ + tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \ + tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ + z5 = vec_sub(tmp10, tmp12); \ + z5 = vec_madds(z5, pw_0382, pw_zero); \ + \ + z2 = vec_madds(tmp10, pw_0541, z5); \ + z4 = vec_madds(tmp12, pw_1306, z5); \ + \ + tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ + z3 = vec_madds(tmp11, pw_0707, pw_zero); \ + \ + z11 = vec_add(tmp7, z3); \ + z13 = vec_sub(tmp7, z3); \ + \ + out5 = vec_add(z13, z2); \ + out3 = vec_sub(z13, z2); \ + out1 = vec_add(z11, z4); \ + out7 = vec_sub(z11, z4); \ +} + + +void jsimd_fdct_ifast_altivec(DCTELEM *data) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + z1, z2, z3, z4, z5, z11, z13, + out0, out1, out2, out3, out4, out5, out6, out7; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_0382 = { __8X(F_0_382 << CONST_SHIFT) }, + pw_0541 = { __8X(F_0_541 << CONST_SHIFT) }, + pw_0707 = { __8X(F_0_707 << CONST_SHIFT) }, + 
pw_1306 = { __8X(F_1_306 << CONST_SHIFT) }; + __vector unsigned short + pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }; + + /* Pass 1: process rows */ + + row0 = vec_ld(0, data); + row1 = vec_ld(16, data); + row2 = vec_ld(32, data); + row3 = vec_ld(48, data); + row4 = vec_ld(64, data); + row5 = vec_ld(80, data); + row6 = vec_ld(96, data); + row7 = vec_ld(112, data); + + TRANSPOSE(row, col); + + tmp0 = vec_add(col0, col7); + tmp7 = vec_sub(col0, col7); + tmp1 = vec_add(col1, col6); + tmp6 = vec_sub(col1, col6); + tmp2 = vec_add(col2, col5); + tmp5 = vec_sub(col2, col5); + tmp3 = vec_add(col3, col4); + tmp4 = vec_sub(col3, col4); + + DO_FDCT(); + + /* Pass 2: process columns */ + + TRANSPOSE(out, row); + + tmp0 = vec_add(row0, row7); + tmp7 = vec_sub(row0, row7); + tmp1 = vec_add(row1, row6); + tmp6 = vec_sub(row1, row6); + tmp2 = vec_add(row2, row5); + tmp5 = vec_sub(row2, row5); + tmp3 = vec_add(row3, row4); + tmp4 = vec_sub(row3, row4); + + DO_FDCT(); + + vec_st(out0, 0, data); + vec_st(out1, 16, data); + vec_st(out2, 32, data); + vec_st(out3, 48, data); + vec_st(out4, 64, data); + vec_st(out5, 80, data); + vec_st(out6, 96, data); + vec_st(out7, 112, data); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctint-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctint-altivec.c new file mode 100644 index 0000000000..3d4f017103 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctint-altivec.c @@ -0,0 +1,258 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. 
The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* ACCURATE INTEGER FORWARD DCT */ + +#include "jsimd_altivec.h" + +/* Fixed-point multipliers: the constant named in each FIX() comment times 2^CONST_BITS, rounded. */ +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +/* Shared by both passes: derives out2/out6 from tmp12/tmp13 and out1/out3/out5/out7 from tmp4..tmp7, adding pd_descale_p<PASS> and shifting right by descale_p<PASS>. The caller must compute tmp12/tmp13 first. */ +#define DO_FDCT_COMMON(PASS) { \ + /* (Original) \ + * z1 = (tmp12 + tmp13) * 0.541196100; \ + * data2 = z1 + tmp13 * 0.765366865; \ + * data6 = z1 + tmp12 * -1.847759065; \ + * \ + * (This implementation) \ + * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ + * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ + */ \ + \ + tmp1312l = vec_mergeh(tmp13, tmp12); \ + tmp1312h = vec_mergel(tmp13, tmp12); \ + \ + out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \ + out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \ + out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \ + out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \ + \ + out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \ + out6l = vec_sra(out6l, descale_p##PASS); \ + out6h = vec_sra(out6h, descale_p##PASS); \ + \ + out2 = vec_pack(out2l, out2h); \ + out6 = vec_pack(out6l, out6h); \ + \ + /* Odd part */ \ + \ + z3 = vec_add(tmp4, tmp6); \ + z4 = vec_add(tmp5, tmp7); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = vec_mergeh(z3, z4); \ + z34h = vec_mergel(z3, z4); \ + \ + z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \ + z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \ + z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \ + z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \ + \ + /* (Original) \ + * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ + * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ + * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ + * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ + * \ + * (This implementation) \ + * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ + * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ + * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ + * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ + * data7 = tmp4 + z3; data5 = tmp5 + z4; \ + * data3 = tmp6 + z3; data1 = tmp7 + z4; \ + */ \ + \ + tmp47l = vec_mergeh(tmp4, tmp7); \ + tmp47h = vec_mergel(tmp4, tmp7); \ + \ + out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \ + out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \ + out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \ + out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \ + \ + out7l = vec_sra(out7l,
descale_p##PASS); \ + out7h = vec_sra(out7h, descale_p##PASS); \ + out1l = vec_sra(out1l, descale_p##PASS); \ + out1h = vec_sra(out1h, descale_p##PASS); \ + \ + out7 = vec_pack(out7l, out7h); \ + out1 = vec_pack(out1l, out1h); \ + \ + tmp56l = vec_mergeh(tmp5, tmp6); \ + tmp56h = vec_mergel(tmp5, tmp6); \ + \ + out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \ + out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \ + out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \ + out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \ + \ + out5l = vec_sra(out5l, descale_p##PASS); \ + out5h = vec_sra(out5h, descale_p##PASS); \ + out3l = vec_sra(out3l, descale_p##PASS); \ + out3h = vec_sra(out3h, descale_p##PASS); \ + \ + out5 = vec_pack(out5l, out5h); \ + out3 = vec_pack(out3l, out3h); \ +} +/* Pass 1: forms tmp10..tmp13, emits out0/out4 shifted left by PASS1_BITS, then runs DO_FDCT_COMMON(1). */ +#define DO_FDCT_PASS1() { \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out0 = vec_sl(out0, pass1_bits); \ + out4 = vec_sub(tmp10, tmp11); \ + out4 = vec_sl(out4, pass1_bits); \ + \ + DO_FDCT_COMMON(1); \ +} +/* Pass 2: same even part, but out0/out4 get pw_descale_p2x added and are shifted right by PASS1_BITS before DO_FDCT_COMMON(2). */ +#define DO_FDCT_PASS2() { \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out0 = vec_add(out0, pw_descale_p2x); \ + out0 = vec_sra(out0, pass1_bits); \ + out4 = vec_sub(tmp10, tmp11); \ + out4 = vec_add(out4, pw_descale_p2x); \ + out4 = vec_sra(out4, pass1_bits); \ + \ + DO_FDCT_COMMON(2); \ +} + +/* Forward DCT performed in place on the eight 16-byte vectors at data: load, TRANSPOSE, pass 1, TRANSPOSE, pass 2, store. TRANSPOSE and the __4X/__4X2/__8X initializer helpers are not defined in this file (presumably jsimd_altivec.h provides them). */ +void jsimd_fdct_islow_altivec(DCTELEM *data) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, + z3, z4, z34l, z34h, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector int z3l, z3h, z4l, z4h, +
out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, + out7l, out7h; + + /* Constants */ + __vector short + pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, + pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, + pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, + pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, + pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, + pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, + pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, + pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }, + pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) }; + __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; + __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, + pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; + __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, + descale_p2 = { __4X(DESCALE_P2) }; + + /* Pass 1: process rows */ + + row0 = vec_ld(0, data); + row1 = vec_ld(16, data); + row2 = vec_ld(32, data); + row3 = vec_ld(48, data); + row4 = vec_ld(64, data); + row5 = vec_ld(80, data); + row6 = vec_ld(96, data); + row7 = vec_ld(112, data); + + TRANSPOSE(row, col); + + tmp0 = vec_add(col0, col7); + tmp7 = vec_sub(col0, col7); + tmp1 = vec_add(col1, col6); + tmp6 = vec_sub(col1, col6); + tmp2 = vec_add(col2, col5); + tmp5 = vec_sub(col2, col5); + tmp3 = vec_add(col3, col4); + tmp4 = vec_sub(col3, col4); + + DO_FDCT_PASS1(); + + /* Pass 2: process columns */ + + TRANSPOSE(out, row); + + tmp0 = vec_add(row0, row7); + tmp7 = vec_sub(row0, row7); + tmp1 = vec_add(row1, row6); + tmp6 = vec_sub(row1, row6); + tmp2 = vec_add(row2, row5); + tmp5 = vec_sub(row2, row5); + tmp3 = vec_add(row3, row4); + tmp4 = vec_sub(row3, row4); + + DO_FDCT_PASS2(); + + vec_st(out0, 0, data); + vec_st(out1, 16, data); + vec_st(out2, 32, data); + vec_st(out3, 48, data); + vec_st(out4, 64, data); + vec_st(out5, 80, data); + vec_st(out6, 96, data); + vec_st(out7, 112, data); +} diff --git
a/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctfst-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctfst-altivec.c new file mode 100644 index 0000000000..456c6c6174 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctfst-altivec.c @@ -0,0 +1,255 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* FAST INTEGER INVERSE DCT + * + * This is similar to the SSE2 implementation, except that we left-shift the + * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because + * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: + * the elements in arg3 + the most significant 17 bits of + * (the elements in arg1 * the elements in arg2).
+ */ + +#include "jsimd_altivec.h" + +/* Multipliers scaled by 2^CONST_BITS (CONST_BITS = 8), rounded; they are shifted left by CONST_SHIFT when the vector constants are built. */ +#define F_1_082 277 /* FIX(1.082392200) */ +#define F_1_414 362 /* FIX(1.414213562) */ +#define F_1_847 473 /* FIX(1.847759065) */ +#define F_2_613 669 /* FIX(2.613125930) */ +#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ + +#define CONST_BITS 8 +#define PASS1_BITS 2 +#define PRE_MULTIPLY_SCALE_BITS 2 +#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) + +/* One 1-D IDCT pass over in##0..in##7, leaving results in out0..out7. Uses the locals and pw_* constants declared in jsimd_idct_ifast_altivec. */ +#define DO_IDCT(in) { \ + /* Even part */ \ + \ + tmp10 = vec_add(in##0, in##4); \ + tmp11 = vec_sub(in##0, in##4); \ + tmp13 = vec_add(in##2, in##6); \ + \ + tmp12 = vec_sub(in##2, in##6); \ + tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ + tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \ + tmp12 = vec_sub(tmp12, tmp13); \ + \ + tmp0 = vec_add(tmp10, tmp13); \ + tmp3 = vec_sub(tmp10, tmp13); \ + tmp1 = vec_add(tmp11, tmp12); \ + tmp2 = vec_sub(tmp11, tmp12); \ + \ + /* Odd part */ \ + \ + z13 = vec_add(in##5, in##3); \ + z10 = vec_sub(in##5, in##3); \ + z10s = vec_sl(z10, pre_multiply_scale_bits); \ + z11 = vec_add(in##1, in##7); \ + z12s = vec_sub(in##1, in##7); \ + z12s = vec_sl(z12s, pre_multiply_scale_bits); \ + \ + tmp11 = vec_sub(z11, z13); \ + tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ + tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \ + \ + tmp7 = vec_add(z11, z13); \ + \ + /* To avoid overflow...
\ + * \ + * (Original) \ + * tmp12 = -2.613125930 * z10 + z5; \ + * \ + * (This implementation) \ + * tmp12 = (-1.613125930 - 1) * z10 + z5; \ + * = -1.613125930 * z10 - z10 + z5; \ + */ \ + \ + z5 = vec_add(z10s, z12s); \ + z5 = vec_madds(z5, pw_F1847, pw_zero); \ + \ + tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \ + tmp10 = vec_sub(tmp10, z5); \ + tmp12 = vec_madds(z10s, pw_MF1613, z5); \ + tmp12 = vec_sub(tmp12, z10); \ + \ + tmp6 = vec_sub(tmp12, tmp7); \ + tmp5 = vec_sub(tmp11, tmp6); \ + tmp4 = vec_add(tmp10, tmp5); \ + \ + out0 = vec_add(tmp0, tmp7); \ + out1 = vec_add(tmp1, tmp6); \ + out2 = vec_add(tmp2, tmp5); \ + out3 = vec_sub(tmp3, tmp4); \ + out4 = vec_add(tmp3, tmp4); \ + out5 = vec_sub(tmp2, tmp5); \ + out6 = vec_sub(tmp1, tmp6); \ + out7 = vec_sub(tmp0, tmp7); \ +} + +/* Dequantizes coef_block with dct_table, runs two DO_IDCT passes, shifts right by PASS1_BITS + 3, packs each row with saturation, adds CENTERJSAMPLE, and writes it to output_buf[row] + output_col with two vec_ste stores. When col1..col7 are all zero, pass 1 is replaced by splatting the dequantized col0. */ +void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + short *dct_table = (short *)dct_table_; + int *outptr; + + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + z5, z10, z10s, z11, z12s, z13, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector signed char outb; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) }, + pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) }, + pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) }, + pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) }; + __vector unsigned short + pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }, + pass1_bits3 = { __8X(PASS1_BITS + 3) }; + __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; + + /* Pass 1: process columns */ + + col0 = vec_ld(0, coef_block); + col1 = vec_ld(16, coef_block); + col2 = vec_ld(32, coef_block); + col3 = vec_ld(48, coef_block); + col4 = vec_ld(64, coef_block); +
col5 = vec_ld(80, coef_block); + col6 = vec_ld(96, coef_block); + col7 = vec_ld(112, coef_block); + + tmp1 = vec_or(col1, col2); + tmp2 = vec_or(col3, col4); + tmp1 = vec_or(tmp1, tmp2); + tmp3 = vec_or(col5, col6); + tmp3 = vec_or(tmp3, col7); + tmp1 = vec_or(tmp1, tmp3); + + quant0 = vec_ld(0, dct_table); + col0 = vec_mladd(col0, quant0, pw_zero); + + if (vec_all_eq(tmp1, pw_zero)) { + /* AC terms all zero */ + + row0 = vec_splat(col0, 0); + row1 = vec_splat(col0, 1); + row2 = vec_splat(col0, 2); + row3 = vec_splat(col0, 3); + row4 = vec_splat(col0, 4); + row5 = vec_splat(col0, 5); + row6 = vec_splat(col0, 6); + row7 = vec_splat(col0, 7); + + } else { + + quant1 = vec_ld(16, dct_table); + quant2 = vec_ld(32, dct_table); + quant3 = vec_ld(48, dct_table); + quant4 = vec_ld(64, dct_table); + quant5 = vec_ld(80, dct_table); + quant6 = vec_ld(96, dct_table); + quant7 = vec_ld(112, dct_table); + + col1 = vec_mladd(col1, quant1, pw_zero); + col2 = vec_mladd(col2, quant2, pw_zero); + col3 = vec_mladd(col3, quant3, pw_zero); + col4 = vec_mladd(col4, quant4, pw_zero); + col5 = vec_mladd(col5, quant5, pw_zero); + col6 = vec_mladd(col6, quant6, pw_zero); + col7 = vec_mladd(col7, quant7, pw_zero); + + DO_IDCT(col); + + TRANSPOSE(out, row); + } + + /* Pass 2: process rows */ + + DO_IDCT(row); + + out0 = vec_sra(out0, pass1_bits3); + out1 = vec_sra(out1, pass1_bits3); + out2 = vec_sra(out2, pass1_bits3); + out3 = vec_sra(out3, pass1_bits3); + out4 = vec_sra(out4, pass1_bits3); + out5 = vec_sra(out5, pass1_bits3); + out6 = vec_sra(out6, pass1_bits3); + out7 = vec_sra(out7, pass1_bits3); + + TRANSPOSE(out, col); + + outb = vec_packs(col0, col0); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col1, col1); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); +
vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col2, col2); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col3, col3); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctint-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctint-altivec.c new file mode 100644 index 0000000000..60e619f11d --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctint-altivec.c @@ -0,0 +1,357 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software.
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* ACCURATE INTEGER INVERSE DCT */ + +#include "jsimd_altivec.h" + +/* Fixed-point multipliers: the constant named in each FIX() comment times 2^CONST_BITS, rounded. */ +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +/* One 1-D IDCT pass over in##0..in##7 into out0..out7; PASS selects pd_descale_p<PASS> (added before the final shift) and descale_p<PASS> (the shift count). */ +#define DO_IDCT(in, PASS) { \ + /* Even part \ + * \ + * (Original) \ + * z1 = (z2 + z3) * 0.541196100; \ + * tmp2 = z1 + z3 * -1.847759065; \ + * tmp3 = z1 + z2 * 0.765366865; \ + * \ + * (This implementation) \ + * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ + * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ + */ \ + \ + in##26l = vec_mergeh(in##2, in##6); \ + in##26h = vec_mergel(in##2, in##6); \ + \ + tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \ + tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \ +
tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \ + tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \ + \ + tmp0 = vec_add(in##0, in##4); \ + tmp1 = vec_sub(in##0, in##4); \ + \ + tmp0l = vec_unpackh(tmp0); \ + tmp0h = vec_unpackl(tmp0); \ + tmp0l = vec_sl(tmp0l, const_bits); \ + tmp0h = vec_sl(tmp0h, const_bits); \ + tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \ + tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \ + \ + tmp10l = vec_add(tmp0l, tmp3l); \ + tmp10h = vec_add(tmp0h, tmp3h); \ + tmp13l = vec_sub(tmp0l, tmp3l); \ + tmp13h = vec_sub(tmp0h, tmp3h); \ + \ + tmp1l = vec_unpackh(tmp1); \ + tmp1h = vec_unpackl(tmp1); \ + tmp1l = vec_sl(tmp1l, const_bits); \ + tmp1h = vec_sl(tmp1h, const_bits); \ + tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \ + tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \ + \ + tmp11l = vec_add(tmp1l, tmp2l); \ + tmp11h = vec_add(tmp1h, tmp2h); \ + tmp12l = vec_sub(tmp1l, tmp2l); \ + tmp12h = vec_sub(tmp1h, tmp2h); \ + \ + /* Odd part */ \ + \ + z3 = vec_add(in##3, in##7); \ + z4 = vec_add(in##1, in##5); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = vec_mergeh(z3, z4); \ + z34h = vec_mergel(z3, z4); \ + \ + z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \ + z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \ + z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \ + z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \ + \ + /* (Original) \ + * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ + * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ + * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * tmp0 += z1 + z3; tmp1 += z2 + z4; \ + * tmp2 += z2 + z3; tmp3 += z1 + z4; \ + * \ + * (This implementation) \ + * tmp0 = tmp0 *
(0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ + * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ + * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ + * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ + * tmp0 += z3; tmp1 += z4; \ + * tmp2 += z3; tmp3 += z4; \ + */ \ + \ + in##71l = vec_mergeh(in##7, in##1); \ + in##71h = vec_mergel(in##7, in##1); \ + \ + tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \ + tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \ + tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \ + tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \ + \ + in##53l = vec_mergeh(in##5, in##3); \ + in##53h = vec_mergel(in##5, in##3); \ + \ + tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \ + tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \ + tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \ + tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \ + \ + /* Final output stage */ \ + \ + out0l = vec_add(tmp10l, tmp3l); \ + out0h = vec_add(tmp10h, tmp3h); \ + out7l = vec_sub(tmp10l, tmp3l); \ + out7h = vec_sub(tmp10h, tmp3h); \ + \ + out0l = vec_sra(out0l, descale_p##PASS); \ + out0h = vec_sra(out0h, descale_p##PASS); \ + out7l = vec_sra(out7l, descale_p##PASS); \ + out7h = vec_sra(out7h, descale_p##PASS); \ + \ + out0 = vec_pack(out0l, out0h); \ + out7 = vec_pack(out7l, out7h); \ + \ + out1l = vec_add(tmp11l, tmp2l); \ + out1h = vec_add(tmp11h, tmp2h); \ + out6l = vec_sub(tmp11l, tmp2l); \ + out6h = vec_sub(tmp11h, tmp2h); \ + \ + out1l = vec_sra(out1l, descale_p##PASS); \ + out1h = vec_sra(out1h, descale_p##PASS); \ + out6l = vec_sra(out6l, descale_p##PASS); \ + out6h = vec_sra(out6h, descale_p##PASS); \ + \ + out1 = vec_pack(out1l, out1h); \ + out6 = vec_pack(out6l, out6h); \ + \ + out2l = vec_add(tmp12l, tmp1l); \ + out2h = vec_add(tmp12h, tmp1h); \ + out5l = vec_sub(tmp12l, tmp1l); \ + out5h = vec_sub(tmp12h, tmp1h); \ + \ + out2l = vec_sra(out2l, descale_p##PASS); \ + out2h = vec_sra(out2h,
descale_p##PASS); \ + out5l = vec_sra(out5l, descale_p##PASS); \ + out5h = vec_sra(out5h, descale_p##PASS); \ + \ + out2 = vec_pack(out2l, out2h); \ + out5 = vec_pack(out5l, out5h); \ + \ + out3l = vec_add(tmp13l, tmp0l); \ + out3h = vec_add(tmp13h, tmp0h); \ + out4l = vec_sub(tmp13l, tmp0l); \ + out4h = vec_sub(tmp13h, tmp0h); \ + \ + out3l = vec_sra(out3l, descale_p##PASS); \ + out3h = vec_sra(out3h, descale_p##PASS); \ + out4l = vec_sra(out4l, descale_p##PASS); \ + out4h = vec_sra(out4h, descale_p##PASS); \ + \ + out3 = vec_pack(out3l, out3h); \ + out4 = vec_pack(out4l, out4h); \ +} + +/* Dequantizes coef_block with dct_table, runs DO_IDCT(col, 1) and DO_IDCT(row, 2), then packs each row with saturation, adds CENTERJSAMPLE, and writes it to output_buf[row] + output_col with two vec_ste stores. When col1..col7 are all zero, pass 1 is replaced by splatting col0 << PASS1_BITS. */ +void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + short *dct_table = (short *)dct_table_; + int *outptr; + + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, + tmp0, tmp1, tmp2, tmp3, z3, z4, + z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, + row71l, row71h, row26l, row26h, row53l, row53h, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, + tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, + z3l, z3h, z4l, z4h, + out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, + out5l, out5h, out6l, out6h, out7l, out7h; + __vector signed char outb; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, + pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, + pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, + pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, + pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, + pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, + pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, + pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }; + __vector unsigned
short pass1_bits = { __8X(PASS1_BITS) }; + __vector int pd_zero = { __4X(0) }, + pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, + pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; + __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, + descale_p2 = { __4X(DESCALE_P2) }, + const_bits = { __4X(CONST_BITS) }; + __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; + + /* Pass 1: process columns */ + + col0 = vec_ld(0, coef_block); + col1 = vec_ld(16, coef_block); + col2 = vec_ld(32, coef_block); + col3 = vec_ld(48, coef_block); + col4 = vec_ld(64, coef_block); + col5 = vec_ld(80, coef_block); + col6 = vec_ld(96, coef_block); + col7 = vec_ld(112, coef_block); + + tmp1 = vec_or(col1, col2); + tmp2 = vec_or(col3, col4); + tmp1 = vec_or(tmp1, tmp2); + tmp3 = vec_or(col5, col6); + tmp3 = vec_or(tmp3, col7); + tmp1 = vec_or(tmp1, tmp3); + + quant0 = vec_ld(0, dct_table); + col0 = vec_mladd(col0, quant0, pw_zero); + + if (vec_all_eq(tmp1, pw_zero)) { + /* AC terms all zero */ + + col0 = vec_sl(col0, pass1_bits); + + row0 = vec_splat(col0, 0); + row1 = vec_splat(col0, 1); + row2 = vec_splat(col0, 2); + row3 = vec_splat(col0, 3); + row4 = vec_splat(col0, 4); + row5 = vec_splat(col0, 5); + row6 = vec_splat(col0, 6); + row7 = vec_splat(col0, 7); + + } else { + + quant1 = vec_ld(16, dct_table); + quant2 = vec_ld(32, dct_table); + quant3 = vec_ld(48, dct_table); + quant4 = vec_ld(64, dct_table); + quant5 = vec_ld(80, dct_table); + quant6 = vec_ld(96, dct_table); + quant7 = vec_ld(112, dct_table); + + col1 = vec_mladd(col1, quant1, pw_zero); + col2 = vec_mladd(col2, quant2, pw_zero); + col3 = vec_mladd(col3, quant3, pw_zero); + col4 = vec_mladd(col4, quant4, pw_zero); + col5 = vec_mladd(col5, quant5, pw_zero); + col6 = vec_mladd(col6, quant6, pw_zero); + col7 = vec_mladd(col7, quant7, pw_zero); + + DO_IDCT(col, 1); + + TRANSPOSE(out, row); + } + + /* Pass 2: process rows */ + + DO_IDCT(row, 2); + + TRANSPOSE(out, col); + + outb = vec_packs(col0, col0); + outb =
vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col1, col1); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col2, col2); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col3, col3); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jquanti-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jquanti-altivec.c new file mode 100644 index 0000000000..7d6e32542b --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jquanti-altivec.c @@ -0,0 +1,250 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander.
All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ + +#include "jsimd_altivec.h" + + +/* NOTE: The address will either be aligned or offset by 8 bytes, so we can + * always get the data we want by using a single vector load (although we may + * have to permute the result.) 
+ */ +#if __BIG_ENDIAN__ + +#define LOAD_ROW(row) { \ + elemptr = sample_data[row] + start_col; \ + in##row = vec_ld(0, elemptr); \ + if ((size_t)elemptr & 15) \ + in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \ +} + +#else + +#define LOAD_ROW(row) { \ + elemptr = sample_data[row] + start_col; \ + in##row = vec_vsx_ld(0, elemptr); \ +} + +#endif + + +void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + JSAMPROW elemptr; + + __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7; + __vector short out0, out1, out2, out3, out4, out5, out6, out7; + + /* Constants */ + __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) }; + __vector unsigned char pb_zero = { __16X(0) }; + + LOAD_ROW(0); + LOAD_ROW(1); + LOAD_ROW(2); + LOAD_ROW(3); + LOAD_ROW(4); + LOAD_ROW(5); + LOAD_ROW(6); + LOAD_ROW(7); + + out0 = (__vector short)VEC_UNPACKHU(in0); + out1 = (__vector short)VEC_UNPACKHU(in1); + out2 = (__vector short)VEC_UNPACKHU(in2); + out3 = (__vector short)VEC_UNPACKHU(in3); + out4 = (__vector short)VEC_UNPACKHU(in4); + out5 = (__vector short)VEC_UNPACKHU(in5); + out6 = (__vector short)VEC_UNPACKHU(in6); + out7 = (__vector short)VEC_UNPACKHU(in7); + + out0 = vec_sub(out0, pw_centerjsamp); + out1 = vec_sub(out1, pw_centerjsamp); + out2 = vec_sub(out2, pw_centerjsamp); + out3 = vec_sub(out3, pw_centerjsamp); + out4 = vec_sub(out4, pw_centerjsamp); + out5 = vec_sub(out5, pw_centerjsamp); + out6 = vec_sub(out6, pw_centerjsamp); + out7 = vec_sub(out7, pw_centerjsamp); + + vec_st(out0, 0, workspace); + vec_st(out1, 16, workspace); + vec_st(out2, 32, workspace); + vec_st(out3, 48, workspace); + vec_st(out4, 64, workspace); + vec_st(out5, 80, workspace); + vec_st(out6, 96, workspace); + vec_st(out7, 112, workspace); +} + + +#define WORD_BIT 16 + +/* There is no AltiVec 16-bit unsigned multiply instruction, hence this. + We basically need an unsigned equivalent of vec_madds(). 
*/
+
+/* MULTIPLY(vs0, vs1, out): lane-wise unsigned 16-bit multiply.  vec_mule() and
+ * vec_mulo() produce the 32-bit products of the even and odd lanes; vec_perm()
+ * with shift_pack_index then interleaves two bytes from each product back into
+ * eight 16-bit lanes.  Presumably the selected bytes are the upper half of
+ * each product (the index table differs per endianness) -- confirm.
+ * Requires tmpe, tmpo and shift_pack_index to be declared by the caller. */
+#define MULTIPLY(vs0, vs1, out) { \
+  tmpe = vec_mule((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  tmpo = vec_mulo((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
+                                 (__vector unsigned short)tmpo, \
+                                 shift_pack_index); \
+}
+
+/*
+ * Quantize the 8x8 block in workspace and write the result to coef_block.
+ *
+ * Per lane: take the absolute value, add the correction term, multiply by the
+ * reciprocal, multiply by the scale, then restore the original sign.  The
+ * three tables are read from divisors at byte offsets 0 (reciprocals),
+ * DCTSIZE2 * 2 (corrections) and DCTSIZE2 * 4 (scales).  All accesses use
+ * vec_ld()/vec_st(), so the three pointers are presumably expected to be
+ * 16-byte aligned -- confirm with the caller.
+ */
+void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
+                            DCTELEM *workspace)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
+    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
+    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+  __vector unsigned int tmpe, tmpo;
+
+  /* Constants */
+  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+#if __BIG_ENDIAN__
+  __vector unsigned char shift_pack_index =
+    {  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29 };
+#else
+  __vector unsigned char shift_pack_index =
+    {  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
+#endif
+
+  /* Load the eight input rows */
+  row0 = vec_ld(0, workspace);
+  row1 = vec_ld(16, workspace);
+  row2 = vec_ld(32, workspace);
+  row3 = vec_ld(48, workspace);
+  row4 = vec_ld(64, workspace);
+  row5 = vec_ld(80, workspace);
+  row6 = vec_ld(96, workspace);
+  row7 = vec_ld(112, workspace);
+
+  /* Branch-less absolute value */
+  /* rowNs = arithmetic shift by WORD_BIT - 1: all ones for negative lanes,
+     zero otherwise; (x ^ s) - s then yields |x| */
+  row0s = vec_sra(row0, pw_word_bit_m1);
+  row1s = vec_sra(row1, pw_word_bit_m1);
+  row2s = vec_sra(row2, pw_word_bit_m1);
+  row3s = vec_sra(row3, pw_word_bit_m1);
+  row4s = vec_sra(row4, pw_word_bit_m1);
+  row5s = vec_sra(row5, pw_word_bit_m1);
+  row6s = vec_sra(row6, pw_word_bit_m1);
+  row7s = vec_sra(row7, pw_word_bit_m1);
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
+  /* Correction terms: second table in divisors (byte offset DCTSIZE2 * 2) */
+  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
+  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
+  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
+  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
+  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
+  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
+  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
+  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
+
+  row0 = vec_add(row0, corr0);
+  row1 = vec_add(row1, corr1);
+  row2 = vec_add(row2, corr2);
+  row3 = vec_add(row3, corr3);
+  row4 = vec_add(row4, corr4);
+  row5 = vec_add(row5, corr5);
+  row6 = vec_add(row6, corr6);
+  row7 = vec_add(row7, corr7);
+
+  /* Reciprocals: first table in divisors (byte offset 0) */
+  recip0 = vec_ld(0, divisors);
+  recip1 = vec_ld(16, divisors);
+  recip2 = vec_ld(32, divisors);
+  recip3 = vec_ld(48, divisors);
+  recip4 = vec_ld(64, divisors);
+  recip5 = vec_ld(80, divisors);
+  recip6 = vec_ld(96, divisors);
+  recip7 = vec_ld(112, divisors);
+
+  MULTIPLY(row0, recip0, row0);
+  MULTIPLY(row1, recip1, row1);
+  MULTIPLY(row2, recip2, row2);
+  MULTIPLY(row3, recip3, row3);
+  MULTIPLY(row4, recip4, row4);
+  MULTIPLY(row5, recip5, row5);
+  MULTIPLY(row6, recip6, row6);
+  MULTIPLY(row7, recip7, row7);
+
+  /* Scales: third table in divisors (byte offset DCTSIZE2 * 4) */
+  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
+  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
+  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
+  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
+  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
+  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
+  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
+  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
+
+  MULTIPLY(row0, scale0, row0);
+  MULTIPLY(row1, scale1, row1);
+  MULTIPLY(row2, scale2, row2);
+  MULTIPLY(row3, scale3, row3);
+  MULTIPLY(row4, scale4, row4);
+  MULTIPLY(row5, scale5, row5);
+  MULTIPLY(row6, scale6, row6);
+  MULTIPLY(row7, scale7, row7);
+
+  /* Restore the sign saved in rowNs: (x ^ s) - s negates the lanes whose
+     mask is all ones and leaves the others unchanged */
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
+  /* Write the eight quantized rows */
+  vec_st(row0, 0, coef_block);
+  vec_st(row1, 16, coef_block);
+  vec_st(row2, 32, coef_block);
+  vec_st(row3, 48, coef_block);
+  vec_st(row4, 64, coef_block);
+  vec_st(row5, 80, coef_block);
+  vec_st(row6, 96, coef_block);
+  vec_st(row7, 112, coef_block);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c
new file mode 100644
index 0000000000..b9e86dcfac
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c
@@ -0,0 +1,881 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */ + +#ifdef __amigaos4__ +/* This must be defined first as it re-defines GLOBAL otherwise */ +#include +#endif + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include +#include +#include + +#if defined(__OpenBSD__) +#include +#include +#include +#elif defined(__FreeBSD__) +#include +#include +#endif + +static unsigned int simd_support = ~0; + +#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_feature(char *buffer, char *feature) +{ + char *p; + + if (*feature == 0) + return 0; + if (strncmp(buffer, "cpu", 3) != 0) + return 0; + buffer += 3; + while (isspace(*buffer)) + buffer++; + + /* Check if 'feature' is present in the buffer as a separate word */ + while ((p = strstr(buffer, feature))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(feature); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo(int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + + simd_support = 0; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "altivec")) + simd_support |= JSIMD_ALTIVEC; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. 
+ */
+/* The probe runs at most once per process: it returns immediately unless
+ * simd_support still holds the ~0 sentinel.  Otherwise it clears
+ * simd_support, sets JSIMD_ALTIVEC through whichever platform branch was
+ * compiled in, and finally applies the JSIMD_FORCEALTIVEC / JSIMD_FORCENONE
+ * environment overrides (unless NO_GETENV is defined). */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+  /* Scratch variables for the platform-specific probe selected below */
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#elif defined(__amigaos4__)
+  uint32 altivec = 0;
+#elif defined(__OpenBSD__)
+  int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+  int altivec;
+  size_t len = sizeof(altivec);
+#elif defined(__FreeBSD__)
+  unsigned long cpufeatures = 0;
+#endif
+
+  /* Already probed */
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ALTIVEC__) || defined(__APPLE__)
+  /* No run-time probe in this configuration: AltiVec is assumed */
+  simd_support |= JSIMD_ALTIVEC;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* parse_proc_cpuinfo() returns 0 when its line buffer was too small (or
+     could not be allocated); retry with a doubled size up to the limit */
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__amigaos4__)
+  IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
+  if (altivec == VECTORTYPE_ALTIVEC)
+    simd_support |= JSIMD_ALTIVEC;
+#elif defined(__OpenBSD__)
+  if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
+    simd_support |= JSIMD_ALTIVEC;
+#elif defined(__FreeBSD__)
+  /* The return value is not checked; cpufeatures was initialized to 0, so a
+     failed query leaves AltiVec disabled */
+  elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+  if (cpufeatures & PPC_FEATURE_HAS_ALTIVEC)
+    simd_support |= JSIMD_ALTIVEC;
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  /* Only the exact value "1" is honored; JSIMD_FORCENONE is evaluated last
+     and therefore wins when both variables are set */
+  env = getenv("JSIMD_FORCEALTIVEC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_ALTIVEC;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+/* Returns 1 when the AltiVec routine behind jsimd_rgb_ycc_convert() may be
+ * used: the compile-time sample/pixel sizes match what that code was written
+ * for and init_simd() detected JSIMD_ALTIVEC.  Returns 0 otherwise. */
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+
init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_extrgb_ycc_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_extrgbx_ycc_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_extbgr_ycc_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_extbgrx_ycc_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_extxbgr_ycc_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_extxrgb_ycc_convert_altivec; + break; + default: + altivecfct = jsimd_rgb_ycc_convert_altivec; + break; + } + + altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_extrgb_gray_convert_altivec; + break; + case 
JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_extrgbx_gray_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_extbgr_gray_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_extbgrx_gray_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_extxbgr_gray_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_extxrgb_gray_convert_altivec; + break; + default: + altivecfct = jsimd_rgb_gray_convert_altivec; + break; + } + + altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_ycc_extrgb_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_ycc_extrgbx_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_ycc_extbgr_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_ycc_extbgrx_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_ycc_extxbgr_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_ycc_extxrgb_convert_altivec; + break; + default: + altivecfct = jsimd_ycc_rgb_convert_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & 
JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) 
+jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec; + break; + case 
JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec; + break; + default: + altivecfct = jsimd_h2v2_merged_upsample_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec; + break; + default: + altivecfct = jsimd_h2v1_merged_upsample_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & 
JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_altivec(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + jsimd_fdct_islow_altivec(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_altivec(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + jsimd_quantize_altivec(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, 
jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} 
+ +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd_altivec.h b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd_altivec.h new file mode 100644 index 0000000000..e8bdb06a54 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd_altivec.h @@ -0,0 +1,98 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include + + +/* Common code */ + +#define __4X(a) a, a, a, a +#define __4X2(a, b) a, b, a, b, a, b, a, b +#define __8X(a) __4X(a), __4X(a) +#define __16X(a) __8X(a), __8X(a) + +#define TRANSPOSE(row, col) { \ + __vector short row04l, row04h, row15l, row15h, \ + row26l, row26h, row37l, row37h; \ + __vector short col01e, col01o, col23e, col23o, \ + col45e, col45o, col67e, col67o; \ + \ + /* transpose coefficients (phase 1) */ \ + row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \ + row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \ + row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \ + row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \ + row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \ + row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \ + row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \ + row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \ + \ + /* transpose coefficients (phase 2) */ \ + col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \ + col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \ + col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \ + col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \ + col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \ + col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \ + col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \ + col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \ + \ + /* transpose coefficients (phase 3) */ \ + 
col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \ + col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \ + col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \ + col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \ + col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \ + col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \ + col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \ + col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \ +} + +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif + + +/* Macros to abstract big/little endian bit twiddling */ + +#if __BIG_ENDIAN__ + +#define VEC_LD(a, b) vec_ld(a, b) +#define VEC_ST(a, b, c) vec_st(a, b, c) +#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a) +#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a) + +#else + +#define VEC_LD(a, b) vec_vsx_ld(a, b) +#define VEC_ST(a, b, c) vec_vsx_st(a, b, c) +#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero) +#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero) + +#endif diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm new file mode 100644 index 0000000000..ffb527db00 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm @@ -0,0 +1,559 @@ +; +; jccolext.asm - colorspace conversion (64-bit AVX2) +; +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +; r10d = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13d = JDIMENSION output_row +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 8 + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2) + +EXTN(jsimd_rgb_ycc_convert_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d + test rcx, rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rdx + push rbx + push rdi + push rsi + push rcx ; col + + mov rsip, JSAMPROW [rsi] ; inptr + mov rdip, JSAMPROW [rdi] ; outptr0 + mov rbxp, JSAMPROW [rbx] ; outptr1 + mov rdxp, JSAMPROW [rdx] ; outptr2 + + cmp rcx, byte SIZEOF_YMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, byte [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short 
.column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, word [rsi+rcx] + shl rax, WORD_BIT + or rax, rdx +.column_ld4: + vmovd xmmA, eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + vmovd xmmF, XMM_DWORD [rsi+rcx] + vpslldq xmmA, xmmA, SIZEOF_DWORD + vpor xmmA, xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + vmovq xmmB, XMM_MMWORD [rsi+rcx] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + sub rcx, byte SIZEOF_XMMWORD + vmovdqu xmmB, XMM_MMWORD [rsi+rcx] + vperm2i128 ymmA, ymmA, ymmA, 1 + vpor ymmA, ymmB +.column_ld32: + test cl, SIZEOF_YMMWORD + jz short .column_ld64 + sub rcx, byte SIZEOF_YMMWORD + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] +.column_ld64: + test cl, 2*SIZEOF_YMMWORD + mov rcx, SIZEOF_YMMWORD + jz short .rgb_ycc_cnv + vmovdqa ymmB, ymmA + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD] + +.rgb_ycc_cnv: + ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vmovdqu ymmC, ymmA + vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + 
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + + vmovdqa ymmG, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12 + ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I) + vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I + ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A + ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q) + vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27 + ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N) + + vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D + ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T) + vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F + ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V) + + vmovdqa ymmD, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09 + ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P) + vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P + ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T) + vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B + ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R) + + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E + ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F + ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V) + + vmovdqa ymmE, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C + ; 20 24 28 2C 
01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S) + vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S + ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D + ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T) + + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F + ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F + ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V) + + vpxor ymmH, ymmH, ymmH + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmB, ymmE + vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + + vmovdqa ymmF, ymmD + vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmF +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + vmovdqa xmmF, xmmA + vperm2i128 ymmF, ymmF, ymmF, 1 + vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] + vpor ymmA, ymmA, ymmF +.column_ld8: + test cl, 
SIZEOF_XMMWORD/2 + jz short .column_ld16 + sub rcx, byte SIZEOF_XMMWORD/2 + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld16: + test cl, SIZEOF_XMMWORD + mov rcx, SIZEOF_YMMWORD + jz short .rgb_ycc_cnv + vmovdqa ymmE, ymmA + vmovdqa ymmH, ymmF + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD] + vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD] + +.rgb_ycc_cnv: + ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmB, ymmA + vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + + vmovdqa ymmB, ymmF + vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmD, ymmA + vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35 + ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L) + vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37 + ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N) + + vmovdqa 
ymmC, ymmF + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D + ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F + ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V) + + vmovdqa ymmB, ymmA + vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S) + vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D + ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T) + + vmovdqa ymmG, ymmD + vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E + ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U) + vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F + ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V) + + vmovdqa ymmE, ymmA + vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E + ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U) + + vmovdqa ymmH, ymmB + vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F + ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F + ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V) + + vpxor ymmF, ymmF, ymmF + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmD, ymmB + vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + + vmovdqa ymmG, ymmE + vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 
2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U) + + vpunpcklbw ymmF, ymmF, ymmH + vpunpckhbw ymmH, ymmH, ymmH + vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE + ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE + vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO + vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE + vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO + + vmovdqa ymm6, ymm1 + vpunpcklwd ymm1, ymm1, ymm3 + vpunpckhwd ymm6, ymm6, ymm3 + vmovdqa ymm7, ymm1 + vmovdqa ymm4, ymm6 + vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337) + vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337) + vpmaddwd ymm7, ymm7, [rel PW_MF016_MF033] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + vpxor ymm1, ymm1, ymm1 + vpxor ymm6, ymm6, ymm6 + vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL + vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH + vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500) + vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500) + + vmovdqa 
ymm5, [rel PD_ONEHALFM1_CJ] ; ymm5=[PD_ONEHALFM1_CJ] + + vpaddd ymm7, ymm7, ymm1 + vpaddd ymm4, ymm4, ymm6 + vpaddd ymm7, ymm7, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH + vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO + + vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE + + vmovdqa ymm6, ymm0 + vpunpcklwd ymm0, ymm0, ymm2 + vpunpckhwd ymm6, ymm6, ymm2 + vmovdqa ymm5, ymm0 + vmovdqa ymm4, ymm6 + vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337) + vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337) + vpmaddwd ymm5, ymm5, [rel PW_MF016_MF033] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + vpxor ymm0, ymm0, ymm0 + vpxor ymm6, ymm6, ymm6 + vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL + vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH + vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500) + vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500) + + vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ] + + vpaddd ymm5, ymm5, ymm0 + vpaddd ymm4, ymm4, ymm6 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm4, ymm4, ymm1 + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH + vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE + + vpsllw ymm7, ymm7, BYTE_BIT + vpor ymm5, ymm5, ymm7 ; ymm5=Cb + vmovdqu YMMWORD [rbx], ymm5 ; Save Cb + + vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO + vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE + vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO + + vmovdqa ymm4, ymm0 + vpunpcklwd ymm0, ymm0, ymm3 + vpunpckhwd ymm4, ymm4, ymm3 + vmovdqa ymm7, ymm0 + vmovdqa ymm5, ymm4 + vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250) + vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250) + vpmaddwd ymm7, ymm7, [rel 
PW_MF008_MF041] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF] + + vpaddd ymm0, ymm0, YMMWORD [wk(4)] + vpaddd ymm4, ymm4, YMMWORD [wk(5)] + vpaddd ymm0, ymm0, ymm3 + vpaddd ymm4, ymm4, ymm3 + vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH + vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO + + vpxor ymm3, ymm3, ymm3 + vpxor ymm4, ymm4, ymm4 + vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL + vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH + vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500) + vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500) + + vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ] + + vpaddd ymm7, ymm7, ymm3 + vpaddd ymm5, ymm5, ymm4 + vpaddd ymm7, ymm7, ymm1 + vpaddd ymm5, ymm5, ymm1 + vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH + vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO + + vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE + + vmovdqa ymm4, ymm6 + vpunpcklwd ymm6, ymm6, ymm2 + vpunpckhwd ymm4, ymm4, ymm2 + vmovdqa ymm1, ymm6 + vmovdqa ymm5, ymm4 + vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250) + vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250) + vpmaddwd ymm1, ymm1, [rel PW_MF008_MF041] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF] + + vpaddd ymm6, ymm6, YMMWORD [wk(6)] + vpaddd ymm4, ymm4, YMMWORD [wk(7)] + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm4, ymm4, ymm2 + vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH + vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE + + vpsllw ymm0, ymm0, BYTE_BIT + vpor ymm6, ymm6, ymm0 ; ymm6=Y + vmovdqu YMMWORD [rdi], ymm6 ; Save Y + + vpxor ymm2, ymm2, ymm2 + vpxor ymm4, ymm4, ymm4 + vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL + vpunpckhwd ymm4, ymm4, ymm3 ; 
ymm4=REH
+    vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
+    vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
+
+    vmovdqa ymm0, [rel PD_ONEHALFM1_CJ] ; ymm0=[PD_ONEHALFM1_CJ]
+
+    vpaddd ymm1, ymm1, ymm2
+    vpaddd ymm5, ymm5, ymm4
+    vpaddd ymm1, ymm1, ymm0
+    vpaddd ymm5, ymm5, ymm0
+    vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
+    vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
+    vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
+
+    vpsllw ymm7, ymm7, BYTE_BIT
+    vpor ymm1, ymm1, ymm7 ; ymm1=Cr
+    vmovdqu YMMWORD [rdx], ymm1 ; Save Cr
+
+    sub rcx, byte SIZEOF_YMMWORD
+    add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+    add rdi, byte SIZEOF_YMMWORD ; outptr0
+    add rbx, byte SIZEOF_YMMWORD ; outptr1
+    add rdx, byte SIZEOF_YMMWORD ; outptr2
+    cmp rcx, byte SIZEOF_YMMWORD
+    jae near .columnloop
+    test rcx, rcx
+    jnz near .column_ld1
+
+    pop rcx ; col
+    pop rsi
+    pop rdi
+    pop rbx
+    pop rdx
+
+    add rsi, byte SIZEOF_JSAMPROW ; input_buf
+    add rdi, byte SIZEOF_JSAMPROW
+    add rbx, byte SIZEOF_JSAMPROW
+    add rdx, byte SIZEOF_JSAMPROW
+    dec rax ; num_rows
+    jg near .rowloop
+
+.return:
+    pop rbx
+    vzeroupper
+    uncollect_args 5
+    mov rsp, rbp ; rsp <- aligned rbp
+    pop rsp ; rsp <- original rbp
+    pop rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align 32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm
new file mode 100644
index 0000000000..af70ed6010
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm
@@ -0,0 +1,484 @@
+;
+; jccolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+
+    align 32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+    push rbp
+    mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+    sub rsp, byte 4
+    and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+    mov [rsp], rax ; stash it at the aligned slot; the epilogue's "pop rsp" reloads it
+    mov rbp, rsp ; rbp = aligned rbp
+    lea rsp, [wk(0)] ; reserve the WK_NUM xmmword scratch slots below rbp
+    collect_args 5
+    push rbx
+
+    mov ecx, r10d ; ecx = img_width
+    test rcx, rcx
+    jz near .return ; zero-width image: nothing to convert
+
+    push rcx ; keep img_width while rcx is reused for output_row
+
+    mov rsi, r12 ; rsi = output_buf
+    mov ecx, r13d ; ecx = output_row
+    mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] ; rdi = output_buf[0] (receives Y)
+    mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] ; rbx = output_buf[1] (receives Cb)
+    mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] ; rdx = output_buf[2] (receives Cr)
+    lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] ; step each plane to row output_row
+    lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop rcx ; rcx = img_width again
+
+    mov rsi, r11 ; rsi = input_buf
+    mov eax, r14d ; eax = num_rows
+    test rax, rax
+    jle near .return ; nothing to do when num_rows == 0
+.rowloop:
+    push rdx
+    push rbx
+    push rdi
+    push rsi
+    push rcx ; col
+
+    mov rsip, JSAMPROW [rsi] ; inptr
+    mov rdip, JSAMPROW [rdi] ; outptr0
+    mov rbxp, JSAMPROW [rbx] ; outptr1
+    mov rdxp, JSAMPROW [rdx] ; outptr2
+
+    cmp rcx, byte SIZEOF_XMMWORD
+    jae near .columnloop ; a full SIZEOF_XMMWORD-pixel group remains: skip the partial loads
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+    push rax ; rax/rdx are scratch for the sub-dword loads; restored at .column_ld4
+    push rdx
+    lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+    test cl, SIZEOF_BYTE
+    jz short .column_ld2
+    sub rcx, byte SIZEOF_BYTE
+    movzx rax, byte [rsi+rcx]
+.column_ld2:
+    test cl, SIZEOF_WORD
+    jz short .column_ld4
+    sub rcx, byte SIZEOF_WORD
+    movzx rdx, word [rsi+rcx]
+    shl rax, WORD_BIT
+    or rax, rdx
+.column_ld4:
+    movd xmmA, eax
+    pop rdx
+    pop rax
+    test cl, SIZEOF_DWORD
+    jz short .column_ld8
+    sub rcx, byte SIZEOF_DWORD
+    movd xmmF, XMM_DWORD [rsi+rcx]
+    pslldq xmmA, SIZEOF_DWORD
+    por xmmA, xmmF
+.column_ld8:
+    test cl, SIZEOF_MMWORD
+    jz short .column_ld16
+    sub rcx, byte SIZEOF_MMWORD
+    movq xmmB, XMM_MMWORD [rsi+rcx]
+    pslldq xmmA, SIZEOF_MMWORD
+    por xmmA, xmmB
+.column_ld16:
+    test cl, SIZEOF_XMMWORD
+    jz short .column_ld32
+    movdqa xmmF, xmmA
+    movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    mov rcx, SIZEOF_XMMWORD ; count one full group so the "sub rcx" after the conversion reaches 0
+    jmp short .rgb_ycc_cnv
+.column_ld32:
+    test cl, 2*SIZEOF_XMMWORD
+    mov rcx, SIZEOF_XMMWORD ; mov does not alter the flags set by "test"
+    jz short .rgb_ycc_cnv
+    movdqa xmmB, xmmA
+    movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa xmmG, xmmA
+    pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa xmmD, xmmA
+    pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa xmmE, xmmA
+    pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor xmmH, xmmH
+
+    movdqa xmmC, xmmA
+    punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa xmmB, xmmE
+    punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa xmmF, xmmD
+    punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test cl, SIZEOF_XMMWORD/16
+    jz short .column_ld2
+    sub rcx, byte SIZEOF_XMMWORD/16
+    movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test cl, SIZEOF_XMMWORD/8
+    jz short .column_ld4
+    sub rcx, byte SIZEOF_XMMWORD/8
+    movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    pslldq xmmA, SIZEOF_MMWORD
+    por xmmA, xmmE
+.column_ld4:
+    test cl, SIZEOF_XMMWORD/4
+    jz short .column_ld8
+    sub rcx, byte SIZEOF_XMMWORD/4
+    movdqa xmmE, xmmA
+    movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+    test cl, SIZEOF_XMMWORD/2
+    mov rcx, SIZEOF_XMMWORD ; mov does not alter the flags set by "test"
+    jz short .rgb_ycc_cnv
+    movdqa xmmF, xmmA
+    movdqa xmmH, xmmE
+    movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+    movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa xmmD, xmmA
+    punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa xmmC, xmmF
+    punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa xmmB, xmmA
+    punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa xmmG, xmmD
+    punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa xmmE, xmmA
+    punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa xmmH, xmmB
+    punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor xmmF, xmmF
+
+    movdqa xmmC, xmmA
+    punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa xmmD, xmmB
+    punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa xmmG, xmmE
+    punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw xmmF, xmmH
+    punpckhbw xmmH, xmmH
+    psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+    movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+    movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+    movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+    movdqa xmm6, xmm1
+    punpcklwd xmm1, xmm3 ; interleave (R,G) words so pmaddwd can weight each pair
+    punpckhwd xmm6, xmm3
+    movdqa xmm7, xmm1
+    movdqa xmm4, xmm6
+    pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor xmm1, xmm1 ; zero low words: the interleave below leaves B in the high word of each dword
+    pxor xmm6, xmm6
+    punpcklwd xmm1, xmm5 ; xmm1=BOL
+    punpckhwd xmm6, xmm5 ; xmm6=BOH
+    psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
+    psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
+
+    movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+    paddd xmm7, xmm1
+    paddd xmm4, xmm6
+    paddd xmm7, xmm5
+    paddd xmm4, xmm5
+    psrld xmm7, SCALEBITS ; xmm7=CbOL
+    psrld xmm4, SCALEBITS ; xmm4=CbOH
+    packssdw xmm7, xmm4 ; xmm7=CbO
+
+    movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+    movdqa xmm6, xmm0
+    punpcklwd xmm0, xmm2
+    punpckhwd xmm6, xmm2
+    movdqa xmm5, xmm0
+    movdqa xmm4, xmm6
+    pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor xmm0, xmm0
+    pxor xmm6, xmm6
+    punpcklwd xmm0, xmm1 ; xmm0=BEL
+    punpckhwd xmm6, xmm1 ; xmm6=BEH
+    psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
+    psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
+
+    movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd xmm5, xmm0
+    paddd xmm4, xmm6
+    paddd xmm5, xmm1
+    paddd xmm4, xmm1
+    psrld xmm5, SCALEBITS ; xmm5=CbEL
+    psrld xmm4, SCALEBITS ; xmm4=CbEH
+    packssdw xmm5, xmm4 ; xmm5=CbE
+
+    psllw xmm7, BYTE_BIT ; odd-column results move up within each word (upper byte if BYTE_BIT is 8 - confirm)
+    por xmm5, xmm7 ; xmm5=Cb
+    movdqa XMMWORD [rbx], xmm5 ; Save Cb (movdqa store: outptr1 must be 16-byte aligned)
+
+    movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+    movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+    movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+    movdqa xmm4, xmm0
+    punpcklwd xmm0, xmm3
+    punpckhwd xmm4, xmm3
+    movdqa xmm7, xmm0
+    movdqa xmm5, xmm4
+    pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+    paddd xmm0, XMMWORD [wk(4)]
+    paddd xmm4, XMMWORD [wk(5)]
+    paddd xmm0, xmm3
+    paddd xmm4, xmm3
+    psrld xmm0, SCALEBITS ; xmm0=YOL
+    psrld xmm4, SCALEBITS ; xmm4=YOH
+    packssdw xmm0, xmm4 ; xmm0=YO
+
+    pxor xmm3, xmm3
+    pxor xmm4, xmm4
+    punpcklwd xmm3, xmm1 ; xmm3=ROL
+    punpckhwd xmm4, xmm1 ; xmm4=ROH
+    psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
+    psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
+
+    movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd xmm7, xmm3
+    paddd xmm5, xmm4
+    paddd xmm7, xmm1
+    paddd xmm5, xmm1
+    psrld xmm7, SCALEBITS ; xmm7=CrOL
+    psrld xmm5, SCALEBITS ; xmm5=CrOH
+    packssdw xmm7, xmm5 ; xmm7=CrO
+
+    movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+    movdqa xmm4, xmm6
+    punpcklwd xmm6, xmm2
+    punpckhwd xmm4, xmm2
+    movdqa xmm1, xmm6
+    movdqa xmm5, xmm4
+    pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+    paddd xmm6, XMMWORD [wk(6)]
+    paddd xmm4, XMMWORD [wk(7)]
+    paddd xmm6, xmm2
+    paddd xmm4, xmm2
+    psrld xmm6, SCALEBITS ; xmm6=YEL
+    psrld xmm4, SCALEBITS ; xmm4=YEH
+    packssdw xmm6, xmm4 ; xmm6=YE
+
+    psllw xmm0, BYTE_BIT
+    por xmm6, xmm0 ; xmm6=Y
+    movdqa XMMWORD [rdi], xmm6 ; Save Y
+
+    pxor xmm2, xmm2
+    pxor xmm4, xmm4
+    punpcklwd xmm2, xmm3 ; xmm2=REL
+    punpckhwd xmm4, xmm3 ; xmm4=REH
+    psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
+    psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
+
+    movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+    paddd xmm1, xmm2
+    paddd xmm5, xmm4
+    paddd xmm1, xmm0
+    paddd xmm5, xmm0
+    psrld xmm1, SCALEBITS ; xmm1=CrEL
+    psrld xmm5, SCALEBITS ; xmm5=CrEH
+    packssdw xmm1, xmm5 ; xmm1=CrE
+
+    psllw xmm7, BYTE_BIT
+    por xmm1, xmm7 ; xmm1=Cr
+    movdqa XMMWORD [rdx], xmm1 ; Save Cr
+
+    sub rcx, byte SIZEOF_XMMWORD
+    add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+    add rdi, byte SIZEOF_XMMWORD ; outptr0
+    add rbx, byte SIZEOF_XMMWORD ; outptr1
+    add rdx, byte SIZEOF_XMMWORD ; outptr2
+    cmp rcx, byte SIZEOF_XMMWORD
+    jae near .columnloop ; another full group of columns remains
+    test rcx, rcx
+    jnz near .column_ld1 ; leftover columns: assemble them with the partial loads
+
+    pop rcx ; col
+    pop rsi
+    pop rdi
+    pop rbx
+    pop rdx
+
+    add rsi, byte SIZEOF_JSAMPROW ; input_buf
+    add rdi, byte SIZEOF_JSAMPROW
+    add rbx, byte SIZEOF_JSAMPROW
+    add rdx, byte SIZEOF_JSAMPROW
+    dec rax ; num_rows
+    jg near .rowloop
+
+.return:
+    pop rbx
+    uncollect_args 5
+    mov rsp, rbp ; rsp <- aligned rbp
+    pop rsp ; rsp <- value stored by the prologue's "mov [rsp], rax" (address of the saved rbp)
+    pop rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align 32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm
new file mode 100644
index 0000000000..16b78298dc
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; the FIX() values below are the coefficient * 2^SCALEBITS, rounded
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000); G weight split so each part fits pmaddwd's signed words
+
+; --------------------------------------------------------------------------
+    SECTION SEG_CONST
+
+    alignz 32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337 ; (R, G) word pairs for Y; 16 words = one YMM operand
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250 ; (B, G) word pairs for Y
+PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331 ; (R, G) word pairs for Cb
+PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418 ; (B, G) word pairs for Cr
+PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
+                (CENTERJSAMPLE << SCALEBITS) ; Cb/Cr bias: (one half - 1) plus the CENTERJSAMPLE offset
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) ; Y rounding bias: one half in SCALEBITS fixed point
+
+    alignz 32
+
+; --------------------------------------------------------------------------
+    SECTION SEG_TEXT
+    BITS 64
+
+%include "jccolext-avx2.asm" ; first instantiation, using the RGB_* macros currently in effect
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE ; the included file has separate code paths for RGB_PIXELSIZE 3 and 4
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2 ; renames the symbol for the next %include (assumes jccolext-avx2.asm declares jsimd_rgb_ycc_convert_avx2 - confirm)
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2 ; RGBX variant
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2 ; BGR variant
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2 ; BGRX variant
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2 ; XBGR variant
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2 ; XRGB variant
+%include "jccolext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm
new file mode 100644
index 0000000000..e2955c2134
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_ycc_convert_sse2) + +EXTN(jconst_rgb_ycc_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331 +PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418 +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \ + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN 
EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 +%include "jccolext-sse2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm new file mode 100644 index 0000000000..591255bb11 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm @@ -0,0 +1,113 @@ +; +; jcgray.asm - grayscale colorspace conversion (64-bit AVX2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_avx2) + +EXTN(jconst_rgb_gray_convert_avx2): + +PW_F0299_F0337 times 8 dw F_0_299, F_0_337 +PW_F0114_F0250 times 8 dw F_0_114, F_0_250 +PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN 
EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2 +%include "jcgryext-avx2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm new file mode 100644 index 0000000000..e389904f2f --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm @@ -0,0 +1,112 @@ +; +; jcgray.asm - grayscale colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN 
EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryext-sse2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm new file mode 100644 index 0000000000..ddcc2c0a2f --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm @@ -0,0 +1,438 @@ +; +; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +; r10d = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13d = JDIMENSION output_row +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2) + +EXTN(jsimd_rgb_gray_convert_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d + test rcx, rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rdi + push rsi + push rcx ; col + + mov rsip, JSAMPROW [rsi] ; inptr + mov rdip, JSAMPROW [rdi] ; outptr0 + + cmp rcx, byte SIZEOF_YMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, byte [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, word [rsi+rcx] + shl rax, WORD_BIT + or rax, rdx +.column_ld4: + vmovd xmmA, eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + vmovd xmmF, XMM_DWORD [rsi+rcx] + vpslldq xmmA, xmmA, SIZEOF_DWORD + vpor xmmA, xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + vmovq xmmB, XMM_MMWORD [rsi+rcx] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD 
+ jz short .column_ld32 + sub rcx, byte SIZEOF_XMMWORD + vmovdqu xmmB, XMM_MMWORD [rsi+rcx] + vperm2i128 ymmA, ymmA, ymmA, 1 + vpor ymmA, ymmB +.column_ld32: + test cl, SIZEOF_YMMWORD + jz short .column_ld64 + sub rcx, byte SIZEOF_YMMWORD + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] +.column_ld64: + test cl, 2*SIZEOF_YMMWORD + mov rcx, SIZEOF_YMMWORD + jz short .rgb_gray_cnv + vmovdqa ymmB, ymmA + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD] + +.rgb_gray_cnv: + ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vmovdqu ymmC, ymmA + vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + + vmovdqa ymmG, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12 + ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I) + vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I + ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 
19 21 29 02 0A 12 1A + ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q) + vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27 + ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N) + + vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D + ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T) + vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F + ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V) + + vmovdqa ymmD, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09 + ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P) + vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P + ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T) + vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B + ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R) + + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E + ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F + ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V) + + vmovdqa ymmE, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C + ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S) + vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S + ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D + ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T) + + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F + ; 2G 2I 2K 2M 2O 
2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F + ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V) + + vpxor ymmH, ymmH, ymmH + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmB, ymmE + vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + + vmovdqa ymmF, ymmD + vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmF +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + vmovdqa xmmF, xmmA + vperm2i128 ymmF, ymmF, ymmF, 1 + vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] + vpor ymmA, ymmA, ymmF +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + jz short .column_ld16 + sub rcx, byte SIZEOF_XMMWORD/2 + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld16: + test cl, SIZEOF_XMMWORD + mov rcx, SIZEOF_YMMWORD + jz short .rgb_gray_cnv + vmovdqa ymmE, ymmA + vmovdqa ymmH, ymmF + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD] + 
vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD] + +.rgb_gray_cnv: + ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmB, ymmA + vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + + vmovdqa ymmB, ymmF + vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmD, ymmA + vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35 + ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L) + vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37 + ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N) + + vmovdqa ymmC, ymmF + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D + ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F + ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V) + + vmovdqa ymmB, ymmA + vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S) + vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D + ; 0H 0L 0P 0T 1H 
1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T) + + vmovdqa ymmG, ymmD + vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E + ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U) + vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F + ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V) + + vmovdqa ymmE, ymmA + vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E + ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U) + + vmovdqa ymmH, ymmB + vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F + ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F + ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V) + + vpxor ymmF, ymmF, ymmF + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmD, ymmB + vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + + vmovdqa ymmG, ymmE + vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U) + + vpunpcklbw ymmF, ymmF, ymmH + vpunpckhbw ymmH, ymmH, ymmH + vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE + ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, 
ymm5=B(13579BDFHJLNPRTV)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + vmovdqa ymm6, ymm1 + vpunpcklwd ymm1, ymm1, ymm3 + vpunpckhwd ymm6, ymm6, ymm3 + vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337) + vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + vmovdqa ymm6, ymm0 + vpunpcklwd ymm0, ymm0, ymm2 + vpunpckhwd ymm6, ymm6, ymm2 + vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337) + vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337) + + vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + vmovdqa ymm0, ymm5 ; ymm0=BO + vmovdqa ymm6, ymm4 ; ymm6=BE + + vmovdqa ymm4, ymm0 + vpunpcklwd ymm0, ymm0, ymm3 + vpunpckhwd ymm4, ymm4, ymm3 + vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250) + vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF] + + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm4, ymm4, ymm7 + vpaddd ymm0, ymm0, ymm3 + vpaddd ymm4, ymm4, ymm3 + vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH + vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO + + vmovdqa ymm4, ymm6 + vpunpcklwd ymm6, ymm6, ymm2 + vpunpckhwd ymm4, ymm4, ymm2 + vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250) + vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF] + + vpaddd ymm6, ymm6, YMMWORD [wk(0)] + vpaddd ymm4, ymm4, YMMWORD [wk(1)] + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm4, ymm4, ymm2 + vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH + vpackssdw ymm6, 
ymm6, ymm4 ; ymm6=YE + + vpsllw ymm0, ymm0, BYTE_BIT + vpor ymm6, ymm6, ymm0 ; ymm6=Y + vmovdqu YMMWORD [rdi], ymm6 ; Save Y + + sub rcx, byte SIZEOF_YMMWORD + add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr + add rdi, byte SIZEOF_YMMWORD ; outptr0 + cmp rcx, byte SIZEOF_YMMWORD + jae near .columnloop + test rcx, rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + vzeroupper + uncollect_args 5 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm new file mode 100644 index 0000000000..f1d399a63b --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm @@ -0,0 +1,363 @@ +; +; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +; r10d = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13d = JDIMENSION output_row +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d + test rcx, rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rdi + push rsi + push rcx ; col + + mov rsip, JSAMPROW [rsi] ; inptr + mov rdip, JSAMPROW [rdi] ; outptr0 + + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, byte [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, word [rsi+rcx] + shl rax, WORD_BIT + or rax, rdx +.column_ld4: + movd xmmA, eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + 
movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) 
+ + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH, xmmH + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 
07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + 
pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx, rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + uncollect_args 5 + mov rsp, rbp ; rsp 
<- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm new file mode 100644 index 0000000000..9ea6df946e --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm @@ -0,0 +1,583 @@ +; +; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2) +; +; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander. +; Copyright (C) 2015, Matthieu Darbois. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation for Huffman coding of one block. +; The following code is based on jchuff.c; see jchuff.c for more details. 
+ +%include "jsimdext.inc" + +struc working_state +.next_output_byte: resp 1 ; => next byte to write in buffer +.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer +.cur.put_buffer.simd resq 1 ; current bit accumulation buffer +.cur.free_bits resd 1 ; # of bits available in it +.cur.last_dc_val resd 4 ; last DC coef for each component +.cinfo: resp 1 ; dump_buffer needs access to this +endstruc + +struc c_derived_tbl +.ehufco: resd 256 ; code for each symbol +.ehufsi: resb 256 ; length of code for each symbol +; If no code has been allocated for a symbol S, ehufsi[S] contains 0 +endstruc + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_huff_encode_one_block) + +EXTN(jconst_huff_encode_one_block): + +jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007 + dd 0x000f, 0x001f, 0x003f, 0x007f + dd 0x00ff, 0x01ff, 0x03ff, 0x07ff + dd 0x0fff, 0x1fff, 0x3fff, 0x7fff + + alignz 32 + +times 1 << 14 db 15 +times 1 << 13 db 14 +times 1 << 12 db 13 +times 1 << 11 db 12 +times 1 << 10 db 11 +times 1 << 9 db 10 +times 1 << 8 db 9 +times 1 << 7 db 8 +times 1 << 6 db 7 +times 1 << 5 db 6 +times 1 << 4 db 5 +times 1 << 3 db 4 +times 1 << 2 db 3 +times 1 << 1 db 2 +times 1 << 0 db 1 +times 1 db 0 +jpeg_nbits_table: ; [x] = # of bits in x (x >= 0) or in ~x (x < 0) +times 1 db 0 +times 1 << 0 db 1 +times 1 << 1 db 2 +times 1 << 2 db 3 +times 1 << 3 db 4 +times 1 << 4 db 5 +times 1 << 5 db 6 +times 1 << 6 db 7 +times 1 << 7 db 8 +times 1 << 8 db 9 +times 1 << 9 db 10 +times 1 << 10 db 11 +times 1 << 11 db 12 +times 1 << 12 db 13 +times 1 << 13 db 14 +times 1 << 14 db 15 +times 1 << 15 db 16 + + alignz 32 + +%define NBITS(x) nbits_base + x +%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table) + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +; Shorthand used to describe SIMD operations: +; wN: xmmN treated as eight signed 16-bit values +; wN[i]: perform the same 
operation on all eight signed 16-bit values, i=0..7 +; bN: xmmN treated as 16 unsigned 8-bit values +; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15 +; Contents of SIMD registers are shown in memory order. + +; Fill the bit buffer to capacity with the leading bits from code, then output +; the bit buffer and put the remaining bits from code into the bit buffer. +; +; Usage: +; code - contains the bits to shift into the bit buffer (LSB-aligned) +; %1 - the label to which to jump when the macro completes +; %2 (optional) - extra instructions to execute after nbits has been set +; +; Upon completion, free_bits will be set to the number of remaining bits from +; code, and put_buffer will contain those remaining bits. temp and code will +; be clobbered, as will xmm0. xmm1 must contain 0xFF in every byte. +; +; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE() +; macro in jchuff.c. + +%macro EMIT_QWORD 1-2 + add nbitsb, free_bitsb ; nbits += free_bits; + neg free_bitsb ; free_bits = -free_bits; + mov tempd, code ; temp = code; + shl put_buffer, nbitsb ; put_buffer <<= nbits; + mov nbitsb, free_bitsb ; nbits = free_bits; + neg free_bitsb ; free_bits = -free_bits; + shr tempd, nbitsb ; temp >>= nbits; + or tempq, put_buffer ; temp |= put_buffer; + movq xmm0, tempq ; xmm0.u64 = { temp, 0 }; + bswap tempq ; temp = bswap64(temp); (64-bit byte swap) + mov put_buffer, codeq ; put_buffer = code; + pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0); + %2 + pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i); + mov qword [buffer], tempq ; memcpy(buffer, &temp, 8); + ; (speculative; will be overwritten if + ; code contains any 0xFF bytes) + add free_bitsb, 64 ; free_bits += 64; + add bufferp, 8 ; buffer += 8; + test code, code ; if (code == 0) /* No 0xFF bytes */ + jz %1 ; return; + ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8 + ; bytes in the qword. 
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer-7], 0 ; buffer[-7] = 0; + sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], temph ; buffer[0] = temp[1]; + cmp temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + shr tempq, 16 ; temp >>= 16; + mov byte [buffer], tempb ; buffer[0] = temp[0]; + cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], temph ; buffer[0] = temp[1]; + cmp temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + shr tempq, 16 ; temp >>= 16; + mov byte [buffer], tempb ; buffer[0] = temp[0]; + cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], temph ; buffer[0] = temp[1]; + cmp temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + shr tempd, 16 ; temp >>= 16; + mov byte [buffer], tempb ; buffer[0] = temp[0]; + cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); + mov byte [buffer], temph ; buffer[0] = temp[1]; + cmp temph, 0xFF ; Set CF if temp[1] < 0xFF + mov byte [buffer+1], 0 ; buffer[1] = 0; + sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); + jmp %1 ; return; +%endmacro + +; +; Encode a single block's worth of coefficients. 
+; +; GLOBAL(JOCTET *) +; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer, +; JCOEFPTR block, int last_dc_val, +; c_derived_tbl *dctbl, c_derived_tbl *actbl) +; +; NOTES: +; When shuffling data, we try to avoid pinsrw as much as possible, since it is +; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on +; modern CPUs, so chains of pinsrw instructions (even with different outputs) +; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and +; requires 2 µops (with memory operand) on Intel. In either case, only one +; pinsrw instruction can be decoded per cycle (and nothing else if they are +; back-to-back), so out-of-order execution cannot be used to work around long +; pinsrw chains (though for Sandy Bridge and later, this may be less of a +; problem if the code runs from the µop cache.) +; +; We use tzcnt instead of bsf without checking for support. The instruction is +; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to +; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is +; an input dependency (although the behavior is not formally defined, Intel +; CPUs usually leave the destination unmodified if the source is zero.) This +; can prevent out-of-order execution, so we clear the destination before +; invoking tzcnt. 
+; +; Initial register allocation +; rax - buffer +; rbx - temp +; rcx - nbits +; rdx - block --> free_bits +; rsi - nbits_base +; rdi - t +; rbp - code +; r8 - dctbl --> code_temp +; r9 - actbl +; r10 - state +; r11 - index +; r12 - put_buffer + +%define buffer rax +%ifdef WIN64 +%define bufferp rax +%else +%define bufferp raxp +%endif +%define tempq rbx +%define tempd ebx +%define tempb bl +%define temph bh +%define nbitsq rcx +%define nbits ecx +%define nbitsb cl +%define block rdx +%define nbits_base rsi +%define t rdi +%define td edi +%define codeq rbp +%define code ebp +%define dctbl r8 +%define actbl r9 +%define state r10 +%define index r11 +%define indexd r11d +%define put_buffer r12 +%define put_bufferd r12d + +; Step 1: Re-arrange input data according to jpeg_natural_order +; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10 +; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05 +; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34 +; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28 +; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36 +; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51 +; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46 +; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63 + + align 32 + GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) + +EXTN(jsimd_huff_encode_one_block_sse2): + +%ifdef WIN64 + +; rcx = working_state *state +; rdx = JOCTET *buffer +; r8 = JCOEFPTR block +; r9 = int last_dc_val +; [rax+48] = c_derived_tbl *dctbl +; [rax+56] = c_derived_tbl *actbl + + ;X: X = code stream + mov buffer, rdx + mov block, r8 + movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 + push rbx + push rbp + movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 + push rsi + push rdi + push r12 + movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 + mov state, rcx + movsx code, word [block] ;Z: code = block[0]; + pxor xmm4, xmm4 ;A: w4[i] = 0; + sub code, r9d ;Z: code -= last_dc_val; + mov dctbl, POINTER 
[rsp+6*8+4*8] + mov actbl, POINTER [rsp+6*8+5*8] + punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 + lea nbits_base, [rel jpeg_nbits_table] + add rsp, -DCTSIZE2 * SIZEOF_WORD + mov t, rsp + +%else + +; rdi = working_state *state +; rsi = JOCTET *buffer +; rdx = JCOEFPTR block +; rcx = int last_dc_val +; r8 = c_derived_tbl *dctbl +; r9 = c_derived_tbl *actbl + + ;X: X = code stream + movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 + push rbx + push rbp + movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 + push r12 + mov state, rdi + mov buffer, rsi + movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 + movsx codeq, word [block] ;Z: code = block[0]; + lea nbits_base, [rel jpeg_nbits_table] + pxor xmm4, xmm4 ;A: w4[i] = 0; + sub codeq, rcx ;Z: code -= last_dc_val; + punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 + lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_ + +%endif + + pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11 + pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11 + punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15 + punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13 + pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17 + ;A: (Row 0, offset 1) + pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? 
-1 : 0); + paddw xmm0, xmm4 ;A: w0[i] += w4[i]; + movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i]; + + movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- -- + pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- -- + pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12 + movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55 + movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12 + punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51 + pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12 + pxor xmm4, xmm4 ;A: w4[i] = 0; + psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- -- + pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0); + pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12 + ; (Row 1, offset 1) + pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0); + paddw xmm1, xmm4 ;B: w1[i] += w4[i]; + movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i]; + pxor xmm4, xmm4 ;B: w4[i] = 0; + pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0); + + packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i] + ; w/ signed saturation + + pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- -- + pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- -- + pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 -- + pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35 + ; (Row 3, offset 1) + pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0); + paddw xmm3, xmm4 ;D: w3[i] += w4[i]; + movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i]; + pxor xmm4, xmm4 ;D: w4[i] = 0; + pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0); + + pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51 + cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000, + ;Z: i.e. 
if code is positive + pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51 + pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51 + adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0); + pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51 + pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51 + movsxd codeq, code ;Z: sign extend code + pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27 + ; (Row 2, offset 1) + pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0); + paddw xmm2, xmm4 ;C: w2[i] += w4[i]; + movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i]; + pxor xmm4, xmm4 ;C: w4[i] = 0; + pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0); + + packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i] + ; w/ signed saturation + + movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code); + movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55 + pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i); + pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i); + movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63 + punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63 + shl tempd, 16 ;Z: temp <<= 16; + psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 -- + pxor xmm2, xmm2 ;H: w2[i] = 0; + or put_bufferd, tempd ;Z: put_buffer |= temp; + pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 -- + movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- -- + unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59 + pxor xmm0, xmm0 ;H: w0[i] = 0; + pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 -- + ; (Row 7, offset 1) + pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? 
-1 : 0); + paddw xmm3, xmm2 ;H: w3[i] += w2[i]; + movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i]; + movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- -- + pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0); + punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47 + mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4] + ;Z: temp = dctbl->ehufco[nbits]; + movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47 + psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 -- + shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59 + and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1; + pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 -- + pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58 + shl tempq, nbitsb ;Z: temp <<= nbits; + pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 -- + pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58 + pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 -- + or code, tempd ;Z: code |= temp; + movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58 + pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 -- + pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58 + pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53 + ; (Row 6, offset 1) + pxor xmm2, xmm2 ;G: w2[i] = 0; + pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0); + pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58 + paddw xmm4, xmm0 ;G: w4[i] += w0[i]; + movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i]; + pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58 + ; (Row 5, offset 1) + pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? 
-1 : 0); + pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59 + + packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i] + ; w/ signed saturation + + pxor xmm0, xmm0 ;F: w0[i] = 0; + pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59 + pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0); + pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i); + pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59 + paddw xmm1, xmm2 ;F: w1[i] += w2[i]; + movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i]; + pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29 + ; (Row 4, offset 1) +%undef block +%define free_bitsq rdx +%define free_bitsd edx +%define free_bitsb dl + pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0); + shl tempq, 48 ;Z: temp <<= 48; + pxor xmm2, xmm2 ;E: w2[i] = 0; + pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0); + paddw xmm5, xmm0 ;E: w5[i] += w0[i]; + or tempq, put_buffer ;Z: temp |= put_buffer; + movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i]; + lea t, [dword t - 2] ;Z: t = &t[-1]; + pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? 
-1 : 0); + + packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i] + ; w/ signed saturation + + add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq] + ;Z: nbits += dctbl->ehufsi[nbits]; +%undef dctbl +%define code_temp r8d + pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i); + mov free_bitsd, [state+working_state.cur.free_bits] + ;Z: free_bits = state->cur.free_bits; + pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF; + shl index, 32 ;Z: index <<= 32; + mov put_buffer, [state+working_state.cur.put_buffer.simd] + ;Z: put_buffer = state->cur.put_buffer.simd; + or index, tempq ;Z: index |= temp; + not index ;Z: index = ~index; + sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0) + jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE; + align 16 +.EMIT_CODE: ;Z: .EMIT_CODE: + EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.BRLOOP: ; do { + lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16; + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0] + ; nbits = actbl->ehufsi[0xf0]; + mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4] + ; code = actbl->ehufco[0xf0]; + sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) + jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE; + shl put_buffer, nbitsb ; put_buffer <<= nbits; + mov nbits, code_temp ; nbits = code_temp; + or put_buffer, codeq ; put_buffer |= code; + cmp nbits, 16 ; if (nbits <= 16) + jle .ERLOOP ; break; + jmp .BRLOOP ; } while (1); + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 + times 5 nop +.ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE: + shl put_buffer, nbitsb ; put_buffer <<= nbits; + or put_buffer, codeq ; put_buffer |= code; +.BLOOP_COND: ; .BLOOP_COND: + test index, index ; if (index != 0) + jz .ELOOP ; { +.BLOOP: ; do { + xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */ + tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index + inc nbits ; ++nbits; + lea t, [t + nbitsq * 
2] ; t = &t[nbits]; + shr index, nbitsb ; index >>= nbits; +.EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END: + cmp nbits, 16 ; if (nbits > 16) + jg .BRLOOP ; goto .BRLOOP; +.ERLOOP: ; .ERLOOP: + movsx codeq, word [t] ; code = *t; + lea tempd, [nbitsq * 2] ; temp = nbits * 2; + movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code); + lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits; + mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4] + ; code_temp = actbl->ehufco[temp-16]; + shl code_temp, nbitsb ; code_temp <<= nbits; + and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1; + add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)] + ; nbits += actbl->ehufsi[temp-16]; + or code, code_temp ; code |= code_temp; + sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) + jle .EMIT_CODE ; goto .EMIT_CODE; + shl put_buffer, nbitsb ; put_buffer <<= nbits; + or put_buffer, codeq ; put_buffer |= code; + test index, index + jnz .BLOOP ; } while (index != 0); +.ELOOP: ; } /* index != 0 */ + sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]); +%ifdef WIN64 + cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62) +%else + cmp td, -2 * SIZEOF_WORD ; if (t != -2) +%endif + je .EFN ; { + movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0] + ; nbits = actbl->ehufsi[0]; + mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0]; + sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) + jg .EFN_SKIP_EMIT_CODE ; { + EMIT_QWORD .EFN ; insert code, flush buffer + align 16 +.EFN_SKIP_EMIT_CODE: ; } else { + shl put_buffer, nbitsb ; put_buffer <<= nbits; + or put_buffer, codeq ; put_buffer |= code; +.EFN: ; } } + mov [state + working_state.cur.put_buffer.simd], put_buffer + ; state->cur.put_buffer.simd = put_buffer; + mov byte [state + working_state.cur.free_bits], free_bitsb + ; state->cur.free_bits = free_bits; +%ifdef WIN64 + sub rsp, -DCTSIZE2 * SIZEOF_WORD + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx +%else + pop r12 + pop rbp + pop 
rbx +%endif + ret + +; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + align 16 +.EMIT_BRLOOP_CODE: + EMIT_QWORD .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp } + ; insert code, flush buffer, + ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm new file mode 100644 index 0000000000..01b5c0235f --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm @@ -0,0 +1,639 @@ +; +; jcphuff-sse2.asm - prepare data for progressive Huffman encoding +; (64-bit SSE2) +; +; Copyright (C) 2016, 2018, Matthieu Darbois +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation of data preparation for progressive +; Huffman encoding. See jcphuff.c for more details. 
+ +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +; -------------------------------------------------------------------------- +; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and +; jsimd_encode_mcu_AC_refine_prepare_sse2(): gather words BLOCK[LUT[i]] into X0/X1 + +%macro LOAD16 0 + pxor N0, N0 ; N0 = 0 + pxor N1, N1 ; N1 = 0 + + mov T0d, INT [LUT + 0*SIZEOF_INT] + mov T1d, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0d, INT [LUT + 1*SIZEOF_INT] + mov T1d, INT [LUT + 9*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + pinsrw X1, word [BLOCK + T1 * 2], 1 + + mov T0d, INT [LUT + 2*SIZEOF_INT] + mov T1d, INT [LUT + 10*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + pinsrw X1, word [BLOCK + T1 * 2], 2 + + mov T0d, INT [LUT + 3*SIZEOF_INT] + mov T1d, INT [LUT + 11*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + pinsrw X1, word [BLOCK + T1 * 2], 3 + + mov T0d, INT [LUT + 4*SIZEOF_INT] + mov T1d, INT [LUT + 12*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + pinsrw X1, word [BLOCK + T1 * 2], 4 + + mov T0d, INT [LUT + 5*SIZEOF_INT] + mov T1d, INT [LUT + 13*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + pinsrw X1, word [BLOCK + T1 * 2], 5 + + mov T0d, INT [LUT + 6*SIZEOF_INT] + mov T1d, INT [LUT + 14*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + pinsrw X1, word [BLOCK + T1 * 2], 6 + + mov T0d, INT [LUT + 7*SIZEOF_INT] + mov T1d, INT [LUT + 15*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 + pinsrw X1, word [BLOCK + T1 * 2], 7 +%endmacro + +%macro LOAD15 0 + pxor N0, N0 ; N0 = 0 + pxor N1, N1 ; N1 = 0 + pxor X1, X1 ; X1 = 0 (lanes not loaded below stay 0) + + mov T0d, INT [LUT + 0*SIZEOF_INT] + mov T1d, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0d, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0d, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0d, INT [LUT + 3*SIZEOF_INT] + 
pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0d, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0d, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0d, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0d, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 + + cmp LENEND, 2 ; X1 lane k is loaded only if LENEND > k + jl %%.ELOAD15 + mov T1d, INT [LUT + 9*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD15 + mov T1d, INT [LUT + 10*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 2 + + cmp LENEND, 4 + jl %%.ELOAD15 + mov T1d, INT [LUT + 11*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD15 + mov T1d, INT [LUT + 12*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD15 + mov T1d, INT [LUT + 13*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD15 + mov T1d, INT [LUT + 14*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 6 +%%.ELOAD15: +%endmacro + +%macro LOAD8 0 + pxor N0, N0 ; N0 = 0 + + mov T0d, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + + mov T0d, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0d, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0d, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0d, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0d, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0d, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0d, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 +%endmacro + +%macro LOAD7 0 + pxor N0, N0 ; N0 = 0 + pxor X0, X0 ; X0 = 0 (lanes not loaded below stay 0) + + mov T1d, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 0 + + cmp LENEND, 2 ; X0 lane k is loaded only if LENEND > k + jl %%.ELOAD7 + mov T1d, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD7 + mov T1d, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 
2], 2 + + cmp LENEND, 4 + jl %%.ELOAD7 + mov T1d, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD7 + mov T1d, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD7 + mov T1d, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD7 + mov T1d, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 6 +%%.ELOAD7: +%endmacro + +%macro REDUCE0 0 + movdqa xmm0, XMMWORD [VALUES + ( 0*2)] + movdqa xmm1, XMMWORD [VALUES + ( 8*2)] + movdqa xmm2, XMMWORD [VALUES + (16*2)] + movdqa xmm3, XMMWORD [VALUES + (24*2)] + movdqa xmm4, XMMWORD [VALUES + (32*2)] + movdqa xmm5, XMMWORD [VALUES + (40*2)] + movdqa xmm6, XMMWORD [VALUES + (48*2)] + movdqa xmm7, XMMWORD [VALUES + (56*2)] + + pcmpeqw xmm0, ZERO ; wN[i] = (wN[i] == 0 ? -1 : 0), N=0..7 + pcmpeqw xmm1, ZERO + pcmpeqw xmm2, ZERO + pcmpeqw xmm3, ZERO + pcmpeqw xmm4, ZERO + pcmpeqw xmm5, ZERO + pcmpeqw xmm6, ZERO + pcmpeqw xmm7, ZERO + + packsswb xmm0, xmm1 ; narrow each pair of word masks to 16 byte masks + packsswb xmm2, xmm3 + packsswb xmm4, xmm5 + packsswb xmm6, xmm7 + + pmovmskb eax, xmm0 ; 16 mask bits per register + pmovmskb ecx, xmm2 + pmovmskb edx, xmm4 + pmovmskb esi, xmm6 + + shl rcx, 16 + shl rdx, 32 + shl rsi, 48 + + or rax, rcx + or rdx, rsi + or rax, rdx ; rax bit i = (VALUES[i] == 0) + + not rax ; rax bit i = (VALUES[i] != 0) + + mov MMWORD [r15], rax ; store the 64-bit mask at [r15] +%endmacro + +; +; Prepare data for jsimd_encode_mcu_AC_first(). 
+; +; GLOBAL(void) +; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *values, +; size_t *zerobits) +; +; r10 = const JCOEF *block +; r11 = const int *jpeg_natural_order_start +; r12 = int Sl +; r13 = int Al +; r14 = JCOEF *values +; r15 = size_t *zerobits + +%define ZERO xmm9 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define LUT r11 +%define T0 rcx +%define T0d ecx +%define T1 rdx +%define T1d edx +%define BLOCK r10 +%define VALUES r14 +%define LEN r12d +%define LENEND r13d + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [rbp - 16] + collect_args 6 + + movdqa XMMWORD [rbp - 16], ZERO + + movd AL, r13d + pxor ZERO, ZERO + mov K, LEN + mov LENEND, LEN + and K, -16 + and LENEND, 7 + shr K, 4 + jz .ELOOP16 +.BLOOP16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + dec K + jnz .BLOOP16 + test LEN, 15 + je .PADDING +.ELOOP16: + test LEN, 8 + jz .TRY7 + test LEN, 7 + jz .TRY8 + + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + jmp .PADDING +.TRY8: + 
LOAD8 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 + jmp .PADDING +.TRY7: + LOAD7 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 +.PADDING: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDING + align 16 +.ZEROLOOP: + movdqa XMMWORD [VALUES + 0], ZERO + add VALUES, 8*2 + inc K + jnz .ZEROLOOP +.EPADDING: + sub VALUES, DCTSIZE2*2 + + REDUCE0 + + movdqa ZERO, XMMWORD [rbp - 16] + uncollect_args 6 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +%undef ZERO +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef LUT +%undef T0 +%undef T0d +%undef T1 +%undef T1d +%undef BLOCK +%undef VALUES +%undef LEN +%undef LENEND + +; +; Prepare data for jsimd_encode_mcu_AC_refine(). 
+; +; GLOBAL(int) +; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *absvalues, +; size_t *bits) +; +; r10 = const JCOEF *block +; r11 = const int *jpeg_natural_order_start +; r12 = int Sl +; r13 = int Al +; r14 = JCOEF *values +; r15 = size_t *bits + +%define ZERO xmm9 +%define ONE xmm5 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define KK r9d +%define EOB r8d +%define SIGN rdi +%define LUT r11 +%define T0 rcx +%define T0d ecx +%define T1 rdx +%define T1d edx +%define BLOCK r10 +%define VALUES r14 +%define LEN r12d +%define LENEND r13d + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [rbp - 16] + collect_args 6 + + movdqa XMMWORD [rbp - 16], ZERO + + xor SIGN, SIGN + xor EOB, EOB + xor KK, KK + movd AL, r13d + pxor ZERO, ZERO + pcmpeqw ONE, ONE + psrlw ONE, 15 + mov K, LEN + mov LENEND, LEN + and K, -16 + and LENEND, 7 + shr K, 4 + jz .ELOOPR16 +.BLOOPR16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1); + shr SIGN, 16 ; make room for sizebits + shl T0, 48 + or SIGN, T0 + bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER16 ; if (idx) { + mov EOB, KK + add EOB, T1d ; EOB = k + idx; +.CONTINUER16: + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + add KK, 16 + dec K + jnz .BLOOPR16 + test LEN, 15 + je .PADDINGR +.ELOOPR16: 
+ test LEN, 8 + jz .TRYR7 + test LEN, 7 + jz .TRYR8 + + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1); + shr SIGN, 16 ; make room for sizebits + shl T0, 48 + or SIGN, T0 + bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER15 ; if (idx) { + mov EOB, KK + add EOB, T1d ; EOB = k + idx; +.CONTINUER15: + add VALUES, 16*2 + jmp .PADDINGR +.TRYR8: + LOAD8 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1); + shr SIGN, 8 ; make room for sizebits + shl T0, 56 + or SIGN, T0 + bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER8 ; if (idx) { + mov EOB, KK + add EOB, T1d ; EOB = k + idx; +.CONTINUER8: + add VALUES, 8*2 + jmp .PADDINGR +.TRYR7: + LOAD7 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1); + shr SIGN, 8 ; make room for sizebits + shl T0, 56 + or SIGN, T0 + bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER7 ; if (idx) { + mov EOB, KK + add EOB, T1d ; EOB = k + idx; +.CONTINUER7: + add VALUES, 8*2 +.PADDINGR: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDINGR + align 16 +.ZEROLOOPR: + movdqa XMMWORD [VALUES + 0], ZERO + shr SIGN, 8 + add VALUES, 8*2 + inc K + jnz .ZEROLOOPR +.EPADDINGR: + 
not SIGN + sub VALUES, DCTSIZE2*2 + mov MMWORD [r15+SIZEOF_MMWORD], SIGN + + REDUCE0 + + mov eax, EOB + movdqa ZERO, XMMWORD [rbp - 16] + uncollect_args 6 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +%undef ZERO +%undef ONE +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef KK +%undef EOB +%undef SIGN +%undef LUT +%undef T0 +%undef T0d +%undef T1 +%undef T1d +%undef BLOCK +%undef VALUES +%undef LEN +%undef LENEND + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm new file mode 100644 index 0000000000..b32527aebe --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm @@ -0,0 +1,367 @@ +; +; jcsample.asm - downsampling (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. 
+; +; GLOBAL(void) +; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +; r10d = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_in_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2) + +EXTN(jsimd_h2v1_downsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 6 + + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end + + mov rax, r11 + test rax, rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdip, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v1_downsample + + mov eax, r12d ; rowctr + test eax, eax + jle near .return + + mov rdx, 0x00010000 ; bias pattern + vmovd xmm7, edx + vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7} + vpcmpeqw ymm6, ymm6, ymm6 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rsip, JSAMPROW [rsi] ; inptr + mov rdip, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_YMMWORD + jae short .columnloop + +.columnloop_r24: + ; rcx can possibly be 8, 16, 24 + cmp rcx, 24 + jne .columnloop_r16 + vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD] + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r16: + cmp rcx, 16 + jne .columnloop_r8 + 
vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vpxor ymm1, ymm1, ymm1 + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r8: + vmovdqu xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD] + vpxor ymm1, ymm1, ymm1 + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop: + vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD] + +.downsample: + vpsrlw ymm2, ymm0, BYTE_BIT + vpand ymm0, ymm0, ymm6 + vpsrlw ymm3, ymm1, BYTE_BIT + vpand ymm1, ymm1, ymm6 + + vpaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm3 + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm1, ymm1, ymm7 + vpsrlw ymm0, ymm0, 1 + vpsrlw ymm1, ymm1, 1 + + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 + + sub rcx, byte SIZEOF_YMMWORD ; outcol + add rsi, byte 2*SIZEOF_YMMWORD ; inptr + add rdi, byte 1*SIZEOF_YMMWORD ; outptr + cmp rcx, byte SIZEOF_YMMWORD + jae short .columnloop + test rcx, rcx + jnz near .columnloop_r24 + + pop rsi + pop rdi + pop rcx + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + vzeroupper + uncollect_args 6 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. 
+; +; GLOBAL(void) +; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +; r10d = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_in_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2) + +EXTN(jsimd_h2v2_downsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 6 + + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end + + mov rax, r11 + test rax, rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdip, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v2_downsample + + mov eax, r12d ; rowctr + test rax, rax + jle near .return + + mov rdx, 0x00020001 ; bias pattern + vmovd xmm7, edx + vpcmpeqw ymm6, ymm6, ymm6 + vpshufd xmm7, xmm7, 0x00 ; ymm7={1, 2, 1, 2, 1, 2, 1, 2} + vperm2i128 ymm7, ymm7, ymm7, 0 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 + mov rdip, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_YMMWORD + jae short .columnloop + +.columnloop_r24: + cmp rcx, 24 + jne .columnloop_r16 + vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD] + vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD] 
+ mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r16: + cmp rcx, 16 + jne .columnloop_r8 + vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vpxor ymm2, ymm2, ymm2 + vpxor ymm3, ymm3, ymm3 + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r8: + vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + vpxor ymm2, ymm2, ymm2 + vpxor ymm3, ymm3, ymm3 + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop: + vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD] + vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD] + +.downsample: + vpand ymm4, ymm0, ymm6 + vpsrlw ymm0, ymm0, BYTE_BIT + vpand ymm5, ymm1, ymm6 + vpsrlw ymm1, ymm1, BYTE_BIT + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + + vpand ymm4, ymm2, ymm6 + vpsrlw ymm2, ymm2, BYTE_BIT + vpand ymm5, ymm3, ymm6 + vpsrlw ymm3, ymm3, BYTE_BIT + vpaddw ymm2, ymm2, ymm4 + vpaddw ymm3, ymm3, ymm5 + + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm7 + vpsrlw ymm0, ymm0, 2 + vpsrlw ymm2, ymm2, 2 + + vpackuswb ymm0, ymm0, ymm2 + vpermq ymm0, ymm0, 0xd8 + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 + + sub rcx, byte SIZEOF_YMMWORD ; outcol + add rdx, byte 2*SIZEOF_YMMWORD ; inptr0 + add rsi, byte 2*SIZEOF_YMMWORD ; inptr1 + add rdi, byte 1*SIZEOF_YMMWORD ; outptr + cmp rcx, byte SIZEOF_YMMWORD + jae near .columnloop + test rcx, rcx + jnz near .columnloop_r24 + + pop rsi + pop rdi + pop rcx + + add rsi, byte 2*SIZEOF_JSAMPROW ; input_data + add rdi, byte 1*SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + vzeroupper + uncollect_args 6 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm new file mode 100644 index 0000000000..2fcfe4567a --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm @@ -0,0 +1,330 @@ +; +; jcsample.asm - downsampling (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. 
+; +; GLOBAL(void) +; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +; r10d = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_in_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2) + +EXTN(jsimd_h2v1_downsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 6 + + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end + + mov rax, r11 + test rax, rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdip, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v1_downsample + + mov eax, r12d ; rowctr + test eax, eax + jle near .return + + mov rdx, 0x00010000 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rsip, JSAMPROW [rsi] ; inptr + mov rdip, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + +.columnloop_r8: + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm1, xmm1 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample + +.columnloop: + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + pand xmm0, xmm6 + psrlw xmm2, BYTE_BIT + pand xmm1, xmm6 + psrlw 
xmm3, BYTE_BIT + + paddw xmm0, xmm2 + paddw xmm1, xmm3 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + psrlw xmm0, 1 + psrlw xmm1, 1 + + packuswb xmm0, xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + test rcx, rcx + jnz short .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + uncollect_args 6 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +; r10d = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_in_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2) + +EXTN(jsimd_h2v2_downsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 6 + + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end + + mov rax, r11 + test rax, rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdip, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- 
h2v2_downsample + + mov eax, r12d ; rowctr + test rax, rax + jle near .return + + mov rdx, 0x00020001 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 + mov rdip, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + +.columnloop_r8: + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm2, xmm2 + pxor xmm3, xmm3 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample + +.columnloop: + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + pand xmm0, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + pand xmm2, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm3, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + paddw xmm0, xmm1 + paddw xmm2, xmm3 + paddw xmm0, xmm7 + paddw xmm2, xmm7 + psrlw xmm0, 2 + psrlw xmm2, 2 + + packuswb xmm0, xmm2 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rdx, byte 2*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 2*SIZEOF_XMMWORD ; inptr1 + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx, rcx + jnz near .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte 2*SIZEOF_JSAMPROW ; input_data + add rdi, byte 1*SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + uncollect_args 6 + pop rbp + ret + +; For some reason, the OS X 
linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm new file mode 100644 index 0000000000..2370fda642 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm @@ -0,0 +1,496 @@ +; +; jdcolext.asm - colorspace conversion (64-bit AVX2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +; r10d = JDIMENSION out_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION input_row +; r13 = JSAMPARRAY output_buf +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2) + +EXTN(jsimd_ycc_rgb_convert_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d ; num_cols + test rcx, rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rdi, r13 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rax + push rdi + push rdx + push rbx + push rsi + push rcx ; col + + mov rsip, JSAMPROW [rsi] ; inptr0 + mov rbxp, JSAMPROW [rbx] ; inptr1 + mov rdxp, JSAMPROW [rdx] ; inptr2 + mov rdip, JSAMPROW [rdi] ; outptr +.columnloop: + + vmovdqu ymm5, YMMWORD [rbx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) + vmovdqu ymm1, YMMWORD [rdx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm0, ymm0, ymm0 + vpcmpeqw ymm7, ymm7, ymm7 + vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..} + vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE + vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO + vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE + vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO + + vpaddw ymm2, 
ymm4, ymm7 + vpaddw ymm3, ymm5, ymm7 + vpaddw ymm6, ymm0, ymm7 + vpaddw ymm7, ymm1, ymm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE + vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO + vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE + vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO + + vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbE * -FIX(0.22800)) + vpmulhw ymm5, ymm5, [rel PW_MF0228] ; ymm5=(2*CbO * -FIX(0.22800)) + vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrE * FIX(0.40200)) + vpmulhw ymm1, ymm1, [rel PW_F0402] ; ymm1=(2*CrO * FIX(0.40200)) + + vpaddw ymm4, ymm4, [rel PW_ONE] + vpaddw ymm5, ymm5, [rel PW_ONE] + vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800)) + vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800)) + vpaddw ymm0, ymm0, [rel PW_ONE] + vpaddw ymm1, ymm1, [rel PW_ONE] + vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200)) + vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200)) + + vpaddw ymm4, ymm4, ymm2 + vpaddw ymm5, ymm5, ymm3 + vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E + vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O + vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E + vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O + + vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E + vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O + + vpunpckhwd ymm4, ymm2, ymm6 + vpunpcklwd ymm2, ymm2, ymm6 + vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285] + vpmaddwd ymm4, ymm4, [rel PW_MF0344_F0285] + vpunpckhwd ymm5, ymm3, ymm7 + vpunpcklwd ymm3, ymm3, ymm7 + vpmaddwd ymm3, ymm3, [rel PW_MF0344_F0285] + vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285] + + vpaddd ymm2, ymm2, [rel PD_ONEHALF] + vpaddd ymm4, ymm4, [rel PD_ONEHALF] + vpsrad ymm2, ymm2, SCALEBITS + vpsrad ymm4, ymm4, SCALEBITS + vpaddd ymm3, ymm3, [rel PD_ONEHALF] + vpaddd 
ymm5, ymm5, [rel PD_ONEHALF] + vpsrad ymm3, ymm3, SCALEBITS + vpsrad ymm5, ymm5, SCALEBITS + + vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + vmovdqu ymm5, YMMWORD [rsi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm4, ymm4, ymm4 + vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..} + vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE + vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO + + vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU) + vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV) + vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********) + vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********) + + vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU) + vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV) + vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********) + vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********) + + vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU) + vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV) + vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********) + vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(** ** ** ** ** ** ** ** ** ** ** 
** ** ** ** ** ** **) + ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F + ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V) + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F + ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V) + + vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G + ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --) + vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F + ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07 + ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N) + + vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H + ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --) + + vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H + ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --) + vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G + ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --) + vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18 + ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O) + + vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H + ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --) + vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29 + ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P) + + vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03 + ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J) + vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14 + ; 0G 1G 2G 0H 1H 2H 0I 1I 
0I 1I 2I 0J 1J 2J 0K 1K) + vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29 + ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P) + vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07 + ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N) + + vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B + ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R) + vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C + ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S) + vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H + ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --) + vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V) + + vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test rdi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], 
ymmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_YMMWORD ; inptr0 + add rbx, byte SIZEOF_YMMWORD ; inptr1 + add rdx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + +.column_st64: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_YMMWORD + jb short .column_st32 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmF + sub rcx, byte 2*SIZEOF_YMMWORD + jmp short .column_st31 +.column_st32: + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st31 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + add rdi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub rcx, byte SIZEOF_YMMWORD + jmp short .column_st31 +.column_st31: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + vmovq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + vpsrldq xmmA, xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + vmovd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + vpsrldq xmmA, xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. 
+ vmovd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov word [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + mov byte [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%else + vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%endif + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **) + ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U) + vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V) + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V) + + vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E + ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36 + ; 0G 1G 
2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M) + vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F + ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V) + vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37 + ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N) + + vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + + vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test rdi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC + vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC + 
vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH +.out0: + add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_YMMWORD ; inptr0 + add rbx, byte SIZEOF_YMMWORD ; inptr1 + add rdx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + +.column_st64: + cmp rcx, byte SIZEOF_YMMWORD/2 + jb short .column_st32 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmC + vmovdqa ymmD, ymmH + sub rcx, byte SIZEOF_YMMWORD/2 +.column_st32: + cmp rcx, byte SIZEOF_YMMWORD/4 + jb short .column_st16 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + add rdi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub rcx, byte SIZEOF_YMMWORD/4 +.column_st16: + cmp rcx, byte SIZEOF_YMMWORD/8 + jb short .column_st15 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + vperm2i128 ymmA, ymmA, ymmA, 1 + add rdi, byte SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD/8 +.column_st15: + ; Store two pixels (8 bytes) of ymmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_YMMWORD/16 + jb short .column_st7 + vmovq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_YMMWORD/16*4 + sub rcx, byte SIZEOF_YMMWORD/16 + vpsrldq xmmA, SIZEOF_YMMWORD/16*4 +.column_st7: + ; Store one pixel (4 bytes) of ymmA to the output when it has enough + ; space. 
+ test rcx, rcx + jz short .nextrow + vmovd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.nextrow: + pop rcx + pop rsi + pop rbx + pop rdx + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + add rdi, byte SIZEOF_JSAMPROW ; output_buf + dec rax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop rbx + vzeroupper + uncollect_args 5 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm new file mode 100644 index 0000000000..e07c8d7518 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm @@ -0,0 +1,439 @@ +; +; jdcolext.asm - colorspace conversion (64-bit SSE2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. 
+; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +; r10d = JDIMENSION out_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION input_row +; r13 = JSAMPARRAY output_buf +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2) + +EXTN(jsimd_ycc_rgb_convert_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d ; num_cols + test rcx, rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rdi, r13 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rax + push rdi + push rdx + push rbx + push rsi + push rcx ; col + + mov rsip, JSAMPROW [rsi] ; inptr0 + mov rbxp, JSAMPROW [rbx] ; inptr1 + mov rdxp, JSAMPROW [rdx] ; inptr2 + mov rdip, JSAMPROW [rdi] ; outptr +.columnloop: + + movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + pcmpeqw xmm7, xmm7 + psrlw xmm4, BYTE_BIT + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4, xmm7 + paddw xmm5, xmm7 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + + ; (Original) + ; R = Y + 1.40200 * 
Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2, xmm4 ; xmm2=CbE + movdqa xmm3, xmm5 ; xmm3=CbO + paddw xmm4, xmm4 ; xmm4=2*CbE + paddw xmm5, xmm5 ; xmm5=2*CbO + movdqa xmm6, xmm0 ; xmm6=CrE + movdqa xmm7, xmm1 ; xmm7=CrO + paddw xmm0, xmm0 ; xmm0=2*CrE + paddw xmm1, xmm1 ; xmm1=2*CrO + + pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4, [rel PW_ONE] + paddw xmm5, [rel PW_ONE] + psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0, [rel PW_ONE] + paddw xmm1, [rel PW_ONE] + psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4, xmm2 + paddw xmm5, xmm3 + paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + punpcklwd xmm2, xmm6 + punpckhwd xmm4, xmm6 + pmaddwd xmm2, [rel PW_MF0344_F0285] + pmaddwd xmm4, [rel PW_MF0344_F0285] + punpcklwd xmm3, xmm7 + punpckhwd xmm5, xmm7 + pmaddwd xmm3, [rel PW_MF0344_F0285] + pmaddwd xmm5, [rel PW_MF0344_F0285] + + paddd xmm2, [rel PD_ONEHALF] + paddd xmm4, [rel PD_ONEHALF] + psrad xmm2, SCALEBITS + psrad xmm4, SCALEBITS + paddd xmm3, [rel PD_ONEHALF] + paddd xmm5, [rel PD_ONEHALF] + psrad xmm3, SCALEBITS + psrad xmm5, SCALEBITS + + packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2, xmm6 ; 
xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 
15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte 
SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov word [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. 
+ test rcx, rcx + jz short .nextrow + mov byte [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD 
[rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub rcx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. 
+ test rcx, rcx + jz short .nextrow + movd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.nextrow: + pop rcx + pop rsi + pop rbx + pop rdx + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + add rdi, byte SIZEOF_JSAMPROW ; output_buf + dec rax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop rbx + uncollect_args 5 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm new file mode 100644 index 0000000000..43de9db04d --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm @@ -0,0 +1,118 @@ +; +; jdcolor.asm - colorspace conversion (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_ycc_rgb_convert_avx2) + +EXTN(jconst_ycc_rgb_convert_avx2): + +PW_F0402 times 16 dw F_0_402 +PW_MF0228 times 16 dw -F_0_228 +PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 +PW_ONE times 16 dw 1 +PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2 +%include 
"jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2 +%include "jdcolext-avx2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm new file mode 100644 index 0000000000..b3f1fec07e --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm @@ -0,0 +1,117 @@ +; +; jdcolor.asm - colorspace conversion (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_ycc_rgb_convert_sse2) + +EXTN(jconst_ycc_rgb_convert_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2 +%include "jdcolext-sse2.asm" 
+ +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2 +%include "jdcolext-sse2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm new file mode 100644 index 0000000000..9515a17013 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm @@ -0,0 +1,136 @@ +; +; jdmerge-avx2.asm - merged upsampling/color conversion (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 +; Fixed-point coefficients below are scaled by 2^SCALEBITS, i.e. FIX(1) = 65536. +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_merged_upsample_avx2) +; Each table below is 32 bytes long (16 words or 8 dwords), the width of one YMM register. +EXTN(jconst_merged_upsample_avx2): + +PW_F0402 times 16 dw F_0_402 +PW_MF0228 times 16 dw -F_0_228 +PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 +PW_ONE times 16 dw 1 +PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; Each %include below assembles jdmrgext-avx2.asm again; the %define lines before it select the pixel layout and rename the two entry points. +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extrgb_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extrgb_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extrgbx_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extrgbx_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED
EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extbgr_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extbgr_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extbgrx_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extbgrx_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extxbgr_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extxbgr_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extxrgb_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extxrgb_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm new file mode 100644 index 0000000000..aedccc20f6 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm @@ -0,0 +1,135 @@ +; +; jdmerge-sse2.asm - merged upsampling/color conversion (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander.
+; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 +; Fixed-point coefficients below are scaled by 2^SCALEBITS, i.e. FIX(1) = 65536. +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_merged_upsample_sse2) +; Each table below is 16 bytes long (8 words or 4 dwords), the width of one XMM register. +EXTN(jconst_merged_upsample_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; Each %include below assembles jdmrgext-sse2.asm again; the %define lines before it select the pixel layout and rename the two entry points. +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extrgb_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extrgbx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extrgbx_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extbgr_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extbgrx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extbgrx_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extxbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extxbgr_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extxrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extxrgb_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm
b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm new file mode 100644 index 0000000000..8b264b4f03 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm @@ -0,0 +1,596 @@ +; +; jdmrgext-avx2.asm - merged upsampling/color conversion (64-bit AVX2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +; r10d = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf +; wk(i) addresses slot i of a WK_NUM-entry, YMM-sized scratch area located just below the aligned rbp. +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 3 + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2) + +EXTN(jsimd_h2v1_merged_upsample_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + push rbx +; Return immediately when output_width is zero. + mov ecx, r10d ; col + test rcx, rcx + jz near .return + + push rcx +; rcx is borrowed as the row index (in_row_group_ctr) while the row pointers are loaded; col is restored by the pop below. + mov rdi, r11 + mov ecx, r12d + mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 + mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 + mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 + mov rdip, JSAMPROW [rdi] ; outptr + + pop rcx ; col + +.columnloop: +; Each pass loads 32 Cb and 32 Cr samples, which supply the chroma for 64 output pixels (two .Yloop passes of 32 luma samples each). + vmovdqu ymm6, YMMWORD [rbx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) + vmovdqu ymm7, YMMWORD [rdx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + vpcmpeqw ymm3, ymm3, ymm3 + vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV) + vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV) + vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL + vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH + vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL + vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH +; Adding 0xFF80 (-128 as a signed word) re-centers the chroma samples around zero: ymm5=CbH, ymm2=CbL, ymm1=CrH, ymm3=CrL. + vpaddw ymm5, ymm6, ymm3 + vpaddw ymm2, ymm4, ymm3 + vpaddw ymm1, ymm7, ymm3 + vpaddw ymm3, ymm0,
ymm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH + vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL + vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH + vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL +; vpmulhw keeps the high word of each product; the doubled input is halved again below with rounding (add PW_ONE, shift right by 1). + vpmulhw ymm6, ymm6, [rel PW_MF0228] ; ymm6=(2*CbH * -FIX(0.22800)) + vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbL * -FIX(0.22800)) + vpmulhw ymm7, ymm7, [rel PW_F0402] ; ymm7=(2*CrH * FIX(0.40200)) + vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrL * FIX(0.40200)) + + vpaddw ymm6, ymm6, [rel PW_ONE] + vpaddw ymm4, ymm4, [rel PW_ONE] + vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800)) + vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800)) + vpaddw ymm7, ymm7, [rel PW_ONE] + vpaddw ymm0, ymm0, [rel PW_ONE] + vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200)) + vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200)) + + vpaddw ymm6, ymm6, ymm5 + vpaddw ymm4, ymm4, ymm2 + vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H + vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L + vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H + vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L + + vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H + vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H +; Interleave Cb and Cr words so that vpmaddwd yields Cb*(-FIX(0.34414)) plus Cr*FIX(0.28586) as one 32-bit sum per pixel. + vpunpckhwd ymm6, ymm5, ymm1 + vpunpcklwd ymm5, ymm5, ymm1 + vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285] + vpmaddwd ymm6, ymm6, [rel PW_MF0344_F0285] + vpunpckhwd ymm7, ymm2, ymm3 + vpunpcklwd ymm2, ymm2, ymm3 + vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285] + vpmaddwd ymm7, ymm7, [rel PW_MF0344_F0285] +; Round by adding PD_ONEHALF, then drop the SCALEBITS fraction bits. + vpaddd ymm5, ymm5, [rel PD_ONEHALF] + vpaddd ymm6, ymm6, [rel PD_ONEHALF] + vpsrad ymm5, ymm5, SCALEBITS + vpsrad ymm6, ymm6, SCALEBITS + vpaddd ymm2, ymm2, [rel PD_ONEHALF] + vpaddd ymm7, ymm7, [rel PD_ONEHALF] + vpsrad ymm2, ymm2, SCALEBITS + vpsrad ymm7, ymm7,
SCALEBITS + + vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr (two luma passes per chroma load: the first uses the L halves, the second the H halves from wk[]) + jmp short .Yloop_1st + +.Yloop_2nd: + vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H + vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H + vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H + +.Yloop_1st: + vmovdqu ymm7, YMMWORD [rsi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) +; Split luma into even and odd pixels so that each chroma word is applied to a horizontally adjacent pixel pair. + vpcmpeqw ymm6, ymm6, ymm6 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE + vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO + + vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H) + vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H) + vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H) + + vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU) + vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV) + vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********) + vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********) + + vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU) + vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV) + vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********) + vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********) + + vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU) + vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV) + vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********) + vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14
16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) +; NOTE(review): in these layouts the first digit appears to be the byte position within a pixel and the second character the pixel index (0-V). + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F + ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V) + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F + ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V) + + vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G + ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --) + vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F + ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07 + ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N) + + vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H + ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --) + + vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H + ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --) + vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G + ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --) + vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18 + ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O) + + vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H + ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --) + vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29 + ; 2I 0J 2K 0L
1L 2L 2M 0N 1N 2N 2O 0P 1P 2P) + + vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03 + ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J) + vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14 + ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K) + vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29 + ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P) + vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07 + ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N) + + vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B + ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R) + vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C + ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S) + vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H + ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --) + vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V) + + vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test rdi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_YMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_YMMWORD ; inptr1 + add rdx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop +; Partial tail for fewer than SIZEOF_YMMWORD pixels: the remaining bytes are stored in 64/32/16/8/4/2/1-byte steps. +.column_st64: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE (rcx now counts remaining bytes) + cmp rcx, byte 2*SIZEOF_YMMWORD + jb short .column_st32 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmF + sub rcx, byte 2*SIZEOF_YMMWORD + jmp short .column_st31 +.column_st32: + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st31 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + add rdi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub rcx, byte SIZEOF_YMMWORD + jmp short .column_st31 +.column_st31: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 ; bring the upper 128 bits of ymmA down into xmmA + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + vmovq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + vpsrldq xmmA, xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space.
+ cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + vmovd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + vpsrldq xmmA, xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + vmovd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov word [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + mov byte [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- +; The filler (X) byte of each pixel is all ones when RGBX_FILLER_0XFF is defined and zero otherwise. +%ifdef RGBX_FILLER_0XFF + vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%else + vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%endif + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **) + ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U) + vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V) + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + ; 2H 3H 2J 3J 2L
3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V) + + vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E + ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36 + ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M) + vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F + ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V) + vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37 + ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N) + + vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + + vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test rdi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD
[rdi+2*SIZEOF_YMMWORD], ymmC + vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC + vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH +.out0: + add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_YMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_YMMWORD ; inptr1 + add rdx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop +; Partial tail for fewer than SIZEOF_YMMWORD pixels: rcx still counts pixels (4 bytes each), stored 16/8/4/2/1 pixels at a time. +.column_st64: + cmp rcx, byte SIZEOF_YMMWORD/2 + jb short .column_st32 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmC + vmovdqa ymmD, ymmH + sub rcx, byte SIZEOF_YMMWORD/2 +.column_st32: + cmp rcx, byte SIZEOF_YMMWORD/4 + jb short .column_st16 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + add rdi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub rcx, byte SIZEOF_YMMWORD/4 +.column_st16: + cmp rcx, byte SIZEOF_YMMWORD/8 + jb short .column_st15 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 ; bring the upper 128 bits of ymmA down into xmmA + sub rcx, byte SIZEOF_YMMWORD/8 +.column_st15: + ; Store two pixels (8 bytes) of ymmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_YMMWORD/16 + jb short .column_st7 + vmovq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_YMMWORD/16*4 + sub rcx, byte SIZEOF_YMMWORD/16 + vpsrldq xmmA, SIZEOF_YMMWORD/16*4 +.column_st7: + ; Store one pixel (4 bytes) of ymmA to the output when it has enough + ; space.
+ test rcx, rcx + jz short .endcolumn + vmovd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + pop rbx + vzeroupper + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; Implemented as two calls to jsimd_h2v1_merged_upsample_avx2, one per luma/output row, both using the same chroma rows. +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +; r10d = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2) + +EXTN(jsimd_h2v2_merged_upsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + mov eax, r10d + + mov rdi, r11 + mov ecx, r12d + mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] +; Build a temporary JSAMPIMAGE on the stack. Entry 0 is pre-offset by in_row_group_ctr rows, so the h2v1 routine (which indexes by in_row_group_ctr again) reads luma row 2*in_row_group_ctr. + sub rsp, SIZEOF_JSAMPARRAY*4 + mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00 + mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1 + mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2 + mov rbx, rsp +; Preserve the values that are needed again for the second call. + push rdi + push rcx + push rax +; Pass (output_width, temporary image, in_row_group_ctr, output_buf) in the first four integer argument registers of the target ABI. + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_avx2) + + pop rax + pop rcx + pop rdi + mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY] + + add rdi, byte SIZEOF_JSAMPROW ; outptr1 + add rsi, byte SIZEOF_JSAMPROW ; inptr01 + + mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr01 + mov JSAMPARRAY
[rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1 + mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2 + mov rbx, rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_avx2) + + pop rax + pop rcx + pop rdi + mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY] + add rsp, SIZEOF_JSAMPARRAY*4 ; release the temporary image + + pop rbx + uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm new file mode 100644 index 0000000000..eb3ab9dbd9 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm @@ -0,0 +1,538 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; Each 16-sample Cb/Cr block is converted once and reused for two 16-sample Y blocks. + +; r10d = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 3 + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2) + +EXTN(jsimd_h2v1_merged_upsample_sse2): + push rbp + mov rax, rsp ; rax = rsp after push rbp (address of the saved rbp) + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax ; keep the unaligned rsp for .return + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + push rbx + + mov ecx, r10d ; col + test rcx, rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 + mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 + mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 + mov rdip, JSAMPROW [rdi] ; outptr + + pop rcx ; col + +.columnloop: + + movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1, xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3, xmm3 + psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa xmm4, xmm6 + punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0, xmm7 + punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6, xmm3 + paddw xmm4, xmm3 + paddw xmm7, xmm3 + paddw xmm0, xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + 
Cb + Cb + + movdqa xmm5, xmm6 ; xmm5=CbH + movdqa xmm2, xmm4 ; xmm2=CbL + paddw xmm6, xmm6 ; xmm6=2*CbH + paddw xmm4, xmm4 ; xmm4=2*CbL + movdqa xmm1, xmm7 ; xmm1=CrH + movdqa xmm3, xmm0 ; xmm3=CrL + paddw xmm7, xmm7 ; xmm7=2*CrH + paddw xmm0, xmm0 ; xmm0=2*CrL + + pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6, [rel PW_ONE] + paddw xmm4, [rel PW_ONE] + psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7, [rel PW_ONE] + paddw xmm0, [rel PW_ONE] + psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6, xmm5 + paddw xmm4, xmm2 + paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6, xmm5 + movdqa xmm7, xmm2 + punpcklwd xmm5, xmm1 + punpckhwd xmm6, xmm1 + pmaddwd xmm5, [rel PW_MF0344_F0285] + pmaddwd xmm6, [rel PW_MF0344_F0285] + punpcklwd xmm2, xmm3 + punpckhwd xmm7, xmm3 + pmaddwd xmm2, [rel PW_MF0344_F0285] + pmaddwd xmm7, [rel PW_MF0344_F0285] + + paddd xmm5, [rel PD_ONEHALF] + paddd xmm6, [rel PD_ONEHALF] + psrad xmm5, SCALEBITS + psrad xmm6, SCALEBITS + paddd xmm2, [rel PD_ONEHALF] + paddd xmm7, [rel PD_ONEHALF] + psrad xmm2, SCALEBITS + psrad xmm7, SCALEBITS + + packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr: two Y blocks per Cb/Cr block + jmp short .Yloop_1st + 
+.Yloop_2nd: + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + +.Yloop_1st: + movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF) + + pcmpeqw xmm6, xmm6 + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO + + movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H) + + paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 
; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub 
rcx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov word [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. 
+ test rcx, rcx + jz short .endcolumn + mov byte [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD 
[rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub rcx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + movd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + pop rbx + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- unaligned rsp saved in the prologue + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. 
+; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2(), one per output row; both calls use the same Cb and Cr rows. + +; r10d = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2) + +EXTN(jsimd_h2v2_merged_upsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + mov eax, r10d + + mov rdi, r11 + mov ecx, r12d + mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; the callee indexes by rcx again, so the Y row is 2*in_row_group_ctr + + sub rsp, SIZEOF_JSAMPARRAY*4 + mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00 + mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1 + mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2 + mov rbx, rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY] + + add rdi, byte SIZEOF_JSAMPROW ; outptr1 + add rsi, byte SIZEOF_JSAMPROW ; inptr01 + + mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00 + mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1 + mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2 + mov rbx, rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY] + mov rbxp, JSAMPARRAY 
[rsp+1*SIZEOF_JSAMPARRAY] + mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY] + add rsp, SIZEOF_JSAMPARRAY*4 + + pop rbx + uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm new file mode 100644 index 0000000000..1e4979f933 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm @@ -0,0 +1,696 @@ +; +; jdsample.asm - upsampling (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_avx2) + +EXTN(jconst_fancy_upsample_avx2): + +PW_ONE times 16 dw 1 +PW_TWO times 16 dw 2 +PW_THREE times 16 dw 3 +PW_SEVEN times 16 dw 7 +PW_EIGHT times 16 dw 8 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. 
The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2) + +EXTN(jsimd_h2v1_fancy_upsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + push_xmm 3 + collect_args 4 + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdip, JSAMPARRAY [rdi] ; output_data + + vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's) + vpcmpeqb xmm9, xmm9, xmm9 + vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff + + vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-1) + vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ff) MSB is ff + +.rowloop: + push rax ; colctr + push rdi + push rsi + + mov rsip, JSAMPROW [rsi] ; inptr + mov rdip, JSAMPROW [rdi] ; outptr + + test rax, SIZEOF_YMMWORD-1 + jz short .skip + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + vpand ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD] + + add rax, byte SIZEOF_YMMWORD-1 + and rax, byte -SIZEOF_YMMWORD + cmp rax, byte SIZEOF_YMMWORD + ja short .columnloop + +.columnloop_last: + vpand ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD] + jmp short .upsample + +.columnloop: + vmovdqu ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vperm2i128 ymm6, ymm0, ymm6, 0x20 + vpslldq ymm6, ymm6, 15 + +.upsample: + vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31) + + vperm2i128 ymm2, ymm0, ymm1, 0x20 + vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 
28 29 30) + vperm2i128 ymm4, ymm0, ymm1, 0x03 + vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --) + + vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30) + vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32) + + vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --) + + vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30) + vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22) + vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32) + vpunpcklbw ymm8, ymm3, ymm0 ; ymm8=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24) + vperm2i128 ymm3, ymm8, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vperm2i128 ymm6, ymm8, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vpmullw ymm1, ymm1, [rel PW_THREE] + vpmullw ymm4, ymm4, [rel PW_THREE] + vpaddw ymm2, ymm2, [rel PW_ONE] + vpaddw ymm5, ymm5, [rel PW_ONE] + vpaddw ymm3, ymm3, [rel PW_TWO] + vpaddw ymm6, ymm6, [rel PW_TWO] + + vpaddw ymm2, ymm2, ymm1 + vpaddw ymm5, ymm5, ymm4 + vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm3, ymm3, ymm1 + vpaddw ymm6, ymm6, ymm4 + vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm3, ymm3, 
BYTE_BIT + vpsllw ymm6, ymm6, BYTE_BIT + vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31) + vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5 + + sub rax, byte SIZEOF_YMMWORD + add rsi, byte 1*SIZEOF_YMMWORD ; inptr + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + cmp rax, byte SIZEOF_YMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg near .rowloop + +.return: + vzeroupper + uncollect_args 4 + pop_xmm 3 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; Pass 1 stores 3*row[0]+row[-1] and 3*row[0]+row[+1] as 16-bit values in the two output rows; .upsample then filters them horizontally in place. + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 4 + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2) + +EXTN(jsimd_h2v2_fancy_upsample_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + push_xmm 3 + collect_args 4 + push rbx + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdip, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rcx + push rdi + push rsi + + mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) 
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + + vpxor ymm8, ymm8, ymm8 ; ymm8=(all 0's) + vpcmpeqb xmm9, xmm9, xmm9 + vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff + vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-2) + vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff + + test rax, SIZEOF_YMMWORD-1 + jz short .skip + push rdx + mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop rdx +.skip: + ; -- process the first column block + + vmovdqu ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0] + vmovdqu ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0] + vmovdqu ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0] + + vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm3, ymm2, ymm8 ; ymm3=row[+1]( 0 1 
2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpmullw ymm0, ymm0, [rel PW_THREE] + vpmullw ymm4, ymm4, [rel PW_THREE] + + vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save + vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6 + + vpand ymm1, ymm1, ymm10 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vpand ymm2, ymm2, ymm10 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + + vmovdqa YMMWORD [wk(0)], ymm1 + vmovdqa YMMWORD [wk(1)], ymm2 + + add rax, byte SIZEOF_YMMWORD-1 + and rax, byte -SIZEOF_YMMWORD + cmp rax, byte SIZEOF_YMMWORD + ja short .columnloop + +.columnloop_last: + ; -- process the last column block + + vpand ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD] + vpand ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD] + + vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) + vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) + + jmp near .upsample + +.columnloop: + ; -- process the next column block + + vmovdqu ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1] + vmovdqu ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1] + vmovdqu ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1] + + vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 
6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm7, ymm2, ymm8 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpmullw ymm0, ymm0, [rel PW_THREE] + vpmullw ymm4, ymm4, [rel PW_THREE] + + vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vmovdqu YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save + vmovdqu YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6 + + vperm2i128 ymm1, ymm8, ymm1, 0x20 + vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0) + vperm2i128 ymm2, ymm8, ymm2, 0x20 + vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0) + + vmovdqa YMMWORD [wk(2)], ymm1 + vmovdqa YMMWORD [wk(3)], ymm2 + +.upsample: + ; -- process the upper row + + vmovdqu ymm7, YMMWORD 
[rdx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vmovdqu ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vperm2i128 ymm0, ymm8, ymm7, 0x03 + vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --) + vperm2i128 ymm4, ymm8, ymm3, 0x20 + vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16) + + vperm2i128 ymm5, ymm8, ymm7, 0x03 + vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm6, ymm8, ymm3, 0x20 + vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vperm2i128 ymm2, ymm8, ymm3, 0x03 + vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --) + vperm2i128 ymm4, ymm8, ymm3, 0x03 + vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm1, ymm8, ymm7, 0x20 + vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + + vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=sample 31, used as sample -1 of the next block + + vpmullw ymm7, ymm7, [rel PW_THREE] + vpmullw ymm3, ymm3, [rel PW_THREE] + vpaddw ymm1, ymm1, [rel PW_EIGHT] + vpaddw ymm5, ymm5, [rel PW_EIGHT] + vpaddw ymm0, ymm0, [rel PW_SEVEN] + vpaddw ymm2, ymm2, [rel PW_SEVEN] + + vpaddw ymm1, ymm1, ymm7 + vpaddw ymm5, ymm5, ymm3 + vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm2, ymm2, 4 ; 
ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm0, ymm0, BYTE_BIT + vpsllw ymm2, ymm2, BYTE_BIT + vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31) + vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5 + + ; -- process the lower row + + vmovdqu ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vmovdqu ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vperm2i128 ymm7, ymm8, ymm6, 0x03 + vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --) + vperm2i128 ymm3, ymm8, ymm4, 0x20 + vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16) + + vperm2i128 ymm0, ymm8, ymm6, 0x03 + vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm2, ymm8, ymm4, 0x20 + vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vperm2i128 ymm5, ymm8, ymm4, 0x03 + vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --) + vperm2i128 ymm3, ymm8, ymm4, 0x03 + vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm1, ymm8, ymm6, 0x20 + vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + + vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vmovdqa YMMWORD [wk(1)], ymm3 ; wk(1)=sample 31, used as sample -1 of the next block + + vpmullw ymm6, ymm6, [rel PW_THREE] + vpmullw ymm4, ymm4, [rel PW_THREE] + vpaddw ymm1, ymm1, [rel PW_EIGHT] + vpaddw ymm0, ymm0, [rel PW_EIGHT] + vpaddw ymm7, ymm7, [rel PW_SEVEN] + vpaddw ymm5, ymm5, [rel 
PW_SEVEN] + + vpaddw ymm1, ymm1, ymm6 + vpaddw ymm0, ymm0, ymm4 + vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm7, ymm7, ymm6 + vpaddw ymm5, ymm5, ymm4 + vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm7, ymm7, BYTE_BIT + vpsllw ymm5, ymm5, BYTE_BIT + vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31) + vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0 + + sub rax, byte SIZEOF_YMMWORD + add rcx, byte 1*SIZEOF_YMMWORD ; inptr1(above) + add rbx, byte 1*SIZEOF_YMMWORD ; inptr0 + add rsi, byte 1*SIZEOF_YMMWORD ; inptr1(below) + add rdx, byte 2*SIZEOF_YMMWORD ; outptr0 + add rdi, byte 2*SIZEOF_YMMWORD ; outptr1 + cmp rax, byte SIZEOF_YMMWORD + ja near .columnloop + test rax, rax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rcx + pop rax + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + vzeroupper + uncollect_args 4 + pop_xmm 3 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. 
+; +; GLOBAL(void) +; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2) + +EXTN(jsimd_h2v1_upsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + + mov edx, r11d + add rdx, byte (SIZEOF_YMMWORD-1) + and rdx, -SIZEOF_YMMWORD + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz short .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdip, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsip, JSAMPROW [rsi] ; inptr + mov rdip, JSAMPROW [rdi] ; outptr + mov rax, rdx ; colctr +.columnloop: + + cmp rax, byte SIZEOF_YMMWORD + ja near .above_16 + + vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD] + vpunpckhbw xmm1, xmm0, xmm0 + vpunpcklbw xmm0, xmm0, xmm0 + + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + jmp short .nextrow + +.above_16: + vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + + vpermq ymm0, ymm0, 0xd8 + vpunpckhbw ymm1, ymm0, ymm0 + vpunpcklbw ymm0, ymm0, ymm0 + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1 + + sub rax, byte 2*SIZEOF_YMMWORD + jz short .nextrow + + add rsi, byte SIZEOF_YMMWORD ; inptr + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg short .rowloop + +.return: + vzeroupper + uncollect_args 4 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. 
+; +; GLOBAL(void) +; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2) + +EXTN(jsimd_h2v2_upsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + mov edx, r11d + add rdx, byte (SIZEOF_YMMWORD-1) + and rdx, -SIZEOF_YMMWORD + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdip, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsip, JSAMPROW [rsi] ; inptr + mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + mov rax, rdx ; colctr +.columnloop: + + cmp rax, byte SIZEOF_YMMWORD + ja short .above_16 + + vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + vpunpckhbw xmm1, xmm0, xmm0 + vpunpcklbw xmm0, xmm0, xmm0 + + vmovdqu XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + jmp near .nextrow + +.above_16: + vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + + vpermq ymm0, ymm0, 0xd8 + vpunpckhbw ymm1, ymm0, ymm0 + vpunpcklbw ymm0, ymm0, ymm0 + + vmovdqu YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1 + + sub rax, byte 2*SIZEOF_YMMWORD + jz short .nextrow + + add rsi, byte SIZEOF_YMMWORD ; inptr + add rbx, 2*SIZEOF_YMMWORD ; outptr0 + add rdi, 2*SIZEOF_YMMWORD ; outptr1 + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop 
+ +.return: + pop rbx + vzeroupper + uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm new file mode 100644 index 0000000000..38dbceec26 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm @@ -0,0 +1,665 @@ +; +; jdsample.asm - upsampling (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_sse2) + +EXTN(jconst_fancy_upsample_sse2): + +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. 
+; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2) + +EXTN(jsimd_h2v1_fancy_upsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdip, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rdi + push rsi + + mov rsip, JSAMPROW [rsi] ; inptr + mov rdip, JSAMPROW [rdi] ; outptr + + test rax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor xmm0, xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop + +.columnloop_last: + pcmpeqb xmm6, xmm6 + pslldq xmm6, (SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] + jmp short .upsample + +.columnloop: + movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD] + pslldq xmm6, (SIZEOF_XMMWORD-1) + +.upsample: + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7, xmm1 + psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... 
-- -- --) + + movdqa xmm4, xmm1 + punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm2 + punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6, xmm3 + punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + paddw xmm2, [rel PW_ONE] + paddw xmm5, [rel PW_ONE] + paddw xmm3, [rel PW_TWO] + paddw xmm6, [rel PW_TWO] + + paddw xmm2, xmm1 + paddw xmm5, xmm4 + psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3, xmm1 + paddw xmm6, xmm4 + psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3, BYTE_BIT + psllw xmm6, BYTE_BIT + por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5 + + sub rax, byte SIZEOF_XMMWORD + add rsi, byte 1*SIZEOF_XMMWORD ; inptr + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg near .rowloop + +.return: + uncollect_args 4 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. 
+; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2) + +EXTN(jsimd_h2v2_fancy_upsample_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + push rbx + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdip, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rcx + push rdi + push rsi + + mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + + test rax, SIZEOF_XMMWORD-1 + jz short .skip + push rdx + mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop rdx +.skip: + ; -- process the first column block + + movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 
6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-2) + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop + +.columnloop_last: + ; -- process the last column block + + pcmpeqb xmm1, xmm1 + pslldq xmm1, (SIZEOF_XMMWORD-2) + movdqa xmm2, xmm1 + + pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + + jmp near .upsample + +.columnloop: + ; -- process the next column block + + movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + 
movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 + +.upsample: + ; -- process the upper row + + movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] + + movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5, xmm7 + movdqa xmm6, xmm3 + psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm7 + movdqa xmm2, xmm3 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4, xmm3 + psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7, [rel PW_THREE] + 
pmullw xmm3, [rel PW_THREE] + paddw xmm1, [rel PW_EIGHT] + paddw xmm5, [rel PW_EIGHT] + paddw xmm0, [rel PW_SEVEN] + paddw xmm2, [rel PW_SEVEN] + + paddw xmm1, xmm7 + paddw xmm5, xmm3 + psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0, xmm7 + paddw xmm2, xmm3 + psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0, BYTE_BIT + psllw xmm2, BYTE_BIT + por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0, xmm6 + movdqa xmm2, xmm4 + psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3, xmm4 + psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + paddw xmm1, [rel PW_EIGHT] + paddw xmm0, [rel PW_EIGHT] + paddw xmm7, [rel PW_SEVEN] + paddw xmm5, [rel PW_SEVEN] + + paddw xmm1, xmm6 + paddw xmm0, xmm4 + psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7, xmm6 + 
paddw xmm5, xmm4 + psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7, BYTE_BIT + psllw xmm5, BYTE_BIT + por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0 + + sub rax, byte SIZEOF_XMMWORD + add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add rbx, byte 1*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add rdx, byte 2*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test rax, rax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rcx + pop rax + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. 
+; +; GLOBAL(void) +; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2) + +EXTN(jsimd_h2v1_upsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + + mov edx, r11d + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz short .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdip, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsip, JSAMPROW [rsi] ; inptr + mov rdip, JSAMPROW [rdi] ; outptr + mov rax, rdx ; colctr +.columnloop: + + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 + + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg short .rowloop + +.return: + uncollect_args 4 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. 
+; +; GLOBAL(void) +; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2) + +EXTN(jsimd_h2v2_upsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + mov edx, r11d + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdip, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsip, JSAMPROW [rsi] ; inptr + mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + mov rax, rdx ; colctr +.columnloop: + + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + + movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 + + movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rbx, byte 4*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + 
uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm new file mode 100644 index 0000000000..ef2796649b --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm @@ -0,0 +1,355 @@ +; +; jfdctflt.asm - floating-point FDCT (64-bit SSE) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. 
+ +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_float_sse) + +EXTN(jconst_fdct_float_sse): + +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_sse(FAST_FLOAT *data) +; + +; r10 = FAST_FLOAT *data + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_float_sse) + +EXTN(jsimd_fdct_float_sse): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 1 + + ; ---- Pass 1: process rows. 
+ + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 +.rowloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) + movaps xmm3, 
XMMWORD [wk(1)] ; xmm3=(24 34 25 35) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [rel PD_0_707] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [rel PD_0_707] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [rel PD_0_382] ; xmm2=z5 + mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; 
xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 +.columnloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13) + unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7, xmm0 ; 
xmm7=(10 11 12 13)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [rel PD_0_707] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + 
mulps xmm3, [rel PD_0_707] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [rel PD_0_382] ; xmm2=z5 + mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, byte 4*SIZEOF_FAST_FLOAT + dec rcx + jnz near .columnloop + + uncollect_args 1 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm new file mode 100644 index 0000000000..2e1bfe6e8c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm @@ -0,0 +1,389 @@ +; +; jfdctfst.asm - fast integer FDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_fdct_ifast_sse2) + +EXTN(jconst_fdct_ifast_sse2): + +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. 
+; +; GLOBAL(void) +; jsimd_fdct_ifast_sse2(DCTELEM *data) +; + +; r10 = DCTELEM *data + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2) + +EXTN(jsimd_fdct_ifast_sse2): + push rbp + mov rax, rsp ; rax = rsp after push (points to saved rbp) + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 1 + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67) + ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60
70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, 
xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + psubw xmm3, xmm1 ; xmm3=tmp13 + psubw xmm6, xmm7 ; xmm6=tmp12 + paddw xmm4, xmm1 ; xmm4=tmp10 + paddw xmm0, xmm7 ; xmm0=tmp11 + + paddw xmm6, xmm3 + psllw xmm6, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6, [rel PW_F0707] ; xmm6=z1 + + movdqa xmm1, xmm4 + movdqa xmm7, xmm3 + psubw xmm4, xmm0 ; xmm4=data4 + psubw xmm3, xmm6 ; xmm3=data6 + paddw xmm1, xmm0 ; xmm1=data0 + paddw xmm7, xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm5, xmm0 ; xmm5=tmp11 + paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [rel PW_F0707] ; xmm5=z3 + + movdqa xmm4, xmm2 ; xmm4=tmp10 + psubw xmm2, xmm0 + pmulhw xmm2, [rel PW_F0382] ; xmm2=z5 + pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm2 ; xmm4=z2 + paddw xmm0, xmm2 ; xmm0=z4 + + movdqa xmm3, xmm6 + psubw xmm6, xmm5 ; xmm6=z13 + paddw xmm3, xmm5 ; xmm3=z11 + + movdqa xmm2, xmm6 + movdqa xmm5, xmm3 + psubw xmm6, xmm4 ; xmm6=data3 + psubw xmm3, xmm0 ; xmm3=data7 + paddw xmm2, xmm4 ; xmm2=data5 + paddw xmm5, xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. 
+ + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2, xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 
; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5, xmm6 + movdqa xmm3, xmm1 + psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7, xmm6 + movdqa xmm0, xmm2 + paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm1, xmm5 + psubw xmm3, xmm6 ; xmm3=tmp13 + psubw xmm5, xmm2 ; xmm5=tmp12 + paddw xmm4, xmm6 ; xmm4=tmp10 + paddw xmm1, xmm2 ; xmm1=tmp11 + + paddw xmm5, xmm3 + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [rel PW_F0707] ; xmm5=z1 + + movdqa xmm6, xmm4 + movdqa xmm2, xmm3 + psubw xmm4, xmm1 ; xmm4=data4 + psubw xmm3, xmm5 ; xmm3=data6 + paddw xmm6, xmm1 ; xmm6=data0 + paddw xmm2, xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD 
[XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7, xmm0 ; xmm7=tmp10 + paddw xmm0, xmm1 ; xmm0=tmp11 + paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7, PRE_MULTIPLY_SCALE_BITS + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0, [rel PW_F0707] ; xmm0=z3 + + movdqa xmm4, xmm7 ; xmm4=tmp10 + psubw xmm7, xmm1 + pmulhw xmm7, [rel PW_F0382] ; xmm7=z5 + pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm7 ; xmm4=z2 + paddw xmm1, xmm7 ; xmm1=z4 + + movdqa xmm3, xmm5 + psubw xmm5, xmm0 ; xmm5=z13 + paddw xmm3, xmm0 ; xmm3=z11 + + movdqa xmm6, xmm5 + movdqa xmm2, xmm3 + psubw xmm5, xmm4 ; xmm5=data3 + psubw xmm3, xmm1 ; xmm3=data7 + paddw xmm6, xmm4 ; xmm6=data5 + paddw xmm2, xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 + + uncollect_args 1 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm new file mode 100644 index 0000000000..e56258b48a --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm @@ -0,0 +1,320 @@ +; +; jfdctint.asm - accurate integer FDCT (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. 
+; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit matrix transpose using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers + +%macro dotranspose 8 + ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) + + vpunpcklwd %5, %1, %2 + vpunpckhwd %6, %1, %2 + vpunpcklwd %7, %3, %4 + vpunpckhwd %8, %3, %4 + ; transpose coefficients(phase 1) + ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53) + ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57) + ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73) + ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77) + + vpunpckldq %1, %5, %7 + vpunpckhdq %2, %5, %7 + vpunpckldq %3, %6, %8 + vpunpckhdq %4, %6, %8 + ; transpose coefficients(phase 2) + ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71) + ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73) + ; %3=(04 14 24 34 05 15 25 35 44 54 64 
74 45 55 65 75) + ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77) + + vpermq %1, %1, 0x8D + vpermq %2, %2, 0x8D + vpermq %3, %3, 0xD8 + vpermq %4, %4, 0xD8 + ; transpose coefficients(phase 3) + ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70) + ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) + ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) + ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77) +%endmacro + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers +; %9: Pass (1 or 2) + +%macro dodct 9 + vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7 + vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0 + vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2 + vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5 + + ; -- Even part + + vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1 + vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11 + vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12 + + vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10 + vpsignw %1, %1, [rel PW_1_NEG1] ; %1=tmp10_neg11 + vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11) +%if %9 == 1 + vpsllw %1, %7, PASS1_BITS ; %1=data0_4 +%else + vpaddw %7, %7, [rel PW_DESCALE_P2X] + vpsraw %1, %7, PASS1_BITS ; %1=data0_4 +%endif + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13 + vpunpcklwd %2, %6, %7 + vpunpckhwd %6, %6, %7 + vpmaddwd %2, %2, [rel PW_F130_F054_MF130_F054] ; %2=data2_6L + vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=data2_6H + + vpaddd %2, %2, [rel PD_DESCALE_P %+ %9] + vpaddd %6, %6, [rel PD_DESCALE_P %+ %9] + vpsrad %2, %2, DESCALE_P %+ %9 + vpsrad %6, 
%6, DESCALE_P %+ %9 + + vpackssdw %3, %2, %6 ; %3=data2_6 + + ; -- Odd part + + vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3 + vpunpcklwd %6, %7, %2 + vpunpckhwd %7, %7, %2 + vpmaddwd %6, %6, [rel PW_MF078_F117_F078_F117] ; %6=z3_4L + vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6 + vpunpcklwd %2, %8, %4 + vpunpckhwd %4, %8, %4 + vpmaddwd %2, %2, [rel PW_MF060_MF089_MF050_MF256] ; %2=tmp4_5L + vpmaddwd %4, %4, [rel PW_MF060_MF089_MF050_MF256] ; %4=tmp4_5H + + vpaddd %2, %2, %6 ; %2=data7_5L + vpaddd %4, %4, %7 ; %4=data7_5H + + vpaddd %2, %2, [rel PD_DESCALE_P %+ %9] + vpaddd %4, %4, [rel PD_DESCALE_P %+ %9] + vpsrad %2, %2, DESCALE_P %+ %9 + vpsrad %4, %4, DESCALE_P %+ %9 + + vpackssdw %4, %2, %4 ; %4=data7_5 + + vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4 + vpunpcklwd %8, %5, %2 + vpunpckhwd %5, %5, %2 + vpmaddwd %8, %8, [rel PW_F050_MF256_F060_MF089] ; %8=tmp6_7L + vpmaddwd %5, %5, [rel
PW_F050_MF256_F060_MF089] ; %5=tmp6_7H + + vpaddd %8, %8, %6 ; %8=data3_1L + vpaddd %5, %5, %7 ; %5=data3_1H + + vpaddd %8, %8, [rel PD_DESCALE_P %+ %9] + vpaddd %5, %5, [rel PD_DESCALE_P %+ %9] + vpsrad %8, %8, DESCALE_P %+ %9 + vpsrad %5, %5, DESCALE_P %+ %9 + + vpackssdw %2, %8, %5 ; %2=data3_1 +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_avx2) + +EXTN(jconst_fdct_islow_avx2): + +PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 + times 4 dw (F_0_541 - F_1_847), F_0_541 +PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 + times 4 dw (F_1_175 - F_0_390), F_1_175 +PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 + times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562 + times 4 dw (F_1_501 - F_0_899), -F_0_899 +PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1) +PW_1_NEG1 times 8 dw 1 + times 8 dw -1 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_avx2(DCTELEM *data) +; + +; r10 = DCTELEM *data + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_avx2) + +EXTN(jsimd_fdct_islow_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 1 + + ; ---- Pass 1: process rows. 
+ + vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)] + vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)] + vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)] + vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)] + ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + vperm2i128 ymm0, ymm4, ymm6, 0x20 + vperm2i128 ymm1, ymm4, ymm6, 0x31 + vperm2i128 ymm2, ymm5, ymm7, 0x20 + vperm2i128 ymm3, ymm5, ymm7, 0x31 + ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) + + dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + + dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 + ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5 + + ; ---- Pass 2: process columns. + + vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7 + vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5 + + dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + + dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 + ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5 + + vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1 + vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3 + vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5 + vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7 + + vmovdqu YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3 + vmovdqu YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5 + vmovdqu YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6 + vmovdqu YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7 + + vzeroupper + uncollect_args 1 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm new file mode 100644 index 0000000000..ec1f383ccb --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm @@ -0,0 +1,619 @@ +; +; jfdctint.asm - accurate integer FDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_sse2) + +EXTN(jconst_fdct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) +PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. 
+; +; GLOBAL(void) +; jsimd_fdct_islow_sse2(DCTELEM *data) +; + +; r10 = DCTELEM *data + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_sse2) + +EXTN(jsimd_fdct_islow_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 1 + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 
70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, 
xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + paddw xmm3, xmm1 ; xmm3=tmp10 + paddw xmm6, xmm7 ; xmm6=tmp11 + psubw xmm4, xmm1 ; xmm4=tmp13 + psubw xmm0, xmm7 ; xmm0=tmp12 + + movdqa xmm1, xmm3 + paddw xmm3, xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1, xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3, PASS1_BITS ; xmm3=data0 + psllw xmm1, PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7, xmm4 ; xmm4=tmp13 + movdqa xmm6, xmm4 + punpcklwd xmm7, xmm0 ; xmm0=tmp12 + punpckhwd xmm6, xmm0 + movdqa xmm4, xmm7 + movdqa xmm0, xmm6 + pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L + pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H + pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L + pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H + + paddd xmm7, [rel PD_DESCALE_P1] + paddd xmm6, [rel PD_DESCALE_P1] + psrad xmm7, DESCALE_P1 + psrad xmm6, DESCALE_P1 + paddd xmm4, [rel PD_DESCALE_P1] + paddd xmm0, [rel PD_DESCALE_P1] + psrad xmm4, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm7, xmm6 ; xmm7=data2 + packssdw xmm4, xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; 
xmm1=tmp7 + + movdqa xmm6, xmm2 ; xmm2=tmp4 + movdqa xmm0, xmm5 ; xmm5=tmp5 + paddw xmm6, xmm3 ; xmm6=z3 + paddw xmm0, xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7, xmm6 + movdqa xmm4, xmm6 + punpcklwd xmm7, xmm0 + punpckhwd xmm4, xmm0 + movdqa xmm6, xmm7 + movdqa xmm0, xmm4 + pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L + pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H + pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L + pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + punpcklwd xmm7, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm2, xmm7 + movdqa xmm1, xmm4 + pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L + pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H + pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L + pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2, xmm6 ; xmm2=data1L + paddd xmm1, 
xmm0 ; xmm1=data1H + + paddd xmm7, [rel PD_DESCALE_P1] + paddd xmm4, [rel PD_DESCALE_P1] + psrad xmm7, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm2, [rel PD_DESCALE_P1] + paddd xmm1, [rel PD_DESCALE_P1] + psrad xmm2, DESCALE_P1 + psrad xmm1, DESCALE_P1 + + packssdw xmm7, xmm4 ; xmm7=data7 + packssdw xmm2, xmm1 ; xmm2=data1 + + movdqa xmm4, xmm5 + movdqa xmm1, xmm5 + punpcklwd xmm4, xmm3 + punpckhwd xmm1, xmm3 + movdqa xmm5, xmm4 + movdqa xmm3, xmm1 + pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L + pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H + pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L + pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H + + paddd xmm4, xmm6 ; xmm4=data5L + paddd xmm1, xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4, [rel PD_DESCALE_P1] + paddd xmm1, [rel PD_DESCALE_P1] + psrad xmm4, DESCALE_P1 + psrad xmm1, DESCALE_P1 + paddd xmm5, [rel PD_DESCALE_P1] + paddd xmm3, [rel PD_DESCALE_P1] + psrad xmm5, DESCALE_P1 + psrad xmm3, DESCALE_P1 + + packssdw xmm4, xmm1 ; xmm4=data5 + packssdw xmm5, xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. 
+ + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 
31 32 33) + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2, xmm5 + movdqa xmm7, xmm6 + psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0, xmm5 + movdqa xmm3, xmm4 + paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1, xmm7 + movdqa xmm6, xmm2 + paddw xmm7, xmm5 ; xmm7=tmp10 + paddw xmm2, xmm4 ; xmm2=tmp11 + psubw xmm1, xmm5 ; xmm1=tmp13 + psubw xmm6, xmm4 ; xmm6=tmp12 + + movdqa xmm5, xmm7 + paddw xmm7, xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5, xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7, [rel PW_DESCALE_P2X] + paddw xmm5, [rel PW_DESCALE_P2X] + psraw xmm7, PASS1_BITS ; xmm7=data0 + psraw xmm5, PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD 
[XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4, xmm1 ; xmm1=tmp13 + movdqa xmm2, xmm1 + punpcklwd xmm4, xmm6 ; xmm6=tmp12 + punpckhwd xmm2, xmm6 + movdqa xmm1, xmm4 + movdqa xmm6, xmm2 + pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L + pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H + pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L + pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H + + paddd xmm4, [rel PD_DESCALE_P2] + paddd xmm2, [rel PD_DESCALE_P2] + psrad xmm4, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm1, [rel PD_DESCALE_P2] + paddd xmm6, [rel PD_DESCALE_P2] + psrad xmm1, DESCALE_P2 + psrad xmm6, DESCALE_P2 + + packssdw xmm4, xmm2 ; xmm4=data2 + packssdw xmm1, xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2, xmm0 ; xmm0=tmp4 + movdqa xmm6, xmm3 ; xmm3=tmp5 + paddw xmm2, xmm7 ; xmm2=z3 + paddw xmm6, xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4, xmm2 + movdqa xmm1, xmm2 + punpcklwd xmm4, xmm6 + punpckhwd xmm1, xmm6 + movdqa xmm2, xmm4 + movdqa xmm6, xmm1 + pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L + pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H + pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L + pmaddwd xmm6, [rel PW_F117_F078] 
; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm4, xmm5 + punpckhwd xmm1, xmm5 + movdqa xmm0, xmm4 + movdqa xmm5, xmm1 + pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L + pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H + pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L + pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0, xmm2 ; xmm0=data1L + paddd xmm5, xmm6 ; xmm5=data1H + + paddd xmm4, [rel PD_DESCALE_P2] + paddd xmm1, [rel PD_DESCALE_P2] + psrad xmm4, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm0, [rel PD_DESCALE_P2] + paddd xmm5, [rel PD_DESCALE_P2] + psrad xmm0, DESCALE_P2 + psrad xmm5, DESCALE_P2 + + packssdw xmm4, xmm1 ; xmm4=data7 + packssdw xmm0, xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1, xmm3 + movdqa xmm5, xmm3 + punpcklwd xmm1, xmm7 + punpckhwd xmm5, xmm7 + movdqa xmm3, xmm1 + movdqa xmm7, xmm5 + pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L + pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H + pmaddwd xmm3, [rel PW_MF256_F050] ; 
xmm3=tmp6L + pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H + + paddd xmm1, xmm2 ; xmm1=data5L + paddd xmm5, xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1, [rel PD_DESCALE_P2] + paddd xmm5, [rel PD_DESCALE_P2] + psrad xmm1, DESCALE_P2 + psrad xmm5, DESCALE_P2 + paddd xmm3, [rel PD_DESCALE_P2] + paddd xmm7, [rel PD_DESCALE_P2] + psrad xmm3, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm1, xmm5 ; xmm1=data5 + packssdw xmm3, xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 + + uncollect_args 1 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm new file mode 100644 index 0000000000..60bf961896 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm @@ -0,0 +1,482 @@ +; +; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). 
The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_float_sse2) + +EXTN(jconst_idct_float_sse2): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + +%define original_rbp rbp + 0 +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_float_sse2) + +EXTN(jsimd_idct_float_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [workspace] + collect_args 4 + push rbx + + ; ---- Pass 1: process columns from input, store into work array. 
+ + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + lea rdi, [workspace] ; FAST_FLOAT *wsptr + mov rcx, DCTSIZE/4 ; ctr +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1, xmm2 + por xmm3, xmm4 + por xmm5, xmm6 + por xmm1, xmm3 + por xmm5, xmm7 + por xmm1, xmm5 + packsswb xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1, xmm0 + movaps xmm2, xmm0 + movaps xmm3, xmm0 + + shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) + shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) + shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) + shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn +%endif +.columnDCT: + + ; -- Even part + + movq 
xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [rel PD_1_414] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; 
xmm2=in1=(10 11 12 13) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [rel PD_1_847] ; xmm0=z5 + mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) + movaps xmm3, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps 
XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm0, xmm7 + movaps xmm3, xmm5 + addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2, xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) + movaps xmm4, xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + movaps xmm0, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6, xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + movaps xmm3, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add rsi, byte 4*SIZEOF_JCOEF ; coef_block + add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec rcx ; ctr + jnz near 
.columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + lea rsi, [workspace] ; FAST_FLOAT *wsptr + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + mov rcx, DCTSIZE/4 ; ctr +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [rel PD_1_414] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [rel PD_1_847] ; xmm0=z5 + mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; 
xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm3, xmm3 + psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm7, xmm1 + movaps xmm5, xmm3 + addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm4, xmm4 + psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + 
pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6, xmm2 + paddb xmm1, xmm2 + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 + mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 + + add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add rdi, byte 4*SIZEOF_JSAMPROW + dec rcx ; ctr + jnz near .rowloop + + pop rbx + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. 
+ align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm new file mode 100644 index 0000000000..cb97fdfbb2 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm @@ -0,0 +1,491 @@ +; +; jidctfst.asm - fast integer IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_idct_ifast_sse2) + +EXTN(jconst_idct_ifast_sse2): + +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + +%define original_rbp rbp + 0 +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_ifast_sse2) + +EXTN(jsimd_idct_ifast_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + + ; ---- Pass 1: process columns from input.
+ + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] 
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + psubw xmm0, xmm2 ; xmm0=tmp11 + psubw xmm1, xmm3 + paddw xmm4, xmm2 ; xmm4=tmp10 + paddw xmm5, xmm3 ; xmm5=tmp13 + + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1, [rel PW_F1414] + psubw xmm1, xmm5 ; xmm1=tmp12 + + movdqa xmm6, xmm4 + movdqa xmm7, xmm0 + psubw xmm4, xmm5 ; xmm4=tmp3 + psubw xmm0, xmm1 ; xmm0=tmp2 + paddw xmm6, xmm5 ; xmm6=tmp0 + paddw xmm7, xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm2 + movdqa xmm0, xmm5 + psubw xmm2, xmm1 ; xmm2=z12 + psubw xmm5, xmm3 ; xmm5=z10 + paddw xmm4, xmm1 ; xmm4=z11 + paddw xmm0, xmm3 ; xmm0=z13 + + movdqa xmm1, xmm5 ; xmm1=z10(unscaled) + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3, xmm4 + psubw xmm4, xmm0 + paddw xmm3, xmm0 ; xmm3=tmp7 + + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0, xmm5 + paddw xmm5, xmm2 + pmulhw xmm5, [rel PW_F1847] ; xmm5=z5 + pmulhw xmm0, [rel PW_MF1613] + pmulhw xmm2, [rel PW_F1082] + psubw xmm0, xmm1 + psubw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm0, xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0, xmm3 ; xmm0=tmp6 + movdqa xmm1, xmm6 + movdqa xmm5, xmm7 + paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4, xmm0 ; xmm4=tmp5 + + movdqa xmm3, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2, xmm4 ; xmm2=tmp4 + movdqa xmm5, xmm7 + movdqa xmm0, xmm1 + paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0, xmm3 ; transpose 
coefficients(phase 2) + punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7, xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7, xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + 
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] ; NOTE(review): dead load, rax is overwritten by 'mov eax, r13d' below before it is read + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2, xmm6 + movdqa xmm0, xmm5 + psubw xmm6, xmm1 ; xmm6=tmp11 + psubw xmm5, xmm3 + paddw xmm2, xmm1 ; xmm2=tmp10 + paddw xmm0, xmm3 ; xmm0=tmp13 + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [rel PW_F1414] + psubw xmm5, xmm0 ; xmm5=tmp12 + + movdqa xmm1, xmm2 + movdqa xmm3, xmm6 + psubw xmm2, xmm0 ; xmm2=tmp3 + psubw xmm6, xmm5 ; xmm6=tmp2 + paddw xmm1, xmm0 ; xmm1=tmp0 + paddw xmm3, xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2, xmm0 + movdqa xmm6, xmm4 + psubw xmm0, xmm7 ; xmm0=z12 + psubw xmm4, xmm5 ; xmm4=z10 + paddw xmm2, xmm7 ; xmm2=z11 + paddw xmm6, xmm5 ; xmm6=z13 + + movdqa xmm7, xmm4 ; xmm7=z10(unscaled) + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5, xmm2 + psubw xmm2, xmm6 + paddw xmm5, xmm6 ; xmm5=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11 + + ; To avoid overflow...
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6, xmm4 + paddw xmm4, xmm0 + pmulhw xmm4, [rel PW_F1847] ; xmm4=z5 + pmulhw xmm6, [rel PW_MF1613] + pmulhw xmm0, [rel PW_F1082] + psubw xmm6, xmm7 + psubw xmm0, xmm4 ; xmm0=tmp10 + paddw xmm6, xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6, xmm5 ; xmm6=tmp6 + movdqa xmm7, xmm1 + movdqa xmm4, xmm3 + paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1, (PASS1_BITS+3) ; descale + psraw xmm3, (PASS1_BITS+3) ; descale + psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7, (PASS1_BITS+3) ; descale + psraw xmm4, (PASS1_BITS+3) ; descale + psubw xmm2, xmm6 ; xmm2=tmp5 + + packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0, xmm2 ; xmm0=tmp4 + movdqa xmm4, xmm5 + movdqa xmm7, xmm6 + paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5, (PASS1_BITS+3) ; descale + psraw xmm6, (PASS1_BITS+3) ; descale + psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4, (PASS1_BITS+3) ; descale + psraw xmm7, (PASS1_BITS+3) ; descale + + movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1, xmm2 + paddb xmm3, xmm2 + paddb xmm5, xmm2 + paddb xmm7, xmm2 + + movdqa xmm0, xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1, 
xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 + + mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW 
[rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + ret ; NOTE(review): unreachable duplicate of the ret directly above + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm new file mode 100644 index 0000000000..ca7e317f6e --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm @@ -0,0 +1,418 @@ +; +; jidctint.asm - accurate integer IDCT (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details.
+ +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers + +%macro dotranspose 8 + ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 
51 61 71) + ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) + ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) + ; %8=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76) + + vpermq %5, %1, 0xD8 + vpermq %6, %2, 0x72 + vpermq %7, %3, 0xD8 + vpermq %8, %4, 0x72 + ; transpose coefficients(phase 1) + ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71) + ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73) + ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75) + ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77) + + vpunpcklwd %1, %5, %6 + vpunpckhwd %2, %5, %6 + vpunpcklwd %3, %7, %8 + vpunpckhwd %4, %7, %8 + ; transpose coefficients(phase 2) + ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72) + ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73) + ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76) + ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77) + + vpunpcklwd %5, %1, %2 + vpunpcklwd %6, %3, %4 + vpunpckhwd %7, %1, %2 + vpunpckhwd %8, %3, %4 + ; transpose coefficients(phase 3) + ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53) + ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57) + ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73) + ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77) + + vpunpcklqdq %1, %5, %6 + vpunpckhqdq %2, %5, %6 + vpunpcklqdq %3, %7, %8 + vpunpckhqdq %4, %7, %8 + ; transpose coefficients(phase 4) + ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) +%endmacro + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions +; %1-%4: Input/output registers +; %5-%12: Temp registers +; %13: Pass (1 or 2) + +%macro dodct 13 + ; -- Even part + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ;
tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2 + vpunpcklwd %5, %3, %6 ; %5=in26_62L + vpunpckhwd %6, %3, %6 ; %6=in26_62H + vpmaddwd %5, %5, [rel PW_F130_F054_MF130_F054] ; %5=tmp3_2L + vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=tmp3_2H + + vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0 + vpsignw %1, %1, [rel PW_1_NEG1] + vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4) + + vpxor %1, %1, %1 + vpunpcklwd %8, %1, %7 ; %8=tmp0_1L + vpunpckhwd %1, %1, %7 ; %1=tmp0_1H + vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS + vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS + + vpsubd %11, %8, %5 ; %11=tmp0_1L-tmp3_2L=tmp13_12L + vpaddd %9, %8, %5 ; %9=tmp0_1L+tmp3_2L=tmp10_11L + vpsubd %12, %1, %6 ; %12=tmp0_1H-tmp3_2H=tmp13_12H + vpaddd %10, %1, %6 ; %10=tmp0_1H+tmp3_2H=tmp10_11H + + ; -- Odd part + + vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3 + vpunpcklwd %7, %1, %8 ; %7=z34_43L + vpunpckhwd %8, %1, %8 ; %8=z34_43H + vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4L + vpmaddwd %8, %8, [rel PW_MF078_F117_F078_F117] ; %8=z3_4H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * 
(2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3 + vpunpcklwd %3, %4, %2 ; %3=in71_53L + vpunpckhwd %4, %4, %2 ; %4=in71_53H + + vpmaddwd %5, %3, [rel PW_MF060_MF089_MF050_MF256] ; %5=tmp0_1L + vpmaddwd %6, %4, [rel PW_MF060_MF089_MF050_MF256] ; %6=tmp0_1H + vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L + vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H + + vpmaddwd %3, %3, [rel PW_MF089_F060_MF256_F050] ; %3=tmp3_2L + vpmaddwd %4, %4, [rel PW_MF089_F060_MF256_F050] ; %4=tmp3_2H + vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L + vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H + vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L + vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H + + ; -- Final output stage + + vpaddd %1, %9, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L + vpaddd %2, %10, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H + vpaddd %1, %1, [rel PD_DESCALE_P %+ %13] + vpaddd %2, %2, [rel PD_DESCALE_P %+ %13] + vpsrad %1, %1, DESCALE_P %+ %13 + vpsrad %2, %2, DESCALE_P %+ %13 + vpackssdw %1, %1, %2 ; %1=data0_1 + + vpsubd %3, %9, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L + vpsubd %4, %10, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H + vpaddd %3, %3, [rel PD_DESCALE_P %+ %13] + vpaddd %4, %4, [rel PD_DESCALE_P %+ %13] + vpsrad %3, %3, DESCALE_P %+ %13 + vpsrad %4, %4, DESCALE_P %+ %13 + vpackssdw %4, %3, %4 ; %4=data7_6 + + vpaddd %7, %11, %5 ; %7=tmp13_12L+tmp0_1L=data3_2L + vpaddd %8, %12, %6 ; %8=tmp13_12H+tmp0_1H=data3_2H + vpaddd %7, %7, [rel PD_DESCALE_P %+ %13] + vpaddd %8, %8, [rel PD_DESCALE_P %+ %13] + vpsrad %7, %7, DESCALE_P %+ %13 + vpsrad %8, %8, DESCALE_P %+ %13 + vpackssdw %2, %7, %8 ; %2=data3_2 + + vpsubd %7, %11, %5 ; %7=tmp13_12L-tmp0_1L=data4_5L + vpsubd %8, %12, %6 ; %8=tmp13_12H-tmp0_1H=data4_5H + vpaddd %7, %7, [rel PD_DESCALE_P %+ %13] + vpaddd %8, %8, [rel PD_DESCALE_P 
%+ %13] + vpsrad %7, %7, DESCALE_P %+ %13 + vpsrad %8, %8, DESCALE_P %+ %13 + vpackssdw %3, %7, %8 ; %3=data4_5 +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_islow_avx2) + +EXTN(jconst_idct_islow_avx2): + +PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 + times 4 dw (F_0_541 - F_1_847), F_0_541 +PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 + times 4 dw (F_1_175 - F_0_390), F_1_175 +PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 + times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899) + times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) +PB_CENTERJSAMP times 32 db CENTERJSAMPLE +PW_1_NEG1 times 8 dw 1 + times 8 dw -1 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_avx2) + +EXTN(jsimd_idct_islow_avx2): + push rbp + mov rax, rsp ; rax = original rbp + mov rbp, rsp ; rbp = aligned rbp + push_xmm 4 + collect_args 4 + + ; ---- Pass 1: process columns.
+ +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2 + mov eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)] + vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)] + vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)] + vpor xmm1, xmm1, xmm0 + vpacksswb xmm1, xmm1, xmm1 + vpacksswb xmm1, xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)] + vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + + vpsllw xmm5, xmm5, PASS1_BITS + + vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03) + vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07) + vinserti128 ymm4, ymm4, xmm5, 1 + + vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04) + vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05) + vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06) + vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07) + + jmp near .column_end +%endif +.columnDCT: + + vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)] ; ymm4=in0_1 + vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)] ; ymm5=in2_3 + vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)] ; ymm6=in4_5 + vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)] ; ymm7=in6_7 + vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm7, 
ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + + vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4 + vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1 + vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6 + vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5 + + dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1 + ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6 + + dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7 + +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows. + + vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5 + vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1 + + dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2 + ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6 + + dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7 + + vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45 + vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67 + vpaddb ymm0, ymm0, [rel PB_CENTERJSAMP] + vpaddb ymm1, ymm1, [rel PB_CENTERJSAMP] + + vextracti128 xmm6, ymm1, 1 ; xmm6=data67 + vextracti128 xmm4, ymm0, 1 ; xmm4=data45 + vextracti128 xmm2, ymm1, 0 ; xmm2=data23 + vextracti128 xmm0, ymm0, 0 ; xmm0=data01 + + vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + vzeroupper + + mov eax, r13d + + mov rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rsip, JSAMPROW
[r12+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 + + mov rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + mov rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 + + mov rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 + + uncollect_args 4 + pop_xmm 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm new file mode 100644 index 0000000000..7aa869bc0b --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm @@ -0,0 +1,847 @@ +; +; jidctint.asm - accurate integer IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, 2020, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_islow_sse2) + +EXTN(jconst_idct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) +PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. 
+; +; GLOBAL(void) +; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + +%define original_rbp rbp + 0 +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 12 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_sse2) + +EXTN(jsimd_idct_islow_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5, PASS1_BITS + + movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5,
xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm4, xmm3 ; xmm3=in6=z3 + punpckhwd xmm5, xmm3 + movdqa xmm1, xmm4 + movdqa xmm3, xmm5 + pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L + pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H + + movdqa xmm6, xmm0 + paddw xmm0, xmm2 ; xmm0=in0+in4 + psubw xmm6, xmm2 ; xmm6=in0-in4 + + pxor xmm7, xmm7 + pxor xmm2, xmm2 + punpcklwd xmm7, xmm0 ; xmm7=tmp0L + punpckhwd xmm2, xmm0 ; xmm2=tmp0H + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld 
xmm2,CONST_BITS + + movdqa xmm0, xmm7 + paddd xmm7, xmm4 ; xmm7=tmp10L + psubd xmm0, xmm4 ; xmm0=tmp13L + movdqa xmm4, xmm2 + paddd xmm2, xmm5 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm7, xmm7 + punpcklwd xmm5, xmm6 ; xmm5=tmp1L + punpckhwd xmm7, xmm6 ; xmm7=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm2, xmm1 ; xmm2=tmp12L + movdqa xmm0, xmm7 + paddd xmm7, xmm3 ; xmm7=tmp11H + psubd xmm0, xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5, xmm6 + movdqa xmm7, xmm4 + paddw xmm5, xmm3 ; xmm5=z3 + paddw xmm7, xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2, xmm5 + movdqa xmm0, xmm5 + punpcklwd xmm2, xmm7 + punpckhwd xmm0, xmm7 + movdqa xmm5, xmm2 + movdqa xmm7, 
xmm0 + pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L + pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H + pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2, xmm3 + movdqa xmm0, xmm3 + punpcklwd xmm2, xmm4 + punpckhwd xmm0, xmm4 + movdqa xmm3, xmm2 + movdqa xmm4, xmm0 + pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L + pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H + pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L + pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3, xmm5 ; xmm3=tmp3L + paddd xmm4, xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2, xmm1 + movdqa xmm0, xmm1 + punpcklwd xmm2, xmm6 + punpckhwd xmm0, xmm6 + movdqa xmm1, xmm2 + movdqa xmm6, xmm0 + pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L + pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H + pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L + pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm2, xmm5 ; xmm2=tmp1L + paddd xmm0, xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + 
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2, xmm5 + movdqa xmm0, xmm7 + paddd xmm5, xmm3 ; xmm5=data0L + paddd xmm7, xmm4 ; xmm7=data0H + psubd xmm2, xmm3 ; xmm2=data7L + psubd xmm0, xmm4 ; xmm0=data7H + + movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] + + paddd xmm5, xmm3 + paddd xmm7, xmm3 + psrad xmm5, DESCALE_P1 + psrad xmm7, DESCALE_P1 + paddd xmm2, xmm3 + paddd xmm0, xmm3 + psrad xmm2, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7, xmm4 + movdqa xmm0, xmm3 + paddd xmm4, xmm1 ; xmm4=data1L + paddd xmm3, xmm6 ; xmm3=data1H + psubd xmm7, xmm1 ; xmm7=data6L + psubd xmm0, xmm6 ; xmm0=data6H + + movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] + + paddd xmm4, xmm1 + paddd xmm3, xmm1 + psrad xmm4, DESCALE_P1 + psrad xmm3, DESCALE_P1 + paddd xmm7, xmm1 + paddd xmm0, xmm1 + psrad xmm7, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 
; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5, xmm3 + movdqa xmm6, xmm0 + paddd xmm3, xmm4 ; xmm3=data2L + paddd xmm0, xmm2 ; xmm0=data2H + psubd xmm5, xmm4 ; xmm5=data5L + psubd xmm6, xmm2 ; xmm6=data5H + + movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] + + paddd xmm3, xmm7 + paddd xmm0, xmm7 + psrad xmm3, DESCALE_P1 + psrad xmm0, DESCALE_P1 + paddd xmm5, xmm7 + paddd xmm6, xmm7 + psrad xmm5, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0, xmm1 + movdqa xmm6, xmm4 + paddd xmm1, xmm2 ; xmm1=data3L + paddd xmm4, xmm7 ; xmm4=data3H + psubd xmm0, xmm2 ; xmm0=data4L + psubd xmm6, xmm7 ; xmm6=data4H + + movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] + + paddd xmm1, xmm2 + paddd xmm4, xmm2 + psrad xmm1, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm0, xmm2 + paddd xmm6, xmm2 + psrad xmm0, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4, xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31) 
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + 
DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] ; NOTE(review): dead load -- rax is overwritten by 'mov eax, r13d' below before any use + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm6, xmm2 ; xmm2=in6=z3 + punpckhwd xmm5, xmm2 + movdqa xmm1, xmm6 + movdqa xmm2, xmm5 + pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L + pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H + + movdqa xmm3, xmm7 + paddw xmm7, xmm0 ; xmm7=in0+in4 + psubw xmm3, xmm0 ; xmm3=in0-in4 + + pxor xmm4, xmm4 + pxor xmm0, xmm0 + punpcklwd xmm4, xmm7 ; xmm4=tmp0L + punpckhwd xmm0, xmm7 ; xmm0=tmp0H + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7, xmm4 + paddd xmm4, xmm6 ; xmm4=tmp10L + psubd xmm7, xmm6 ; xmm7=tmp13L + movdqa xmm6, xmm0 + paddd xmm0, xmm5 ; xmm0=tmp10H + psubd xmm6, xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm4, xmm4 + punpcklwd xmm5, xmm3 ; xmm5=tmp1L + punpckhwd xmm4, xmm3 ; xmm4=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm0, xmm1
; xmm0=tmp12L + movdqa xmm7, xmm4 + paddd xmm4, xmm2 ; xmm4=tmp11H + psubd xmm7, xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5, xmm6 + movdqa xmm4, xmm3 + paddw xmm5, xmm1 ; xmm5=z3 + paddw xmm4, xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0, xmm5 + movdqa xmm7, xmm5 + punpcklwd xmm0, xmm4 + punpckhwd xmm7, xmm4 + movdqa xmm5, xmm0 + movdqa xmm4, xmm7 + pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L + pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H + pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0, xmm1 + movdqa xmm7, xmm1 + punpcklwd xmm0, xmm3 + 
punpckhwd xmm7, xmm3 + movdqa xmm1, xmm0 + movdqa xmm3, xmm7 + pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L + pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H + pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L + pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1, xmm5 ; xmm1=tmp3L + paddd xmm3, xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm0, xmm6 + punpckhwd xmm7, xmm6 + movdqa xmm2, xmm0 + movdqa xmm6, xmm7 + pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L + pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H + pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L + pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm0, xmm5 ; xmm0=tmp1L + paddd xmm7, xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0, xmm5 + movdqa xmm7, xmm4 + paddd xmm5, xmm1 ; xmm5=data0L + paddd xmm4, xmm3 ; xmm4=data0H + psubd xmm0, xmm1 ; xmm0=data7L + psubd xmm7, xmm3 ; xmm7=data7H + + movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] + + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrad xmm5, DESCALE_P2 + psrad xmm4, DESCALE_P2 + paddd xmm0, xmm1 + paddd xmm7, xmm1 + psrad xmm0, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4, xmm3 + movdqa xmm7, xmm1 + paddd xmm3, xmm2 ; xmm3=data1L + paddd xmm1, xmm6 ; xmm1=data1H + psubd xmm4, xmm2 ; xmm4=data6L + psubd xmm7, xmm6 ; 
xmm7=data6H + + movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] + + paddd xmm3, xmm2 + paddd xmm1, xmm2 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm4, xmm2 + paddd xmm7, xmm2 + psrad xmm4, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4, xmm6 + movdqa xmm0, xmm2 + paddd xmm6, xmm1 ; xmm6=data2L + paddd xmm2, xmm7 ; xmm2=data2H + psubd xmm4, xmm1 ; xmm4=data5L + psubd xmm0, xmm7 ; xmm0=data5H + + movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] + + paddd xmm6, xmm5 + paddd xmm2, xmm5 + psrad xmm6, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm4, xmm5 + paddd xmm0, xmm5 + psrad xmm4, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2, xmm3 + movdqa xmm0, xmm1 + paddd xmm3, xmm7 ; xmm3=data3L + paddd xmm1, xmm5 ; xmm1=data3H + psubd xmm2, xmm7 ; xmm2=data4L + psubd xmm0, xmm5 ; xmm0=data4H + + movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] + + paddd xmm3, xmm7 + paddd xmm1, xmm7 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm2, xmm7 + paddd xmm0, xmm7 + psrad xmm2, 
DESCALE_P2 + psrad xmm0, DESCALE_P2 + + movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] + + packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7, xmm5 + paddb xmm1, xmm5 + paddb xmm6, xmm5 + paddb xmm3, xmm5 + + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 
02 03 04 05 06 07) + pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 + mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 + + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm new file mode 100644 index 0000000000..4ece9d891c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm @@ -0,0 +1,574 @@ +; +; jidctred.asm - reduced-size IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1) +%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1) +%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2) +%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. 
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_red_sse2) + +EXTN(jconst_idct_red_sse2): + +PW_F184_MF076 times 4 dw F_1_847, -F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061, -F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509 +PW_F145_MF021 times 4 dw F_1_451, -F_0_211 +PW_F362_MF127 times 4 dw F_3_624, -F_1_272 +PW_F085_MF072 times 4 dw F_0_850, -F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 
4x4 output block. +; +; GLOBAL(void) +; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + +%define original_rbp rbp + 0 +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_4x4_sse2) + +EXTN(jsimd_idct_4x4_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm0, xmm1 + packsswb xmm0, xmm0 + packsswb xmm0, xmm0 + movd eax, xmm0 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0, PASS1_BITS + + movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 
06 06 06 07 07 07 07) + + jmp near .column_end +%endif +.columnDCT: + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L) + pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H) + pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L) + pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H) + + movdqa xmm6, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm6, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L) + pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H) + pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L) + pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H) + + paddd xmm6, xmm4 ; xmm6=tmp2L + paddd xmm7, xmm5 ; xmm7=tmp2H + paddd xmm2, xmm0 ; xmm2=tmp0L + paddd xmm3, xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1, xmm1 + pxor xmm2, xmm2 + punpcklwd xmm1, xmm4 ; xmm1=tmp0L + punpckhwd xmm2, xmm4 ; xmm2=tmp0H + psrad xmm1, (16-CONST_BITS-1) ; psrad 
xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3, xmm5 ; xmm5=in2=z2 + punpcklwd xmm5, xmm0 ; xmm0=in6=z3 + punpckhwd xmm3, xmm0 + pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L + pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H + + movdqa xmm4, xmm1 + movdqa xmm0, xmm2 + paddd xmm1, xmm5 ; xmm1=tmp10L + paddd xmm2, xmm3 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp12L + psubd xmm0, xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5, xmm1 + movdqa xmm3, xmm2 + paddd xmm1, xmm6 ; xmm1=data0L + paddd xmm2, xmm7 ; xmm2=data0H + psubd xmm5, xmm6 ; xmm5=data3L + psubd xmm3, xmm7 ; xmm3=data3H + + movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] + + paddd xmm1, xmm6 + paddd xmm2, xmm6 + psrad xmm1, DESCALE_P1_4 + psrad xmm2, DESCALE_P1_4 + paddd xmm5, xmm6 + paddd xmm3, xmm6 + psrad xmm5, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2, xmm4 + movdqa xmm3, xmm0 + paddd xmm4, xmm7 ; xmm4=data1L + paddd xmm0, xmm6 ; xmm0=data1H + psubd xmm2, xmm7 ; xmm2=data2L + psubd xmm3, xmm6 ; xmm3=data2H + + movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] + + paddd xmm4, xmm7 + paddd xmm0, xmm7 + psrad xmm4, DESCALE_P1_4 + psrad xmm0, DESCALE_P1_4 + paddd xmm2, xmm7 + paddd xmm3, xmm7 + psrad xmm2, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7, xmm5 
; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + pxor xmm4, xmm4 + punpcklwd xmm4, xmm1 ; xmm4=tmp0 + psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + + ; -- Odd part + + punpckhwd xmm1, xmm0 + punpckhwd xmm6, xmm3 + movdqa xmm5, xmm1 + movdqa xmm2, xmm6 + pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2) + pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2) + pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0) + pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0) + + paddd xmm6, xmm1 ; xmm6=tmp2 + paddd xmm2, xmm5 ; xmm2=tmp0 + + ; -- Even part + + punpcklwd xmm0, xmm3 + pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2 + + movdqa xmm7, xmm4 + paddd xmm4, xmm0 ; xmm4=tmp10 + psubd xmm7, xmm0 ; xmm7=tmp12 + + ; -- Final output stage + + movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] + + movdqa xmm5, xmm4 + movdqa xmm3, xmm7 + paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32) + + paddd xmm4, xmm1 + paddd xmm7, xmm1 + psrad xmm4, DESCALE_P2_4 + psrad xmm7, DESCALE_P2_4 + paddd xmm5, xmm1 + paddd xmm3, xmm1 + psrad xmm5, DESCALE_P2_4 + psrad 
xmm3, DESCALE_P2_4 + + packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + + movdqa xmm0, xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + + movdqa xmm6, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + + packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4, [rel PB_CENTERJSAMP] + + pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + + mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + mov rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + + align 32 + GLOBAL_FUNCTION(jsimd_idct_2x2_sse2) + +EXTN(jsimd_idct_2x2_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + ; ---- Pass 1: process columns from input. 
+ + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + + pcmpeqd xmm7, xmm7 + pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + + movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4, [rel PW_F362_MF127] + pmaddwd xmm5, [rel PW_F085_MF072] + + psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0, [rel PW_F362_MF127] + pmaddwd xmm2, [rel PW_F085_MF072] + + paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + + ; -- Even part + + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD 
[XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm6=(00 01 ** 03 ** 05 ** 07) + + movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + + ; -- Final output stage + + movdqa xmm3, xmm6 + movdqa xmm5, xmm1 + paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + + movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] + + punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **) + + movdqa xmm7, xmm1 + punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7) + + paddd xmm6, xmm2 + psrad xmm6, DESCALE_P1_2 + + paddd xmm1, xmm2 + paddd xmm7, xmm2 + psrad xmm1, DESCALE_P1_2 + psrad xmm7, DESCALE_P1_2 + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. 
+ + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1, [rel PW_F362_MF127] + pmaddwd xmm7, [rel PW_F085_MF072] + + paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + + ; -- Even part + + pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + + ; -- Final output stage + + movdqa xmm4, xmm6 + paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + + punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1) + + paddd xmm6, [rel PD_DESCALE_P2_2] + psrad xmm6, DESCALE_P2_2 + + packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6, [rel PB_CENTERJSAMP] + + pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --) + pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --) + + mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov word [rdx+rax*SIZEOF_JSAMPLE], bx + mov word [rsi+rax*SIZEOF_JSAMPLE], cx + + pop rbx + uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm new file mode 100644 index 0000000000..ab2e3954f6 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm @@ -0,0 +1,155 @@ +; +; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11d = JDIMENSION start_col +; r12 = FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_float_sse2) + +EXTN(jsimd_convsamp_float_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + push rbx + + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 + packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) 
+ + mov rsi, r10 + mov eax, r11d + mov rdi, r12 + mov rcx, DCTSIZE/2 +.convloop: + mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] + + psubb xmm0, xmm7 ; xmm0=(01234567) + psubb xmm1, xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2, xmm2 ; xmm2=(0123) + cvtdq2ps xmm0, xmm0 ; xmm0=(4567) + psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3, xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + + add rsi, byte 2*SIZEOF_JSAMPROW + add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz short .convloop + + pop rbx + uncollect_args 3 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +; r10 = JCOEFPTR coef_block +; r11 = FAST_FLOAT *divisors +; r12 = FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_float_sse2) + +EXTN(jsimd_quantize_float_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, 
DCTSIZE2/16 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 + + add rsi, byte 16*SIZEOF_FAST_FLOAT + add rdx, byte 16*SIZEOF_FAST_FLOAT + add rdi, byte 16*SIZEOF_JCOEF + dec rax + jnz short .quantloop + + uncollect_args 3 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm new file mode 100644 index 0000000000..70fe81139c --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm @@ -0,0 +1,163 @@ +; +; jquanti.asm - sample data conversion and quantization (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, 2018, D. R. Commander. +; Copyright (C) 2016, Matthieu Darbois. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11d = JDIMENSION start_col +; r12 = DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_avx2) + +EXTN(jsimd_convsamp_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + + mov eax, r11d + + mov rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + mov rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + mov rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + mov rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + vpmovzxbw ymm0, xmm0 ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + vpmovzxbw ymm1, xmm1 ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + vpmovzxbw ymm2, xmm2 ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + vpmovzxbw ymm3, xmm3 ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + vpcmpeqw ymm7, ymm7, ymm7 + vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 
0xFF80 0xFF80 0xFF80 ..} + + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm1, ymm1, ymm7 + vpaddw ymm2, ymm2, ymm7 + vpaddw ymm3, ymm3, ymm7 + + vmovdqu YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0 + vmovdqu YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1 + vmovdqu YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2 + vmovdqu YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3 + + vzeroupper + uncollect_args 3 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) + +; r10 = JCOEFPTR coef_block +; r11 = DCTELEM *divisors +; r12 = DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_avx2) + +EXTN(jsimd_quantize_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + + vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)] + vpabsw ymm0, ymm4 + vpabsw ymm1, ymm5 + vpabsw ymm2, ymm6 + vpabsw ymm3, ymm7 + + vpaddw ymm0, YMMWORD [CORRECTION(0,0,r11)] ; correction + roundfactor + vpaddw ymm1, YMMWORD [CORRECTION(2,0,r11)] + vpaddw ymm2, YMMWORD [CORRECTION(4,0,r11)] + vpaddw ymm3, YMMWORD [CORRECTION(6,0,r11)] + vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,r11)] ; reciprocal + vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,r11)] + vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,r11)] + vpmulhuw ymm3, YMMWORD 
[RECIPROCAL(6,0,r11)] + vpmulhuw ymm0, YMMWORD [SCALE(0,0,r11)] ; scale + vpmulhuw ymm1, YMMWORD [SCALE(2,0,r11)] + vpmulhuw ymm2, YMMWORD [SCALE(4,0,r11)] + vpmulhuw ymm3, YMMWORD [SCALE(6,0,r11)] + + vpsignw ymm0, ymm0, ymm4 + vpsignw ymm1, ymm1, ymm5 + vpsignw ymm2, ymm2, ymm6 + vpsignw ymm3, ymm3, ymm7 + + vmovdqu [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0 + vmovdqu [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1 + vmovdqu [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2 + vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3 + + vzeroupper + uncollect_args 3 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm new file mode 100644 index 0000000000..3ee442027a --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm @@ -0,0 +1,188 @@ +; +; jquanti.asm - sample data conversion and quantization (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11d = JDIMENSION start_col +; r12 = DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_sse2) + +EXTN(jsimd_convsamp_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + push rbx + + pxor xmm6, xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov rsi, r10 + mov eax, r11d + mov rdi, r12 + mov rcx, DCTSIZE/4 +.convloop: + mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0, xmm6 ; xmm0=(01234567) + punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF) + paddw xmm0, xmm7 + paddw xmm1, xmm7 + punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2, xmm7 + paddw xmm3, xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 4*SIZEOF_JSAMPROW + add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec rcx + jnz short .convloop + + 
pop rbx + uncollect_args 3 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) + +; r10 = JCOEFPTR coef_block +; r11 = DCTELEM *divisors +; r12 = DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_sse2) + +EXTN(jsimd_quantize_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/32 +.quantloop: + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + psraw xmm4, (WORD_BIT-1) + psraw xmm5, (WORD_BIT-1) + psraw xmm6, (WORD_BIT-1) + psraw xmm7, (WORD_BIT-1) + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal + pmulhuw xmm1, 
XMMWORD [RECIPROCAL(1,0,rdx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] + + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 + psubw xmm1, xmm5 + psubw xmm2, xmm6 + psubw xmm3, xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 32*SIZEOF_DCTELEM + add rdx, byte 32*SIZEOF_DCTELEM + add rdi, byte 32*SIZEOF_JCOEF + dec rax + jnz near .quantloop + + uncollect_args 3 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c new file mode 100644 index 0000000000..584a010ad3 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c @@ -0,0 +1,1068 @@ +/* + * jsimd_x86_64.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 64-bit x86 architecture. 
+ */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "jconfigint.h" + +/* + * In the PIC cases, we have no guarantee that constants will keep + * their alignment. This macro allows us to verify it at runtime. + */ +#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0) + +#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ +#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */ + +static unsigned int simd_support = (unsigned int)(~0); +static unsigned int simd_huffman = 1; + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char env[2] = { 0 }; +#endif + + if (simd_support != ~0U) + return; + + simd_support = jpeg_simd_cpu_support(); + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1")) + simd_support &= JSIMD_SSE2; + if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1")) + simd_support &= JSIMD_AVX2; + if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1")) + simd_support = 0; + if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1")) + simd_huffman = 0; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for 
these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_extrgb_ycc_convert_avx2; + sse2fct = jsimd_extrgb_ycc_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_extrgbx_ycc_convert_avx2; + sse2fct = jsimd_extrgbx_ycc_convert_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_extbgr_ycc_convert_avx2; + sse2fct = jsimd_extbgr_ycc_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_extbgrx_ycc_convert_avx2; + sse2fct = jsimd_extbgrx_ycc_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_extxbgr_ycc_convert_avx2; + sse2fct = jsimd_extxbgr_ycc_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_extxrgb_ycc_convert_avx2; + sse2fct = 
jsimd_extxrgb_ycc_convert_sse2; + break; + default: + avx2fct = jsimd_rgb_ycc_convert_avx2; + sse2fct = jsimd_rgb_ycc_convert_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_extrgb_gray_convert_avx2; + sse2fct = jsimd_extrgb_gray_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_extrgbx_gray_convert_avx2; + sse2fct = jsimd_extrgbx_gray_convert_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_extbgr_gray_convert_avx2; + sse2fct = jsimd_extbgr_gray_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_extbgrx_gray_convert_avx2; + sse2fct = jsimd_extbgrx_gray_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_extxbgr_gray_convert_avx2; + sse2fct = jsimd_extxbgr_gray_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_extxrgb_gray_convert_avx2; + sse2fct = jsimd_extxrgb_gray_convert_sse2; + break; + default: + avx2fct = jsimd_rgb_gray_convert_avx2; + sse2fct = jsimd_rgb_gray_convert_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, 
int); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_ycc_extrgb_convert_avx2; + sse2fct = jsimd_ycc_extrgb_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_ycc_extrgbx_convert_avx2; + sse2fct = jsimd_ycc_extrgbx_convert_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_ycc_extbgr_convert_avx2; + sse2fct = jsimd_ycc_extbgr_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_ycc_extbgrx_convert_avx2; + sse2fct = jsimd_ycc_extbgrx_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_ycc_extxbgr_convert_avx2; + sse2fct = jsimd_ycc_extxbgr_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_ycc_extxrgb_convert_avx2; + sse2fct = jsimd_ycc_extxrgb_convert_sse2; + break; + default: + avx2fct = jsimd_ycc_rgb_convert_avx2; + sse2fct = jsimd_ycc_rgb_convert_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + else + sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + 
+GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else + jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else + jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else + jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, 
output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else + jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else + jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + 
output_data_ptr); + else + jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2; + sse2fct = 
jsimd_h2v2_extxbgr_merged_upsample_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2; + break; + default: + avx2fct = jsimd_h2v2_merged_upsample_avx2; + sse2fct = jsimd_h2v2_merged_upsample_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2; + break; + default: + avx2fct = jsimd_h2v1_merged_upsample_avx2; + sse2fct = jsimd_h2v1_merged_upsample_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else + sse2fct(cinfo->output_width, input_buf, 
in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + if (simd_support & JSIMD_AVX2) + jsimd_convsamp_avx2(sample_data, start_col, workspace); + else + jsimd_convsamp_sse2(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ + jsimd_convsamp_float_sse2(sample_data, start_col, workspace); +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if 
(DCTSIZE != 8) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_fdct_islow_avx2(data); + else + jsimd_fdct_islow_sse2(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_sse2(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ + jsimd_fdct_float_sse(data); +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + if (simd_support & JSIMD_AVX2) + jsimd_quantize_avx2(coef_block, divisors, workspace); + else + jsimd_quantize_sse2(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ + jsimd_quantize_float_sse2(coef_block, divisors, workspace); +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) 
+jsimd_can_idct_4x4(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; 
+ if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + if (sizeof(FLOAT_MULT_TYPE) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_AVX2) + jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && simd_huffman && + IS_ALIGNED_SSE(jconst_huff_encode_one_block)) + return 1; + + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val, + dctbl, actbl); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int 
*jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ + jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, + Sl, Al, values, zerobits); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return jsimd_encode_mcu_AC_refine_prepare_sse2(block, + jpeg_natural_order_start, + Sl, Al, absvalues, bits); +} diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm new file mode 100644 index 0000000000..705f813d7d --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm @@ -0,0 +1,86 @@ +; +; jsimdcpu.asm - SIMD instruction support check +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). 
+; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Check if the CPU supports SIMD instructions +; +; GLOBAL(unsigned int) +; jpeg_simd_cpu_support(void) +; + + align 32 + GLOBAL_FUNCTION(jpeg_simd_cpu_support) + +EXTN(jpeg_simd_cpu_support): + push rbx + push rdi + + xor rdi, rdi ; simd support flag + + ; Assume that all x86-64 processors support SSE & SSE2 instructions + or rdi, JSIMD_SSE2 + or rdi, JSIMD_SSE + + ; Check whether CPUID leaf 07H is supported + ; (leaf 07H is used to check for AVX2 instruction support) + mov rax, 0 + cpuid + cmp rax, 7 + jl short .return ; Maximum leaf < 07H + + ; Check for AVX2 instruction support + mov rax, 7 + xor rcx, rcx + cpuid + mov rax, rbx ; rax = Extended feature flags + + test rax, 1<<5 ; bit5:AVX2 + jz short .return + + ; Check for AVX2 O/S support + mov rax, 1 + xor rcx, rcx + cpuid + test rcx, 1<<27 + jz short .return ; O/S does not support XSAVE + test rcx, 1<<28 + jz short .return ; CPU does not support AVX2 + + xor rcx, rcx + xgetbv + and rax, 6 + cmp rax, 6 ; O/S does not manage XMM/YMM state + ; using XSAVE + jnz short .return + + or rdi, JSIMD_AVX2 + +.return: + mov rax, rdi + + pop rdi + pop rbx + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32