diff --git a/package.json b/package.json
index 7d25f7fdf..d64617d85 100644
--- a/package.json
+++ b/package.json
@@ -15,6 +15,7 @@
     "libsodium": "^0.7.9",
     "libsodium-wrappers": "^0.7.9",
     "ts-proto": "^1.101.0",
+    "wasm-feature-detect": "^1.2.11",
     "zstddec": "^0.0.2"
   }
 }
diff --git a/src/codec.js b/src/codec.js
index 6dc7db92a..e35f72d85 100644
--- a/src/codec.js
+++ b/src/codec.js
@@ -19,14 +19,17 @@
 OGVDecoderVideoAV1MTW: 'ogv-decoder-video-av1-mt-wasm.js',
 OGVDecoderVideoAV1SIMDMTW: 'ogv-decoder-video-av1-simd-mt-wasm.js',
 */
+import { simd } from "wasm-feature-detect";
 
-export function loadVp9(callback) {
+export async function loadVp9(callback) {
   // Multithreading is used only if `options.threading` is true.
   // This requires browser support for the new `SharedArrayBuffer` and `Atomics` APIs,
   // currently available in Firefox and Chrome with experimental flags enabled.
   // All major browsers have disabled SharedArrayBuffer by default since January 5, 2018.
+  const isSIMD = await simd();
+  console.log('isSIMD: ' + isSIMD);
   window.OGVLoader.loadClass(
-    "OGVDecoderVideoVP9SIMDW",
+    isSIMD ? "OGVDecoderVideoVP9SIMDW" : "OGVDecoderVideoVP9W",
     (videoCodecClass) => {
       window.videoCodecClass = videoCodecClass;
       videoCodecClass({ videoFormat: {} }).then((decoder) => {
diff --git a/src/common.ts b/src/common.ts
index 0355d3607..62438ddef 100644
--- a/src/common.ts
+++ b/src/common.ts
@@ -46,7 +46,7 @@ const zCode = "z".charCodeAt(0);
 const aCode = "a".charCodeAt(0);
 
 export function mapKey(name: string, isDesktop: Boolean) {
-  const tmp = KEY_MAP[name];
+  const tmp = KEY_MAP[name] || name;
   if (tmp.length == 1) {
     const chr = tmp.charCodeAt(0);
     if (!isDesktop && (chr > zCode || chr < aCode))
diff --git a/src/connection.ts b/src/connection.ts
index 6f75498c0..85249f45e 100644
--- a/src/connection.ts
+++ b/src/connection.ts
@@ -487,16 +487,16 @@
   ) {
     const key_event = mapKey(name, globals.isDesktop());
     if (!key_event) return;
-    if (alt && name == "VK_MENU") {
+    if (alt && (name == "Alt" || name == 'RAlt')) {
       alt = false;
     }
-    if (ctrl && name == "VK_CONTROL") {
+    if (ctrl && (name == "Control" || name == 'RControl')) {
       ctrl = false;
     }
-    if (shift && name == "VK_SHIFT") {
+    if (shift && (name == "Shift" || name == 'RShift')) {
       shift = false;
     }
-    if (command && name == "Meta") {
+    if (command && (name == "Meta" || name == 'RWin')) {
       command = false;
     }
     key_event.down = down;
diff --git a/src/globals.js b/src/globals.js
index 78f029043..a7ceaf2b3 100644
--- a/src/globals.js
+++ b/src/globals.js
@@ -20,7 +20,7 @@ window.isMobile = () => {
 }
 
 export function isDesktop() {
-  return !isMobile;
+  return !isMobile();
 }
 
 export function msgbox(type, title, text) {
@@ -184,7 +184,7 @@ window.setByName = (name, value) => {
       break;
     case 'input_key':
       value = JSON.parse(value);
-      curConn.inputKey(value.name, value.down || false, value.press || false, value.alt || false, value.ctrl || false, value.shift || false, value.command || false);
+      curConn.inputKey(value.name, value.down == 'true', value.press == 'true', value.alt == 'true', value.ctrl == 'true', value.shift == 'true', value.command == 'true');
       break;
     case 'input_string':
       curConn.inputString(value);
@@ -213,7 +213,7 @@
         case 'wheel':
           mask |= 4 << 3;
       }
-      curConn.inputMouse(mask, parseInt(value.x || '0'), parseInt(value.y || '0'), value.alt || false, value.ctrl || false, value.shift || false, value.command || false);
+      curConn.inputMouse(mask, parseInt(value.x || '0'), parseInt(value.y || '0'), value.alt == 'true', value.ctrl == 'true',
value.shift == 'true', value.command == 'true'); break; case 'option': value = JSON.parse(value); diff --git a/yarn.lock b/yarn.lock index 27da14f94..af7ff0b7d 100644 --- a/yarn.lock +++ b/yarn.lock @@ -358,6 +358,11 @@ vite@^2.7.2: optionalDependencies: fsevents "~2.3.2" +wasm-feature-detect@^1.2.11: + version "1.2.11" + resolved "https://registry.yarnpkg.com/wasm-feature-detect/-/wasm-feature-detect-1.2.11.tgz#e21992fd1f1d41a47490e392a5893cb39d81e29e" + integrity sha512-HUqwaodrQGaZgz1lZaNioIkog9tkeEJjrM3eq4aUL04whXOVDRc/o2EGb/8kV0QX411iAYWEqq7fMBmJ6dKS6w== + zstddec@^0.0.2: version "0.0.2" resolved "https://registry.yarnpkg.com/zstddec/-/zstddec-0.0.2.tgz#57e2f28dd1ff56b750e07d158a43f0611ad9eeb4" diff --git a/yuv_rgb.c b/yuv_rgb.c index fb4a703d1..fdf47269b 100644 --- a/yuv_rgb.c +++ b/yuv_rgb.c @@ -9,6 +9,12 @@ typedef __i64x2 __m128i; #define _mm_load_si128 wasm_v128_load +#define _mm_add_epi8 wasm_i8x16_add +#define _mm_set1_epi8 wasm_i8x16_splat +#define _mm_srai_epi16 wasm_i16x8_shr +#define _mm_mullo_epi16 wasm_i16x8_mul +#define _mm_sub_epi8 wasm_i8x16_sub +#define _mm_setzero_si128 wasm_i64x2_const #include @@ -433,493 +439,6 @@ void nv21_rgb24_std( } } - -#ifdef __SSE2__ - -//see rgb.txt -#define UNPACK_RGB24_32_STEP(RS1, RS2, RS3, RS4, RS5, RS6, RD1, RD2, RD3, RD4, RD5, RD6) \ -RD1 = _mm_unpacklo_epi8(RS1, RS4); \ -RD2 = _mm_unpackhi_epi8(RS1, RS4); \ -RD3 = _mm_unpacklo_epi8(RS2, RS5); \ -RD4 = _mm_unpackhi_epi8(RS2, RS5); \ -RD5 = _mm_unpacklo_epi8(RS3, RS6); \ -RD6 = _mm_unpackhi_epi8(RS3, RS6); - -#define RGB2YUV_16(R, G, B, Y, U, V) \ -Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(G, _mm_set1_epi16(param->g_factor))); \ -Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->b_factor))); \ -Y = _mm_srli_epi16(Y, 8); \ -U = _mm_mullo_epi16(_mm_sub_epi16(B, Y), _mm_set1_epi16(param->cb_factor)); \ -U = _mm_add_epi16(_mm_srai_epi16(U, 8), _mm_set1_epi16(128)); \ -V = _mm_mullo_epi16(_mm_sub_epi16(R, Y), _mm_set1_epi16(param->cr_factor)); \ -V = _mm_add_epi16(_mm_srai_epi16(V, 8), _mm_set1_epi16(128)); \ -Y = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(Y, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); - -#define RGB2YUV_32 \ - __m128i r_16, g_16, b_16; \ - __m128i y1_16, y2_16, cb1_16, cb2_16, cr1_16, cr2_16, Y, cb, cr; \ - __m128i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; \ - __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \ - rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \ - rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \ - rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \ - rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \ - rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \ - /* unpack rgb24 data to r, g and b data in separate channels*/ \ - /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \ - /* here, because averaging in horizontal direction is easier like this*/ \ - /* The last step is applied further on the Y channel only*/ \ - UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \ - UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \ - UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \ - UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \ - /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \ - /* Y is saved for each pixel, while only 
sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \ - r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \ - g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \ - b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \ - y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y1_16 = _mm_srli_epi16(y1_16, 8); \ - cb1_16 = _mm_sub_epi16(b_16, y1_16); \ - cr1_16 = _mm_sub_epi16(r_16, y1_16); \ - r_16 = _mm_unpacklo_epi8(rgb4, _mm_setzero_si128()); \ - g_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \ - b_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \ - y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y2_16 = _mm_srli_epi16(y2_16, 8); \ - cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \ - cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \ - /* Rescale Y' to Y, pack it to 8bit values and save it */ \ - y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - Y = _mm_packus_epi16(y1_16, y2_16); \ - Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ - SAVE_SI128((__m128i*)(y_ptr1), Y); \ - /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \ - /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \ - r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \ - g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \ - b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \ - y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y1_16 = _mm_srli_epi16(y1_16, 8); \ - cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y1_16)); \ - cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y1_16)); \ - r_16 = _mm_unpackhi_epi8(rgb4, _mm_setzero_si128()); \ - g_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \ - b_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \ - y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y2_16 = _mm_srli_epi16(y2_16, 8); \ - cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \ - cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \ - /* Rescale Y' to Y, pack it to 8bit values and save it */ \ - y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - Y = _mm_packus_epi16(y1_16, y2_16); \ - Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ - SAVE_SI128((__m128i*)(y_ptr2), Y); \ - /* Rescale Cb and Cr to their final range */ \ - cb1_16 = 
_mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb1_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \ - cr1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr1_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \ - \ - /* do the same again with next data */ \ - rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)), \ - rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)), \ - rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)), \ - rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)), \ - rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)), \ - rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \ - /* unpack rgb24 data to r, g and b data in separate channels*/ \ - /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \ - /* here, because averaging in horizontal direction is easier like this*/ \ - /* The last step is applied further on the Y channel only*/ \ - UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \ - UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \ - UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \ - UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \ - /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \ - /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \ - r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \ - g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \ - b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \ - y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y1_16 = _mm_srli_epi16(y1_16, 8); \ - cb2_16 = _mm_sub_epi16(b_16, y1_16); \ - cr2_16 = _mm_sub_epi16(r_16, y1_16); \ - r_16 = _mm_unpacklo_epi8(rgb4, _mm_setzero_si128()); \ - g_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \ - b_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \ - y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y2_16 = _mm_srli_epi16(y2_16, 8); \ - cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \ - cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \ - /* Rescale Y' to Y, pack it to 8bit values and save it */ \ - y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - Y = _mm_packus_epi16(y1_16, y2_16); \ - Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ - SAVE_SI128((__m128i*)(y_ptr1+16), Y); \ - /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \ - /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \ - r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \ - g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \ - b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \ - y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, 
_mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y1_16 = _mm_srli_epi16(y1_16, 8); \ - cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y1_16)); \ - cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y1_16)); \ - r_16 = _mm_unpackhi_epi8(rgb4, _mm_setzero_si128()); \ - g_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \ - b_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \ - y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y2_16 = _mm_srli_epi16(y2_16, 8); \ - cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \ - cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \ - /* Rescale Y' to Y, pack it to 8bit values and save it */ \ - y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - Y = _mm_packus_epi16(y1_16, y2_16); \ - Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ - SAVE_SI128((__m128i*)(y_ptr2+16), Y); \ - /* Rescale Cb and Cr to their final range */ \ - cb2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb2_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \ - cr2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr2_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \ - /* Pack and save Cb Cr */ \ - cb = _mm_packus_epi16(cb1_16, cb2_16); \ - cr = _mm_packus_epi16(cr1_16, cr2_16); \ - SAVE_SI128((__m128i*)(u_ptr), cb); \ - SAVE_SI128((__m128i*)(v_ptr), cr); - - -void rgb24_yuv420_sse(uint32_t width, uint32_t height, - const uint8_t *RGB, uint32_t RGB_stride, - uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, - YCbCrType yuv_type) -{ - #define LOAD_SI128 _mm_load_si128 - #define SAVE_SI128 _mm_stream_si128 - const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); - - uint32_t x, y; - for(y=0; y<(height-1); y+=2) - { - const uint8_t *rgb_ptr1=RGB+y*RGB_stride, - *rgb_ptr2=RGB+(y+1)*RGB_stride; - - uint8_t *y_ptr1=Y+y*Y_stride, - *y_ptr2=Y+(y+1)*Y_stride, - *u_ptr=U+(y/2)*UV_stride, - *v_ptr=V+(y/2)*UV_stride; - - for(x=0; x<(width-31); x+=32) - { - RGB2YUV_32 - - rgb_ptr1+=96; - rgb_ptr2+=96; - y_ptr1+=32; - y_ptr2+=32; - u_ptr+=16; - v_ptr+=16; - } - } - #undef LOAD_SI128 - #undef SAVE_SI128 -} - -void rgb24_yuv420_sseu(uint32_t width, uint32_t height, - const uint8_t *RGB, uint32_t RGB_stride, - uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, - YCbCrType yuv_type) -{ - #define LOAD_SI128 _mm_loadu_si128 - #define SAVE_SI128 _mm_storeu_si128 - const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); - - uint32_t x, y; - for(y=0; y<(height-1); y+=2) - { - const uint8_t *rgb_ptr1=RGB+y*RGB_stride, - *rgb_ptr2=RGB+(y+1)*RGB_stride; - - uint8_t *y_ptr1=Y+y*Y_stride, - *y_ptr2=Y+(y+1)*Y_stride, - *u_ptr=U+(y/2)*UV_stride, - *v_ptr=V+(y/2)*UV_stride; - - for(x=0; x<(width-31); x+=32) - { - RGB2YUV_32 - - rgb_ptr1+=96; - rgb_ptr2+=96; - y_ptr1+=32; - y_ptr2+=32; - u_ptr+=16; - v_ptr+=16; - } - } - #undef LOAD_SI128 - #undef SAVE_SI128 -} - - -// see rgba.txt -#define UNPACK_RGB32_32_STEP(RS1, RS2, RS3, RS4, RS5, 
RS6, RS7, RS8, RD1, RD2, RD3, RD4, RD5, RD6, RD7, RD8) \ -RD1 = _mm_unpacklo_epi8(RS1, RS5); \ -RD2 = _mm_unpackhi_epi8(RS1, RS5); \ -RD3 = _mm_unpacklo_epi8(RS2, RS6); \ -RD4 = _mm_unpackhi_epi8(RS2, RS6); \ -RD5 = _mm_unpacklo_epi8(RS3, RS7); \ -RD6 = _mm_unpackhi_epi8(RS3, RS7); \ -RD7 = _mm_unpacklo_epi8(RS4, RS8); \ -RD8 = _mm_unpackhi_epi8(RS4, RS8); - - -#define RGBA2YUV_32 \ - __m128i r_16, g_16, b_16; \ - __m128i y1_16, y2_16, cb1_16, cb2_16, cr1_16, cr2_16, Y, cb, cr; \ - __m128i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; \ - __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \ - rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \ - rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \ - rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)), \ - rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \ - rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \ - rgb7 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)), \ - rgb8 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \ - /* unpack rgb24 data to r, g and b data in separate channels*/ \ - /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \ - /* here, because averaging in horizontal direction is easier like this*/ \ - /* The last step is applied further on the Y channel only*/ \ - UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \ - UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \ - UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \ - UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \ - /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \ - /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \ - r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \ - g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \ - b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \ - y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y1_16 = _mm_srli_epi16(y1_16, 8); \ - cb1_16 = _mm_sub_epi16(b_16, y1_16); \ - cr1_16 = _mm_sub_epi16(r_16, y1_16); \ - r_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \ - g_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \ - b_16 = _mm_unpacklo_epi8(rgb7, _mm_setzero_si128()); \ - y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y2_16 = _mm_srli_epi16(y2_16, 8); \ - cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \ - cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \ - /* Rescale Y' to Y, pack it to 8bit values and save it */ \ - y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - Y = _mm_packus_epi16(y1_16, y2_16); \ - Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ - SAVE_SI128((__m128i*)(y_ptr1), Y); \ - /* same for the second 
line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \ - /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \ - r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \ - g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \ - b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \ - y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y1_16 = _mm_srli_epi16(y1_16, 8); \ - cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y1_16)); \ - cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y1_16)); \ - r_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \ - g_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \ - b_16 = _mm_unpackhi_epi8(rgb7, _mm_setzero_si128()); \ - y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y2_16 = _mm_srli_epi16(y2_16, 8); \ - cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \ - cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \ - /* Rescale Y' to Y, pack it to 8bit values and save it */ \ - y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - Y = _mm_packus_epi16(y1_16, y2_16); \ - Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ - SAVE_SI128((__m128i*)(y_ptr2), Y); \ - /* Rescale Cb and Cr to their final range */ \ - cb1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb1_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \ - cr1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr1_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \ - \ - /* do the same again with next data */ \ - rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)), \ - rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)), \ - rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+96)), \ - rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr1+112)), \ - rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)), \ - rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)), \ - rgb7 = LOAD_SI128((const __m128i*)(rgb_ptr2+96)), \ - rgb8 = LOAD_SI128((const __m128i*)(rgb_ptr2+112)); \ - /* unpack rgb24 data to r, g and b data in separate channels*/ \ - /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \ - /* here, because averaging in horizontal direction is easier like this*/ \ - /* The last step is applied further on the Y channel only*/ \ - UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \ - UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \ - UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \ - UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \ - /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \ - /* Y is saved for each pixel, while only sums of 
(B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \ - r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \ - g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \ - b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \ - y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y1_16 = _mm_srli_epi16(y1_16, 8); \ - cb2_16 = _mm_sub_epi16(b_16, y1_16); \ - cr2_16 = _mm_sub_epi16(r_16, y1_16); \ - r_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \ - g_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \ - b_16 = _mm_unpacklo_epi8(rgb7, _mm_setzero_si128()); \ - y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y2_16 = _mm_srli_epi16(y2_16, 8); \ - cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \ - cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \ - /* Rescale Y' to Y, pack it to 8bit values and save it */ \ - y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - Y = _mm_packus_epi16(y1_16, y2_16); \ - Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ - SAVE_SI128((__m128i*)(y_ptr1+16), Y); \ - /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \ - /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \ - r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \ - g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \ - b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \ - y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y1_16 = _mm_srli_epi16(y1_16, 8); \ - cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y1_16)); \ - cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y1_16)); \ - r_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \ - g_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \ - b_16 = _mm_unpackhi_epi8(rgb7, _mm_setzero_si128()); \ - y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ - _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ - y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ - y2_16 = _mm_srli_epi16(y2_16, 8); \ - cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \ - cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \ - /* Rescale Y' to Y, pack it to 8bit values and save it */ \ - y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ - Y = _mm_packus_epi16(y1_16, y2_16); \ - Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ - SAVE_SI128((__m128i*)(y_ptr2+16), Y); \ - /* Rescale Cb and Cr to their final range */ \ - cb2_16 = 
_mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb2_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \ - cr2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr2_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \ - /* Pack and save Cb Cr */ \ - cb = _mm_packus_epi16(cb1_16, cb2_16); \ - cr = _mm_packus_epi16(cr1_16, cr2_16); \ - SAVE_SI128((__m128i*)(u_ptr), cb); \ - SAVE_SI128((__m128i*)(v_ptr), cr); - -void rgb32_yuv420_sse(uint32_t width, uint32_t height, - const uint8_t *RGBA, uint32_t RGBA_stride, - uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, - YCbCrType yuv_type) -{ - #define LOAD_SI128 _mm_load_si128 - #define SAVE_SI128 _mm_stream_si128 - const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); - - uint32_t x, y; - for(y=0; y<(height-1); y+=2) - { - const uint8_t *rgb_ptr1=RGBA+y*RGBA_stride, - *rgb_ptr2=RGBA+(y+1)*RGBA_stride; - - uint8_t *y_ptr1=Y+y*Y_stride, - *y_ptr2=Y+(y+1)*Y_stride, - *u_ptr=U+(y/2)*UV_stride, - *v_ptr=V+(y/2)*UV_stride; - - for(x=0; x<(width-31); x+=32) - { - RGBA2YUV_32 - - rgb_ptr1+=128; - rgb_ptr2+=128; - y_ptr1+=32; - y_ptr2+=32; - u_ptr+=16; - v_ptr+=16; - } - } - #undef LOAD_SI128 - #undef SAVE_SI128 -} - -void rgb32_yuv420_sseu(uint32_t width, uint32_t height, - const uint8_t *RGBA, uint32_t RGBA_stride, - uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, - YCbCrType yuv_type) -{ - #define LOAD_SI128 _mm_loadu_si128 - #define SAVE_SI128 _mm_storeu_si128 - const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); - - uint32_t x, y; - for(y=0; y<(height-1); y+=2) - { - const uint8_t *rgb_ptr1=RGBA+y*RGBA_stride, - *rgb_ptr2=RGBA+(y+1)*RGBA_stride; - - uint8_t *y_ptr1=Y+y*Y_stride, - *y_ptr2=Y+(y+1)*Y_stride, - *u_ptr=U+(y/2)*UV_stride, - *v_ptr=V+(y/2)*UV_stride; - - for(x=0; x<(width-31); x+=32) - { - RGBA2YUV_32 - - rgb_ptr1+=128; - rgb_ptr2+=128; - y_ptr1+=32; - y_ptr2+=32; - u_ptr+=16; - v_ptr+=16; - } - } - #undef LOAD_SI128 - #undef SAVE_SI128 -} - -#endif - #ifdef __SSE2__ #define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \ @@ -965,22 +484,6 @@ PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ __m128i u = LOAD_SI128((const __m128i*)(u_ptr)); \ __m128i v = LOAD_SI128((const __m128i*)(v_ptr)); \ -#define LOAD_UV_NV12 \ - __m128i uv1 = LOAD_SI128((const __m128i*)(uv_ptr)); \ - __m128i uv2 = LOAD_SI128((const __m128i*)(uv_ptr+16)); \ - __m128i u = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \ - uv1 = _mm_srli_epi16(uv1, 8); \ - uv2 = _mm_srli_epi16(uv2, 8); \ - __m128i v = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \ - -#define LOAD_UV_NV21 \ - __m128i uv1 = LOAD_SI128((const __m128i*)(uv_ptr)); \ - __m128i uv2 = LOAD_SI128((const __m128i*)(uv_ptr+16)); \ - __m128i v = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \ - uv1 = _mm_srli_epi16(uv1, 8); \ - uv2 = _mm_srli_epi16(uv2, 8); \ - __m128i u = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \ - #define YUV2RGB_32 \ __m128i r_tmp, g_tmp, b_tmp; \ __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \ @@ -1080,15 +583,6 @@ PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ LOAD_UV_PLANAR \ YUV2RGB_32 -#define YUV2RGB_32_NV12 \ - LOAD_UV_NV12 \ - YUV2RGB_32 - -#define YUV2RGB_32_NV21 \ - LOAD_UV_NV21 \ - 
YUV2RGB_32 - - void yuv420_rgb24_sse( uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, @@ -1163,146 +657,4 @@ void yuv420_rgb24_sseu( #undef SAVE_SI128 } -void nv12_rgb24_sse( - uint32_t width, uint32_t height, - const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride, - uint8_t *RGB, uint32_t RGB_stride, - YCbCrType yuv_type) -{ - #define LOAD_SI128 _mm_load_si128 - #define SAVE_SI128 _mm_stream_si128 - const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); - - uint32_t x, y; - for(y=0; y<(height-1); y+=2) - { - const uint8_t *y_ptr1=Y+y*Y_stride, - *y_ptr2=Y+(y+1)*Y_stride, - *uv_ptr=UV+(y/2)*UV_stride; - - uint8_t *rgb_ptr1=RGB+y*RGB_stride, - *rgb_ptr2=RGB+(y+1)*RGB_stride; - - for(x=0; x<(width-31); x+=32) - { - YUV2RGB_32_NV12 - - y_ptr1+=32; - y_ptr2+=32; - uv_ptr+=32; - rgb_ptr1+=96; - rgb_ptr2+=96; - } - } - #undef LOAD_SI128 - #undef SAVE_SI128 -} - -void nv12_rgb24_sseu( - uint32_t width, uint32_t height, - const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride, - uint8_t *RGB, uint32_t RGB_stride, - YCbCrType yuv_type) -{ - #define LOAD_SI128 _mm_loadu_si128 - #define SAVE_SI128 _mm_storeu_si128 - const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); - - uint32_t x, y; - for(y=0; y<(height-1); y+=2) - { - const uint8_t *y_ptr1=Y+y*Y_stride, - *y_ptr2=Y+(y+1)*Y_stride, - *uv_ptr=UV+(y/2)*UV_stride; - - uint8_t *rgb_ptr1=RGB+y*RGB_stride, - *rgb_ptr2=RGB+(y+1)*RGB_stride; - - for(x=0; x<(width-31); x+=32) - { - YUV2RGB_32_NV12 - - y_ptr1+=32; - y_ptr2+=32; - uv_ptr+=32; - rgb_ptr1+=96; - rgb_ptr2+=96; - } - } - #undef LOAD_SI128 - #undef SAVE_SI128 -} - -void nv21_rgb24_sse( - uint32_t width, uint32_t height, - const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride, - uint8_t *RGB, uint32_t RGB_stride, - YCbCrType yuv_type) -{ - #define LOAD_SI128 _mm_load_si128 - #define SAVE_SI128 _mm_stream_si128 - const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); - - uint32_t x, y; - for(y=0; y<(height-1); y+=2) - { - const uint8_t *y_ptr1=Y+y*Y_stride, - *y_ptr2=Y+(y+1)*Y_stride, - *uv_ptr=UV+(y/2)*UV_stride; - - uint8_t *rgb_ptr1=RGB+y*RGB_stride, - *rgb_ptr2=RGB+(y+1)*RGB_stride; - - for(x=0; x<(width-31); x+=32) - { - YUV2RGB_32_NV21 - - y_ptr1+=32; - y_ptr2+=32; - uv_ptr+=32; - rgb_ptr1+=96; - rgb_ptr2+=96; - } - } - #undef LOAD_SI128 - #undef SAVE_SI128 -} - -void nv21_rgb24_sseu( - uint32_t width, uint32_t height, - const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride, - uint8_t *RGB, uint32_t RGB_stride, - YCbCrType yuv_type) -{ - #define LOAD_SI128 _mm_loadu_si128 - #define SAVE_SI128 _mm_storeu_si128 - const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); - - uint32_t x, y; - for(y=0; y<(height-1); y+=2) - { - const uint8_t *y_ptr1=Y+y*Y_stride, - *y_ptr2=Y+(y+1)*Y_stride, - *uv_ptr=UV+(y/2)*UV_stride; - - uint8_t *rgb_ptr1=RGB+y*RGB_stride, - *rgb_ptr2=RGB+(y+1)*RGB_stride; - - for(x=0; x<(width-31); x+=32) - { - YUV2RGB_32_NV21 - - y_ptr1+=32; - y_ptr2+=32; - uv_ptr+=32; - rgb_ptr1+=96; - rgb_ptr2+=96; - } - } - #undef LOAD_SI128 - #undef SAVE_SI128 -} - - - #endif //__SSE2__
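Note on the yuv_rgb.c hunks above: the added #define lines alias a handful of SSE2 intrinsic names to their WASM SIMD counterparts from wasm_simd128.h, so the remaining SSE-style YUV/RGB conversion code can be compiled for the browser (e.g. with Emscripten's -msimd128). Below is a minimal sketch of that shim pattern; it is not part of this patch, and the function name add_bias_u8 and the specific intrinsics chosen are illustrative assumptions only.

// Sketch (not from this patch): SSE2 intrinsic names mapped onto WASM SIMD,
// in the spirit of the shims added to yuv_rgb.c. Build with clang/Emscripten
// and -msimd128.
#include <wasm_simd128.h>
#include <stdint.h>

typedef v128_t __m128i;
#define _mm_loadu_si128(p)     wasm_v128_load(p)
#define _mm_storeu_si128(p, a) wasm_v128_store((p), (a))
#define _mm_set1_epi8(x)       wasm_i8x16_splat(x)
#define _mm_add_epi8(a, b)     wasm_i8x16_add((a), (b))

// Adds a constant bias to 16 bytes per iteration. The body is written against
// the SSE2 names; the defines above make it compile to WASM SIMD instead.
static void add_bias_u8(uint8_t *dst, const uint8_t *src, int8_t bias, int n)
{
    int i;
    for (i = 0; i + 16 <= n; i += 16) {
        __m128i v = _mm_loadu_si128((const __m128i *)(src + i));
        v = _mm_add_epi8(v, _mm_set1_epi8(bias));
        _mm_storeu_si128((__m128i *)(dst + i), v);
    }
    for (; i < n; ++i)  /* scalar tail for leftover bytes */
        dst[i] = (uint8_t)(src[i] + bias);
}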