// Copyright 2016 Adrien Descamps // Distributed under BSD 3-Clause License #include "yuv_rgb.h" //#include // #include typedef __i64x2 __m128i; #define _mm_load_si128 wasm_v128_load #define _mm_add_epi8 wasm_i8x16_add #define _mm_set1_epi8 wasm_i8x16_splat #define _mm_srai_epi16 wasm_i16x8_shr #define _mm_mullo_epi16 wasm_i16x8_mul #define _mm_sub_epi8 wasm_i8x16_sub #define _mm_setzero_si128 wasm_i64x2_const #include uint8_t clamp(int16_t value) { return value<0 ? 0 : (value>255 ? 255 : value); } // Definitions // // E'R, E'G, E'B, E'Y, E'Cb and E'Cr refer to the analog signals // E'R, E'G, E'B and E'Y range is [0:1], while E'Cb and E'Cr range is [-0.5:0.5] // R, G, B, Y, Cb and Cr refer to the digitalized values // The digitalized values can use their full range ([0:255] for 8bit values), // or a subrange (typically [16:235] for Y and [16:240] for CbCr). // We assume here that RGB range is always [0:255], since it is the case for // most digitalized images. // For 8bit values : // * Y = round((YMax-YMin)*E'Y + YMin) // * Cb = round((CbRange)*E'Cb + 128) // * Cr = round((CrRange)*E'Cr + 128) // Where *Min and *Max are the range of each channel // // In the analog domain , the RGB to YCbCr transformation is defined as: // * E'Y = Rf*E'R + Gf*E'G + Bf*E'B // Where Rf, Gf and Bf are constants defined in each standard, with // Rf + Gf + Bf = 1 (necessary to ensure that E'Y range is [0:1]) // * E'Cb = (E'B - E'Y) / CbNorm // * E'Cr = (E'R - E'Y) / CrNorm // Where CbNorm and CrNorm are constants, dependent of Rf, Gf, Bf, computed // to normalize to a [-0.5:0.5] range : CbNorm=2*(1-Bf) and CrNorm=2*(1-Rf) // // Algorithms // // Most operations will be made in a fixed point format for speed, using // N bits of precision. In next section the [x] convention is used for // a fixed point rounded value, that is (int being the c type conversion) // * [x] = int(x*(2^N)+0.5) // N can be different for each factor, we simply use the highest value // that will not overflow in 16 bits intermediate variables. //. // For RGB to YCbCr conversion, we start by generating a pseudo Y value // (noted Y') in fixed point format, using the full range for now. // * Y' = ([Rf]*R + [Gf]*G + [Bf]*B)>>N // We can then compute Cb and Cr by // * Cb = ((B - Y')*[CbRange/(255*CbNorm)])>>N + 128 // * Cr = ((R - Y')*[CrRange/(255*CrNorm)])>>N + 128 // And finally, we normalize Y to its digital range // * Y = (Y'*[(YMax-YMin)/255])>>N + YMin // // For YCbCr to RGB conversion, we first compute the full range Y' value : // * Y' = ((Y-YMin)*[255/(YMax-YMin)])>>N // We can then compute B and R values by : // * B = ((Cb-128)*[(255*CbNorm)/CbRange])>>N + Y' // * R = ((Cr-128)*[(255*CrNorm)/CrRange])>>N + Y' // And finally, for G we know that: // * G = (Y' - (Rf*R + Bf*B)) / Gf // From above: // * G = (Y' - Rf * ((Cr-128)*(255*CrNorm)/CrRange + Y') - Bf * ((Cb-128)*(255*CbNorm)/CbRange + Y')) / Gf // Since 1-Rf-Bf=Gf, we can take Y' out of the division by Gf, and we get: // * G = Y' - (Cr-128)*Rf/Gf*(255*CrNorm)/CrRange - (Cb-128)*Bf/Gf*(255*CbNorm)/CbRange // That we can compute, with fixed point arithmetic, by // * G = Y' - ((Cr-128)*[Rf/Gf*(255*CrNorm)/CrRange] + (Cb-128)*[Bf/Gf*(255*CbNorm)/CbRange])>>N // // Note : in ITU-T T.871(JPEG), Y=Y', so that part could be optimized out #define FIXED_POINT_VALUE(value, precision) ((int)(((value)*(1<r_factor*rgb_ptr1[0] + param->g_factor*rgb_ptr1[1] + param->b_factor*rgb_ptr1[2])>>8; u_tmp = rgb_ptr1[2]-y_tmp; v_tmp = rgb_ptr1[0]-y_tmp; y_ptr1[0]=((y_tmp*param->y_factor)>>7) + param->y_offset; y_tmp = (param->r_factor*rgb_ptr1[3] + param->g_factor*rgb_ptr1[4] + param->b_factor*rgb_ptr1[5])>>8; u_tmp += rgb_ptr1[5]-y_tmp; v_tmp += rgb_ptr1[3]-y_tmp; y_ptr1[1]=((y_tmp*param->y_factor)>>7) + param->y_offset; y_tmp = (param->r_factor*rgb_ptr2[0] + param->g_factor*rgb_ptr2[1] + param->b_factor*rgb_ptr2[2])>>8; u_tmp += rgb_ptr2[2]-y_tmp; v_tmp += rgb_ptr2[0]-y_tmp; y_ptr2[0]=((y_tmp*param->y_factor)>>7) + param->y_offset; y_tmp = (param->r_factor*rgb_ptr2[3] + param->g_factor*rgb_ptr2[4] + param->b_factor*rgb_ptr2[5])>>8; u_tmp += rgb_ptr2[5]-y_tmp; v_tmp += rgb_ptr2[3]-y_tmp; y_ptr2[1]=((y_tmp*param->y_factor)>>7) + param->y_offset; u_ptr[0] = (((u_tmp>>2)*param->cb_factor)>>8) + 128; v_ptr[0] = (((v_tmp>>2)*param->cb_factor)>>8) + 128; rgb_ptr1 += 6; rgb_ptr2 += 6; y_ptr1 += 2; y_ptr2 += 2; u_ptr += 1; v_ptr += 1; } } } void rgb32_yuv420_std( uint32_t width, uint32_t height, const uint8_t *RGBA, uint32_t RGBA_stride, uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, YCbCrType yuv_type) { const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); uint32_t x, y; for(y=0; y<(height-1); y+=2) { const uint8_t *rgb_ptr1=RGBA+y*RGBA_stride, *rgb_ptr2=RGBA+(y+1)*RGBA_stride; uint8_t *y_ptr1=Y+y*Y_stride, *y_ptr2=Y+(y+1)*Y_stride, *u_ptr=U+(y/2)*UV_stride, *v_ptr=V+(y/2)*UV_stride; for(x=0; x<(width-1); x+=2) { // compute yuv for the four pixels, u and v values are summed uint8_t y_tmp; int16_t u_tmp, v_tmp; y_tmp = (param->r_factor*rgb_ptr1[0] + param->g_factor*rgb_ptr1[1] + param->b_factor*rgb_ptr1[2])>>8; u_tmp = rgb_ptr1[2]-y_tmp; v_tmp = rgb_ptr1[0]-y_tmp; y_ptr1[0]=((y_tmp*param->y_factor)>>7) + param->y_offset; y_tmp = (param->r_factor*rgb_ptr1[4] + param->g_factor*rgb_ptr1[5] + param->b_factor*rgb_ptr1[6])>>8; u_tmp += rgb_ptr1[6]-y_tmp; v_tmp += rgb_ptr1[4]-y_tmp; y_ptr1[1]=((y_tmp*param->y_factor)>>7) + param->y_offset; y_tmp = (param->r_factor*rgb_ptr2[0] + param->g_factor*rgb_ptr2[1] + param->b_factor*rgb_ptr2[2])>>8; u_tmp += rgb_ptr2[2]-y_tmp; v_tmp += rgb_ptr2[0]-y_tmp; y_ptr2[0]=((y_tmp*param->y_factor)>>7) + param->y_offset; y_tmp = (param->r_factor*rgb_ptr2[4] + param->g_factor*rgb_ptr2[5] + param->b_factor*rgb_ptr2[6])>>8; u_tmp += rgb_ptr2[6]-y_tmp; v_tmp += rgb_ptr2[4]-y_tmp; y_ptr2[1]=((y_tmp*param->y_factor)>>7) + param->y_offset; u_ptr[0] = (((u_tmp>>2)*param->cb_factor)>>8) + 128; v_ptr[0] = (((v_tmp>>2)*param->cb_factor)>>8) + 128; rgb_ptr1 += 8; rgb_ptr2 += 8; y_ptr1 += 2; y_ptr2 += 2; u_ptr += 1; v_ptr += 1; } } } void yuv420_rgb24_std( uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type) { const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); uint32_t x, y; for(y=0; y<(height-1); y+=2) { const uint8_t *y_ptr1=Y+y*Y_stride, *y_ptr2=Y+(y+1)*Y_stride, *u_ptr=U+(y/2)*UV_stride, *v_ptr=V+(y/2)*UV_stride; uint8_t *rgb_ptr1=RGB+y*RGB_stride, *rgb_ptr2=RGB+(y+1)*RGB_stride; for(x=0; x<(width-1); x+=2) { int8_t u_tmp, v_tmp; u_tmp = u_ptr[0]-128; v_tmp = v_ptr[0]-128; //compute Cb Cr color offsets, common to four pixels int16_t b_cb_offset, r_cr_offset, g_cbcr_offset; b_cb_offset = (param->cb_factor*u_tmp)>>6; r_cr_offset = (param->cr_factor*v_tmp)>>6; g_cbcr_offset = (param->g_cb_factor*u_tmp + param->g_cr_factor*v_tmp)>>7; int16_t y_tmp; y_tmp = (param->y_factor*(y_ptr1[0]-param->y_offset))>>7; rgb_ptr1[2] = clamp(y_tmp + r_cr_offset); rgb_ptr1[1] = clamp(y_tmp - g_cbcr_offset); rgb_ptr1[0] = clamp(y_tmp + b_cb_offset); y_tmp = (param->y_factor*(y_ptr1[1]-param->y_offset))>>7; rgb_ptr1[6] = clamp(y_tmp + r_cr_offset); rgb_ptr1[5] = clamp(y_tmp - g_cbcr_offset); rgb_ptr1[4] = clamp(y_tmp + b_cb_offset); y_tmp = (param->y_factor*(y_ptr2[0]-param->y_offset))>>7; rgb_ptr2[2] = clamp(y_tmp + r_cr_offset); rgb_ptr2[1] = clamp(y_tmp - g_cbcr_offset); rgb_ptr2[0] = clamp(y_tmp + b_cb_offset); y_tmp = (param->y_factor*(y_ptr2[1]-param->y_offset))>>7; rgb_ptr2[6] = clamp(y_tmp + r_cr_offset); rgb_ptr2[5] = clamp(y_tmp - g_cbcr_offset); rgb_ptr2[4] = clamp(y_tmp + b_cb_offset); rgb_ptr1 += 8; rgb_ptr2 += 8; y_ptr1 += 2; y_ptr2 += 2; u_ptr += 1; v_ptr += 1; } } } void nv12_rgb24_std( uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type) { const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); uint32_t x, y; for(y=0; y<(height-1); y+=2) { const uint8_t *y_ptr1=Y+y*Y_stride, *y_ptr2=Y+(y+1)*Y_stride, *uv_ptr=UV+(y/2)*UV_stride; uint8_t *rgb_ptr1=RGB+y*RGB_stride, *rgb_ptr2=RGB+(y+1)*RGB_stride; for(x=0; x<(width-1); x+=2) { int8_t u_tmp, v_tmp; u_tmp = uv_ptr[0]-128; v_tmp = uv_ptr[1]-128; //compute Cb Cr color offsets, common to four pixels int16_t b_cb_offset, r_cr_offset, g_cbcr_offset; b_cb_offset = (param->cb_factor*u_tmp)>>6; r_cr_offset = (param->cr_factor*v_tmp)>>6; g_cbcr_offset = (param->g_cb_factor*u_tmp + param->g_cr_factor*v_tmp)>>7; int16_t y_tmp; y_tmp = (param->y_factor*(y_ptr1[0]-param->y_offset))>>7; rgb_ptr1[0] = clamp(y_tmp + r_cr_offset); rgb_ptr1[1] = clamp(y_tmp - g_cbcr_offset); rgb_ptr1[2] = clamp(y_tmp + b_cb_offset); y_tmp = (param->y_factor*(y_ptr1[1]-param->y_offset))>>7; rgb_ptr1[3] = clamp(y_tmp + r_cr_offset); rgb_ptr1[4] = clamp(y_tmp - g_cbcr_offset); rgb_ptr1[5] = clamp(y_tmp + b_cb_offset); y_tmp = (param->y_factor*(y_ptr2[0]-param->y_offset))>>7; rgb_ptr2[0] = clamp(y_tmp + r_cr_offset); rgb_ptr2[1] = clamp(y_tmp - g_cbcr_offset); rgb_ptr2[2] = clamp(y_tmp + b_cb_offset); y_tmp = (param->y_factor*(y_ptr2[1]-param->y_offset))>>7; rgb_ptr2[3] = clamp(y_tmp + r_cr_offset); rgb_ptr2[4] = clamp(y_tmp - g_cbcr_offset); rgb_ptr2[5] = clamp(y_tmp + b_cb_offset); rgb_ptr1 += 6; rgb_ptr2 += 6; y_ptr1 += 2; y_ptr2 += 2; uv_ptr += 2; } } } void nv21_rgb24_std( uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type) { const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); uint32_t x, y; for(y=0; y<(height-1); y+=2) { const uint8_t *y_ptr1=Y+y*Y_stride, *y_ptr2=Y+(y+1)*Y_stride, *uv_ptr=UV+(y/2)*UV_stride; uint8_t *rgb_ptr1=RGB+y*RGB_stride, *rgb_ptr2=RGB+(y+1)*RGB_stride; for(x=0; x<(width-1); x+=2) { int8_t u_tmp, v_tmp; u_tmp = uv_ptr[1]-128; v_tmp = uv_ptr[0]-128; //compute Cb Cr color offsets, common to four pixels int16_t b_cb_offset, r_cr_offset, g_cbcr_offset; b_cb_offset = (param->cb_factor*u_tmp)>>6; r_cr_offset = (param->cr_factor*v_tmp)>>6; g_cbcr_offset = (param->g_cb_factor*u_tmp + param->g_cr_factor*v_tmp)>>7; int16_t y_tmp; y_tmp = (param->y_factor*(y_ptr1[0]-param->y_offset))>>7; rgb_ptr1[0] = clamp(y_tmp + r_cr_offset); rgb_ptr1[1] = clamp(y_tmp - g_cbcr_offset); rgb_ptr1[2] = clamp(y_tmp + b_cb_offset); y_tmp = (param->y_factor*(y_ptr1[1]-param->y_offset))>>7; rgb_ptr1[3] = clamp(y_tmp + r_cr_offset); rgb_ptr1[4] = clamp(y_tmp - g_cbcr_offset); rgb_ptr1[5] = clamp(y_tmp + b_cb_offset); y_tmp = (param->y_factor*(y_ptr2[0]-param->y_offset))>>7; rgb_ptr2[0] = clamp(y_tmp + r_cr_offset); rgb_ptr2[1] = clamp(y_tmp - g_cbcr_offset); rgb_ptr2[2] = clamp(y_tmp + b_cb_offset); y_tmp = (param->y_factor*(y_ptr2[1]-param->y_offset))>>7; rgb_ptr2[3] = clamp(y_tmp + r_cr_offset); rgb_ptr2[4] = clamp(y_tmp - g_cbcr_offset); rgb_ptr2[5] = clamp(y_tmp + b_cb_offset); rgb_ptr1 += 6; rgb_ptr2 += 6; y_ptr1 += 2; y_ptr2 += 2; uv_ptr += 2; } } } #ifdef __SSE2__ #define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \ r_tmp = _mm_srai_epi16(_mm_mullo_epi16(V, _mm_set1_epi16(param->cr_factor)), 6); \ g_tmp = _mm_srai_epi16(_mm_add_epi16( \ _mm_mullo_epi16(U, _mm_set1_epi16(param->g_cb_factor)), \ _mm_mullo_epi16(V, _mm_set1_epi16(param->g_cr_factor))), 7); \ b_tmp = _mm_srai_epi16(_mm_mullo_epi16(U, _mm_set1_epi16(param->cb_factor)), 6); \ R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \ G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \ B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \ R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \ G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \ B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \ #define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \ Y1 = _mm_srai_epi16(_mm_mullo_epi16(Y1, _mm_set1_epi16(param->y_factor)), 7); \ Y2 = _mm_srai_epi16(_mm_mullo_epi16(Y2, _mm_set1_epi16(param->y_factor)), 7); \ \ R1 = _mm_add_epi16(Y1, R1); \ G1 = _mm_sub_epi16(Y1, G1); \ B1 = _mm_add_epi16(Y1, B1); \ R2 = _mm_add_epi16(Y2, R2); \ G2 = _mm_sub_epi16(Y2, G2); \ B2 = _mm_add_epi16(Y2, B2); \ #define PACK_RGB24_32_STEP(RS1, RS2, RS3, RS4, RS5, RS6, RD1, RD2, RD3, RD4, RD5, RD6) \ RD1 = _mm_packus_epi16(_mm_and_si128(RS1,_mm_set1_epi16(0xFF)), _mm_and_si128(RS2,_mm_set1_epi16(0xFF))); \ RD2 = _mm_packus_epi16(_mm_and_si128(RS3,_mm_set1_epi16(0xFF)), _mm_and_si128(RS4,_mm_set1_epi16(0xFF))); \ RD3 = _mm_packus_epi16(_mm_and_si128(RS5,_mm_set1_epi16(0xFF)), _mm_and_si128(RS6,_mm_set1_epi16(0xFF))); \ RD4 = _mm_packus_epi16(_mm_srli_epi16(RS1,8), _mm_srli_epi16(RS2,8)); \ RD5 = _mm_packus_epi16(_mm_srli_epi16(RS3,8), _mm_srli_epi16(RS4,8)); \ RD6 = _mm_packus_epi16(_mm_srli_epi16(RS5,8), _mm_srli_epi16(RS6,8)); \ #define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ PACK_RGB24_32_STEP(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ PACK_RGB24_32_STEP(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ #define LOAD_UV_PLANAR \ __m128i u = LOAD_SI128((const __m128i*)(u_ptr)); \ __m128i v = LOAD_SI128((const __m128i*)(v_ptr)); \ #define YUV2RGB_32 \ __m128i r_tmp, g_tmp, b_tmp; \ __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \ __m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \ __m128i y_16_1, y_16_2; \ \ u = _mm_add_epi8(u, _mm_set1_epi8(-128)); \ v = _mm_add_epi8(v, _mm_set1_epi8(-128)); \ \ /* process first 16 pixels of first line */\ __m128i u_16 = _mm_srai_epi16(_mm_unpacklo_epi8(u, u), 8); \ __m128i v_16 = _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8); \ \ UV2RGB_16(u_16, v_16, r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2) \ r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ \ __m128i y = LOAD_SI128((const __m128i*)(y_ptr1)); \ y = _mm_sub_epi8(y, _mm_set1_epi8(param->y_offset)); \ y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ \ ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ \ __m128i r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \ __m128i g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \ __m128i b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \ \ /* process first 16 pixels of second line */\ r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ \ y = LOAD_SI128((const __m128i*)(y_ptr2)); \ y = _mm_sub_epi8(y, _mm_set1_epi8(param->y_offset)); \ y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ \ ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ \ __m128i r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \ __m128i g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \ __m128i b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \ \ /* process last 16 pixels of first line */\ u_16 = _mm_srai_epi16(_mm_unpackhi_epi8(u, u), 8); \ v_16 = _mm_srai_epi16(_mm_unpackhi_epi8(v, v), 8); \ \ UV2RGB_16(u_16, v_16, r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2) \ r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ \ y = LOAD_SI128((const __m128i*)(y_ptr1+16)); \ y = _mm_sub_epi8(y, _mm_set1_epi8(param->y_offset)); \ y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ \ ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ \ __m128i r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \ __m128i g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \ __m128i b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \ \ /* process last 16 pixels of second line */\ r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ \ y = LOAD_SI128((const __m128i*)(y_ptr2+16)); \ y = _mm_sub_epi8(y, _mm_set1_epi8(param->y_offset)); \ y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ \ ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ \ __m128i r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \ __m128i g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \ __m128i b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \ \ __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \ \ PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \ SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \ SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \ SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \ SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \ SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \ SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \ \ PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \ SAVE_SI128((__m128i*)(rgb_ptr2), rgb_1); \ SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_2); \ SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_3); \ SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_4); \ SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_5); \ SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_6); \ #define YUV2RGB_32_PLANAR \ LOAD_UV_PLANAR \ YUV2RGB_32 void yuv420_rgb24_sse( uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type) { #define LOAD_SI128 _mm_load_si128 #define SAVE_SI128 _mm_stream_si128 const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); uint32_t x, y; for(y=0; y<(height-1); y+=2) { const uint8_t *y_ptr1=Y+y*Y_stride, *y_ptr2=Y+(y+1)*Y_stride, *u_ptr=U+(y/2)*UV_stride, *v_ptr=V+(y/2)*UV_stride; uint8_t *rgb_ptr1=RGB+y*RGB_stride, *rgb_ptr2=RGB+(y+1)*RGB_stride; for(x=0; x<(width-31); x+=32) { YUV2RGB_32_PLANAR y_ptr1+=32; y_ptr2+=32; u_ptr+=16; v_ptr+=16; rgb_ptr1+=96; rgb_ptr2+=96; } } #undef LOAD_SI128 #undef SAVE_SI128 } void yuv420_rgb24_sseu( uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type) { #define LOAD_SI128 _mm_loadu_si128 #define SAVE_SI128 _mm_storeu_si128 const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); uint32_t x, y; for(y=0; y<(height-1); y+=2) { const uint8_t *y_ptr1=Y+y*Y_stride, *y_ptr2=Y+(y+1)*Y_stride, *u_ptr=U+(y/2)*UV_stride, *v_ptr=V+(y/2)*UV_stride; uint8_t *rgb_ptr1=RGB+y*RGB_stride, *rgb_ptr2=RGB+(y+1)*RGB_stride; for(x=0; x<(width-31); x+=32) { YUV2RGB_32_PLANAR y_ptr1+=32; y_ptr2+=32; u_ptr+=16; v_ptr+=16; rgb_ptr1+=96; rgb_ptr2+=96; } } #undef LOAD_SI128 #undef SAVE_SI128 } #endif //__SSE2__