YUV2RGB源码详解(参考Opencv4.1)
目錄
引言
知識直通車:
YUV2RGB原語
YUV2RGB NEON加速
引言
opencv4.x版本開始對YUV2RGB做了neon加速,這篇文章對轉換源碼進行了詳細分析,想要了解實現細節的同學可以做個了解,也比較簡單。
?
知識直通車:
對YUV結構不了解的看這篇:https://blog.csdn.net/xjhhjx/article/details/80291465
對YUV2RGB不了解的看這篇:https://blog.csdn.net/xiaoyafang123/article/details/82153279
?
YUV2RGB原語
/*********************************************************************** 入?yún)?#xff1a;unsigned char* dst_data:目標(biāo)圖像指針 size_t dst_step:目標(biāo)圖像每行間隔數(shù)據(jù)的大小=通道數(shù)x寬度int dst_width:目標(biāo)圖像寬度int dst_height:目標(biāo)圖像高度size_t src_step:源圖像每行間隔數(shù)據(jù)的大小=通道數(shù)x寬度 const unsigned char* y1:源圖像y數(shù)據(jù)指針const unsigned char* uv:源圖像uv數(shù)據(jù)指針 *************************************************************************/ inline void cvtYUV420sp2RGB(unsigned char* dst_data, size_t dst_step, int dst_width, int dst_height, size_t src_step, const unsigned char* y1, const unsigned char* uv) {for (int j = 0; j < dst_height; j += 2, y1 += (src_step << 1), uv += src_step) {unsigned char* row1 = dst_data + dst_step * j; //目標(biāo)圖像當(dāng)前第一行數(shù)據(jù)指針unsigned char* row2 = dst_data + dst_step * (j + 1); //目標(biāo)圖像當(dāng)前第二行數(shù)據(jù)指針const unsigned char* y2 = y1 + src_step; //源圖像當(dāng)前第二行數(shù)據(jù)指針int i = 0;//每次求得目標(biāo)圖像的4個像素值(兩行,每行兩個,每個像素儲存rgb三個值,row+6)for (; i < dst_width; i += 2, row1 += 6, row2 += 6){//uIdx決定uv的存儲順序,按照YUV格式?jīng)Q定,NV12為UV的存儲順序,NV21為VU的存儲順序unsigned char u = uv[i + 0 + uIdx]; unsigned char v = uv[i + 1 - uIdx];unsigned char vy01 = y1[i];unsigned char vy11 = y1[i + 1];unsigned char vy02 = y2[i];unsigned char vy12 = y2[i + 1];//uv+y轉(zhuǎn)換rgb主函數(shù)cvtYuv42xxp2RGB8<bIdx, dcn, true>(u, v, vy01, vy11, vy02, vy12, row1, row2);}} }上面為YUV2RGB?的主函數(shù),思路很簡單啊:
上下行分別2個y共用一個UV,那么計算的時候直接通過原圖像的第一行y1及第二行y2的指針再加上uv,
即可求得目標圖像的4個像素的rgb值,分別對應代碼的row1及row2(rgb值按通道排列即hwc格式因此for循環每次+6,6=2x3)詳細的解釋可參考注釋。
template<int bIdx, int dcn, bool is420> static inline void cvtYuv42xxp2RGB8(const unsigned char u, const unsigned char v,const unsigned char vy01, const unsigned char vy11, const unsigned char vy02, const unsigned char vy12,unsigned char* row1, unsigned char* row2) {int ruv, guv, buv;//計算rgb中與uv相關(guān)的分量ruv、guv、buvuvToRGBuv(u, v, ruv, guv, buv);unsigned char r00, g00, b00, a00;unsigned char r01, g01, b01, a01;//結(jié)合y以及uv相關(guān)的分量ruv、guv、buv計算最終的rgb分量的值yRGBuvToRGBA(vy01, ruv, guv, buv, r00, g00, b00, a00);yRGBuvToRGBA(vy11, ruv, guv, buv, r01, g01, b01, a01);//bIdx為0則為bgr格式,bIdx為2則為rgb格式row1[2 - bIdx] = r00;row1[1] = g00;row1[bIdx] = b00;if (dcn == 4)row1[3] = a00;//如果轉(zhuǎn)換為rgba格式,a通道賦值為0xffrow1[dcn + 2 - bIdx] = r01;row1[dcn + 1] = g01;row1[dcn + 0 + bIdx] = b01;if (dcn == 4)row1[7] = a01;//如果源圖片為420采樣模式,代表4個y共用uv,因此需要計算第二行的像素值,//若為422或者444采樣格式則不需要計算,具體可以參考上面給的直通車鏈接if (is420){unsigned char r10, g10, b10, a10;unsigned char r11, g11, b11, a11;yRGBuvToRGBA(vy02, ruv, guv, buv, r10, g10, b10, a10);yRGBuvToRGBA(vy12, ruv, guv, buv, r11, g11, b11, a11);row2[2 - bIdx] = r10;row2[1] = g10;row2[bIdx] = b10;if (dcn == 4)row2[3] = a10;row2[dcn + 2 - bIdx] = r11;row2[dcn + 1] = g11;row2[dcn + 0 + bIdx] = b11;if (dcn == 4)row2[7] = a11;} }本段代碼實現(xiàn)了uv+y轉(zhuǎn)rgb的功能,相關(guān)注釋已經(jīng)很清楚了,內(nèi)部主要包含的兩個函數(shù):uvToRGBuv、yRGBuvToRGBA。uvToRGBuv的功能主要是將uv值轉(zhuǎn)換為最終rgb公式中與uv相關(guān)的分量;yRGBuvToRGBA的功能是將uvToRGBuv求得的ruv/guv/buv分量結(jié)合y得到最終的rgb分量的值。下面分別介紹這兩個函數(shù):
//R = 1.164(Y - 16) + 1.596(V - 128) //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) //B = 1.164(Y - 16) + 2.018(U - 128)//定點處理,將各個系數(shù)乘以2^20,加1<<19四舍五入 //R = (1220542(Y - 16) + 1673527(V - 128) + (1 << 19)) >> 20 //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20 //B = (1220542(Y - 16) + 2116026(U - 128) + (1 << 19)) >> 20static inline void uvToRGBuv(const unsigned char u, const unsigned char v, int& ruv, int& guv, int& buv) {int uu, vv;uu = int(u) - 128;vv = int(v) - 128;//const int ITUR_BT_601_CY = 1220542;//const int ITUR_BT_601_CUB = 2116026;//const int ITUR_BT_601_CUG = -409993;//const int ITUR_BT_601_CVG = -852492;//const int ITUR_BT_601_CVR = 1673527;//const int ITUR_BT_601_SHIFT = 20;//計算rgb中uv分量ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * vv;guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * vv + ITUR_BT_601_CUG * uu;buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; } static inline void yRGBuvToRGBA(const unsigned char vy, const int ruv, const int guv, const int buv,unsigned char& r, unsigned char& g, unsigned char& b, unsigned char& a) {int yy = int(vy);//y-16之后要做飽和處理int y = maxValue(0, yy - 16) * ITUR_BT_601_CY; r = saturate_cast<unsigned char>((y + ruv) >> ITUR_BT_601_SHIFT);//除以2^20,還原g = saturate_cast<unsigned char>((y + guv) >> ITUR_BT_601_SHIFT);b = saturate_cast<unsigned char>((y + buv) >> ITUR_BT_601_SHIFT);a = (unsigned char)(0xff); }上面兩段代碼意思很簡單了,就是利用y+uv根據(jù)轉(zhuǎn)換矩陣計算rgb分量。
需要注意兩點:1、對浮點運算做了定點,乘以2^20轉換為int,最后將結果再除以2^20
              2、y-16之后要做飽和處理,不然最后轉換出來的圖像灰度小的地方就是亮點
?
YUV2RGB NEON加速
uint8x16_t a = vdupq_n_u8((unsigned char)(0xFF)); for (; i <= dst_width - (u8_nlanes << 1); i += (u8_nlanes << 1), row1 += (u8_nlanes*dcn << 1), row2 += (u8_nlanes*dcn << 1)) {uint8x16_t u, v;v_load_deinterleave(uv + i, u, v);//分別加載16個u及16個v,uv分別放于兩個neon寄存器if (uIdx) swap(u, v);//參考原語邏輯uint8x16_t vy[4];v_load_deinterleave(y1 + i, vy[0], vy[1]);//分別加載16個u及16個v,uv分別放于兩個neon寄存器v_load_deinterleave(y2 + i, vy[2], vy[3]);int32x4_t ruv[4], guv[4], buv[4]; uvToRGBuv(u, v, ruv, guv, buv); //每對uv計算得到一組ruv、guv、buv;16對uv產(chǎn)生16組數(shù)據(jù)uint8x16_t r[4], g[4], b[4];for (int k = 0; k < 4; k++){//同樣利用ruv、guv、buv,計算最終的rgb值yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);}if (bIdx){for (int k = 0; k < 4; k++)swap(r[k], b[k]);}// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]uint8x16_t r0_0, r0_1, r1_0, r1_1;v_zip(r[0], r[1], r0_0, r0_1);v_zip(r[2], r[3], r1_0, r1_1);uint8x16_t g0_0, g0_1, g1_0, g1_1;v_zip(g[0], g[1], g0_0, g0_1);v_zip(g[2], g[3], g1_0, g1_1);uint8x16_t b0_0, b0_1, b1_0, b1_1;v_zip(b[0], b[1], b0_0, b0_1);v_zip(b[2], b[3], b1_0, b1_1);if (dcn == 4){v_store_interleave(row1 + 0 * u8_nlanes, b0_0, g0_0, r0_0, a);v_store_interleave(row1 + 4 * u8_nlanes, b0_1, g0_1, r0_1, a);v_store_interleave(row2 + 0 * u8_nlanes, b1_0, g1_0, r1_0, a);v_store_interleave(row2 + 4 * u8_nlanes, b1_1, g1_1, r1_1, a);}else //dcn == 3{v_store_interleave(row1 + 0 * u8_nlanes, b0_0, g0_0, r0_0);v_store_interleave(row1 + 3 * u8_nlanes, b0_1, g0_1, r0_1);v_store_interleave(row2 + 0 * u8_nlanes, b1_0, g1_0, r1_0);v_store_interleave(row2 + 3 * u8_nlanes, b1_1, g1_1, r1_1);} }neon加速主要是利用單指令執(zhí)行可并行執(zhí)行的部分,在YUV2RGB轉(zhuǎn)換中,像素與像素之間的計算都是無關(guān)的,因此多個像素的計算完全可以并行執(zhí)行,主要考慮最大化的利用neon寄存器即可。
從上面代碼可以看出,每次計算目標圖像的64個像素,分為兩行,每行32個像素。由于4個y共用uv,因此每次計算需要16組uv值。
理解了原語的代碼邏輯再理解neon加速的版本就很容易了,邏輯都是一樣的。這里主要說明一下neon指令涉及到的計算,不理解的地方上面的代碼也有相應的注釋,應該很容易理解。
static inline void uvToRGBuv(const uint8x16_t& u, const uint8x16_t& v, int32x4_t(&ruv)[4], int32x4_t(&guv)[4], int32x4_t(&buv)[4]) {uint8x16_t v128 = vdupq_n_u8((unsigned char)(128));int8x16_t su = vreinterpretq_s8_u8(vsubq_u8(u, v128));int8x16_t sv = vreinterpretq_s8_u8(vsubq_u8(v, v128));//將16對uv進行位擴展,u、v分別擴展到4個128bit neon寄存器,每個寄存器4個32bit數(shù)據(jù)int16x8_t uu0, uu1, vv0, vv1;v_expand_i8_16(su, uu0, uu1);v_expand_i8_16(sv, vv0, vv1);int32x4_t uu[4], vv[4];v_expand16_32(uu0, uu[0], uu[1]); v_expand16_32(uu1, uu[2], uu[3]);v_expand16_32(vv0, vv[0], vv[1]); v_expand16_32(vv1, vv[2], vv[3]);//相應(yīng)系數(shù)乘以2^20,每個數(shù)據(jù)占用32bitint32x4_t vshift = vdupq_n_s32(1 << (ITUR_BT_601_SHIFT - 1));int32x4_t vr = vdupq_n_s32(ITUR_BT_601_CVR);int32x4_t vg = vdupq_n_s32(ITUR_BT_601_CVG);int32x4_t ug = vdupq_n_s32(ITUR_BT_601_CUG);int32x4_t ub = vdupq_n_s32(ITUR_BT_601_CUB);//計算rgb中與uv相關(guān)的分量,共16組ruv、guv、buvfor (int k = 0; k < 4; k++){ruv[k] = vaddq_s32(vshift, vr * vv[k]);guv[k] = vaddq_s32(vshift, vaddq_s32(vg * vv[k], ug * uu[k]));buv[k] = vaddq_s32(vshift, ub * uu[k]);} }?
static inline void yRGBuvToRGBA(const uint8x16_t& vy,const int32x4_t(&ruv)[4],const int32x4_t(&guv)[4],const int32x4_t(&buv)[4],uint8x16_t& rr, uint8x16_t& gg, uint8x16_t& bb) {uint8x16_t v16 = vdupq_n_u8(16);uint8x16_t posY = vqsubq_u8(vy, v16); //飽和相減指令,<0的值等于0//y值擴展到32bit,與ruv、guv、buv相對應(yīng)uint16x8_t yy0, yy1;v_expand_u8_16(posY, yy0, yy1);int32x4_t yy[4];v_expand16_32(vreinterpretq_s16_u16(yy0), yy[0], yy[1]);v_expand16_32(vreinterpretq_s16_u16(yy1), yy[2], yy[3]);int32x4_t vcy = vdupq_n_s32(ITUR_BT_601_CY);int32x4_t y[4], r[4], g[4], b[4];for (int k = 0; k < 4; k++){y[k] = yy[k] * vcy;r[k] = vshrq_n_s32(vaddq_s32(y[k], ruv[k]), ITUR_BT_601_SHIFT);g[k] = vshrq_n_s32(vaddq_s32(y[k], guv[k]), ITUR_BT_601_SHIFT);b[k] = vshrq_n_s32(vaddq_s32(y[k], buv[k]), ITUR_BT_601_SHIFT);}//將r[0]-r[4]轉(zhuǎn)化為uint8合并到一個neon寄存器中,gb同理int16x8_t r0, r1, g0, g1, b0, b1;r0 = v_pack(r[0], r[1]);r1 = v_pack(r[2], r[3]);g0 = v_pack(g[0], g[1]);g1 = v_pack(g[2], g[3]);b0 = v_pack(b[0], b[1]);b1 = v_pack(b[2], b[3]);rr = v_pack_u(r0, r1);gg = v_pack_u(g0, g1);bb = v_pack_u(b0, b1); }熟悉neon指令的你對上面代碼很容易理解了,沒啥好說的。主要思想就是最大化利用neon寄存器實現(xiàn)并行操作,每次讀取64個y值,16對uv值,一次計算兩行分別32個像素值
總結
以上是生活随笔為你收集整理的YUV2RGB源码详解(参考Opencv4.1)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: yield的用法详解
- 下一篇: linux环境cpp/c文件的makef