當前位置：首頁 > 运维知识 > windows >内容正文

windows

不依赖任何系统API，用c语言实现gbk/utf8/unicode编码转换

發布時間：2024/1/23 windows 36 豆豆

生活随笔收集整理的這篇文章主要介紹了不依赖任何系统API，用c语言实现gbk/utf8/unicode编码转换小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

轉載地址:https://blog.csdn.net/bladeandmaster88/article/details/54837338

漢字'我'

Unicode編碼是 0x6211，二進制為: 01100010 00010001

UTF8編碼是 0xe68891，二進制為: 11100110 10001000 10010001

Unicode符號范圍(十六進制) | UTF-8編碼方式(二進制)
0x00 - 0x7F | 0zzzzzzz
0x80 - 0x7FF | 110yyyyy 10zzzzzz
0x800 - 0xFFFF | 1110xxxx 10yyyyyy 10zzzzzz
0x10000 - 0x1FFFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0x200000 - 0x3FFFFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0x4000000 - 0x7FFFFFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

一、Unicode轉utf8

‘我’的unicode編碼0x6211,二進制為: 01100010 00010001

將二進制形式分割成3段為 0110 001000 010001 (分別是高4位、中間的6位、最后的低6位)

unicode轉utf8只需要這3段分別填入1110xxxx 10yyyyyy 10zzzzzz中的xxxx yyyyyy zzzzzz

得utf8編碼是0xe68891,二進制為: 11100110 10001000 10010001

/*
 * Convert a UCS-2 little-endian ("Unicode") byte string to UTF-8.
 *
 * Parameters:
 *   pInput  - input bytes, two per character, low byte first. Conversion
 *             stops at the first character whose LOW byte is 0.
 *   pOutput - receives the UTF-8 bytes followed by a single '\0'. The caller
 *             must provide room for up to 3 bytes per input character plus 1.
 *
 * Returns the number of UTF-8 bytes written (terminator not counted), or -1
 * if a surrogate code unit (0xD800-0xDFFF) is encountered. On -1 the output
 * written so far is left unterminated.
 *
 * NOTE: only the low byte is inspected to detect the end of input, so a
 * character whose low byte is 0x00 (for example U+4E00) ends the conversion
 * early. This is kept for compatibility with callers that terminate the
 * input with a single zero byte.
 */
int UnicodeToUtf8(char *pInput, char *pOutput)
{
    /* Work on unsigned bytes: plain char may be signed, and sign extension
     * would corrupt every code unit that contains a byte >= 0x80. */
    const unsigned char *in = (const unsigned char *)pInput;
    unsigned char *out = (unsigned char *)pOutput;
    int len = 0; /* number of UTF-8 bytes produced so far */

    while (in[0] != 0) {
        /* Assemble one 16-bit code unit from its low and high bytes. */
        unsigned int code = ((unsigned int)in[1] << 8) | in[0];

        if (code <= 0x7F) {
            /* 0zzzzzzz */
            out[len++] = (unsigned char)code;
        } else if (code <= 0x7FF) {
            /* 110yyyyy 10zzzzzz */
            out[len++] = (unsigned char)(0xC0 | (code >> 6));
            out[len++] = (unsigned char)(0x80 | (code & 0x3F));
        } else if (code >= 0xD800 && code <= 0xDFFF) {
            /* Surrogates are not characters and have no UTF-8 encoding. */
            return -1;
        } else {
            /* 0x800 - 0xFFFF inclusive: 1110xxxx 10yyyyyy 10zzzzzz */
            out[len++] = (unsigned char)(0xE0 | (code >> 12));
            out[len++] = (unsigned char)(0x80 | ((code >> 6) & 0x3F));
            out[len++] = (unsigned char)(0x80 | (code & 0x3F));
        }

        in += 2; /* advance to the next 2-byte character */
    }

    out[len] = 0; /* UTF-8 string terminator */
    return len;
}

二、utf8轉unicode

utf8二進制形式為1110xxxx 10yyyyyy 10zzzzzz

'我'的utf8編碼0xe68891,二進制為:11100110 10001000 10010001

分別提取里面的xxxx yyyyyy zzzzzz,然后組合成xxxxyyyy yyzzzzzz,

xxxxyyyy就是unicode的高8位，yyzzzzzz就是unicode的低8位

/*
 * Convert a UTF-8 string to Unicode (UCS-2LE: the low byte of each character
 * is stored at the lower address).
 *
 * Parameters:
 *   pInput  - '\0'-terminated UTF-8 input string.
 *   pOutput - receives two bytes per character followed by two zero bytes.
 *             The caller must provide 2 bytes per input character plus 2.
 *
 * Returns the number of bytes written (the two terminating zero bytes are not
 * counted), or -1 if the input contains a malformed sequence or a sequence
 * longer than three bytes. On -1 the output written so far is left
 * unterminated.
 */
int Utf8ToUnicode(char *pInput, char *pOutput)
{
    /* Work on unsigned bytes so lead/continuation bytes (all >= 0x80)
     * compare and shift correctly even when plain char is signed. */
    const unsigned char *in = (const unsigned char *)pInput;
    unsigned char *out = (unsigned char *)pOutput;
    int outputSize = 0; /* number of output bytes produced so far */

    while (in[0] != 0) {
        unsigned int code;

        if (in[0] <= 0x7F) {
            /* Single byte: 0zzzzzzz */
            code = in[0];
            in += 1;
        } else if ((in[0] & 0xE0) == 0xC0) {
            /* Two bytes: 110yyyyy 10zzzzzz */
            if ((in[1] & 0xC0) != 0x80) {
                return -1;
            }
            code = ((unsigned int)(in[0] & 0x1F) << 6) | (in[1] & 0x3F);
            in += 2;
        } else if ((in[0] & 0xF0) == 0xE0) {
            /* Three bytes: 1110xxxx 10yyyyyy 10zzzzzz.
             * in[1] is validated before in[2] is read, so a truncated
             * sequence never reads past the terminating '\0'. */
            if ((in[1] & 0xC0) != 0x80 || (in[2] & 0xC0) != 0x80) {
                return -1;
            }
            code = ((unsigned int)(in[0] & 0x0F) << 12)
                 | ((unsigned int)(in[1] & 0x3F) << 6)
                 | (in[2] & 0x3F);
            in += 3;
        } else {
            /* Stray continuation byte or a 4+ byte sequence: not handled. */
            return -1;
        }

        /* Little-endian: low byte first, then high byte. */
        out[outputSize] = (unsigned char)(code & 0xFF);
        out[outputSize + 1] = (unsigned char)(code >> 8);
        outputSize += 2;
    }

    /* The Unicode string is terminated by two zero bytes. */
    out[outputSize] = 0;
    out[outputSize + 1] = 0;
    return outputSize;
}

/* Defined earlier in this file. */
int UnicodeToUtf8(char *pInput, char *pOutput);

/*
 * Usage example. The character U+6211 is 0xE68891 in UTF-8 and 0x6211 in
 * Unicode; both directions are converted and dumped as hex.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    /* 1. Unicode -> UTF-8: 0x6211 stored low byte first. */
    char unicodeStr[3] = {0x11, 0x62, 0x00};
    char utf8Str[5] = {0};
    int num = UnicodeToUtf8(unicodeStr, utf8Str);
    for (int i = 0; i < num; i++) {
        printf("%02x", (unsigned int)(unsigned char)utf8Str[i]);
    }
    printf("\n"); /* prints e68891 */

    /* 2. UTF-8 -> Unicode. */
    char utf8Input[4] = {(char)0xe6, (char)0x88, (char)0x91, 0x00};
    char unicodeOutput[8] = {0};
    int num2 = Utf8ToUnicode(utf8Input, unicodeOutput);
    if (num2 == -1) {
        printf("Error!\n");
    } else {
        for (int i = 0; i < num2; i++) {
            printf("%02x", (unsigned int)(unsigned char)unicodeOutput[i]);
        }
        printf("\n"); /* prints 1162 */
    }

    return 0;
}

三、gbk與unicode互轉

代碼下載地址：c語言利用編碼轉換表實現gbk與unicode互轉

參照博客：

http://blog.csdn.net/tge7618291/article/details/7599902
http://www.ithao123.cn/content-1832906.html

總結

以上是生活随笔為你收集整理的不依赖任何系统API，用c语言实现gbk/utf8/unicode编码转换的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： [编程笔记] UNICODE和UTF-8
下一篇： C/C++中ASCII与Unicode字