UTF-8, Unicode, GB2312格式串转换之C语言版
原文地址:http://www.cnitblog.com/wujian-IT/archive/2007/12/13/37671.html
/*
 * author:  wu.jian (吳劍)  English name: Sword
 * date:    2007-12-13
 * purpose: 知識共享
 */
這幾天工作上碰到了UTF-8轉GB2312的問題,而且是在嵌入式的環境下,沒有API可用。查了很多網上的資料,大多是調用VC或者Linux下自帶的接口。在這裡我將這兩天的工作做個總結。
總的來說分為兩大步(這裡就不介紹基礎知識了):
一、UTF8 -> Unicode
由於UTF8和Unicode存在著聯繫,所以不需要任何庫就可以直接進行轉換。首先要看懂UTF8的編碼格式:
U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
首字節前面有幾個1,就代表連同首字節在內的幾個字節是屬於同一個字符的。如果要解析一長串UTF8格式的字符串,這點就很有用了。下面這個函數就是判斷前面有幾個1的(這裡有 #define APP_PRINT printf,這樣當release的時候將這個宏定義為空就行了,不需要一個一個去改,又方便重新調試):
??????int GetUtf8ByteNumForWord(u8 firstCh)? //判斷前面幾個1
??????{
??????????u8 temp = 0x80;
??????????int num = 0;
?
??????????while (temp & firstCh)
??????????{
????????????????num++;
?????????????????temp = (temp >> 1);
???????????}
???????APP_PRINT("the num is: %d", num);
????????return num;
??????}
利用這個函數可以得到字符串中哪幾個字節是屬於同一個字符的。因為UTF8最長只有6個字節,所以就根據返回值來分別處理。這裡我只處理了3個字節和1個字節的UTF8編碼,因為一般來說中文在UTF8中是3個字節。
//將len個字節的UTF8格式的轉換成GB2312格式存放在temp預先申請好的緩沖區中
void Utf8ToGb2312(const char* utf8, int len, char *temp)
{
???????APP_PRINT("utf8->unicode: \n");
???????APP_PRINT("utf8: [");
???????for (int k = 0; k < len; k++)
???????{
??????????????APP_PRINT("%02x ", utf8[k]);
???????}
???????APP_PRINT("]\n");
?
???????int byteCount = 0;
???????int i = 0;
???????int j = 0;
???????u16 unicodeKey = 0;
???????u16 gbKey = 0;
??????//循環解析
???????while (i < len)
?????? {???
????????switch(GetUtf8ByteNumForWord((u8)utf8[i]))
????????{
??????????case 0:
????????????????temp[j] = utf8[i];
????????????????byteCount = 1;
??????????break;
????????? case 2:
??????????temp[j] = utf8[i];
??????????temp[j + 1] = utf8[i + 1];
??????????byteCount = 2;
??????????break;
?????????case 3:
?????????????????//這里就開始進行UTF8->Unicode
?????????????????temp[j + 1] = ((utf8[i] & 0x0F) << 4) | ((utf8[i + 1] >> 2) & 0x0F);
?????????????????temp[j] = ((utf8[i + 1] & 0x03) << 6) + (utf8[i + 2] & 0x3F);
??????????????? //取得Unicode的值
?????????????????memcpy(&unicodeKey, (temp + j), 2);
?????????????????APP_PRINT("unicode key is: 0x%04X\n", unicodeKey);
??????????????????//根據這個值查表取得對應的GB2312的值
????????????????gbKey = SearchCodeTable(unicodeKey);
????????????????APP_PRINT("gb2312 key is: 0x%04X\n", gbKey);
????
????????????????if (gbKey != 0)
????????????????{
???????????????????????//here change the byte
????????????????????????//不為0表示搜索到,將高低兩個字節調換調成我要的形式
???????????????????????gbKey = (gbKey >> 8) | (gbKey << 8);
???????????????????????APP_PRINT("after changing, gb2312 key is: 0x%04X\n", gbKey);
???????????????????????memcpy((temp + j), &gbKey, 2);
??????????????????}
????????????????byteCount = 3;
??????????break;
??????????case 4:
??????????byteCount = 4;
??????????break;
?????????case 5:
??????????byteCount = 5;
??????????break;
?????????case 6:
??????????byteCount = 6;
??????????break;
????
?????????default:
??????????APP_PRINT("the len is more than 6\n");
??????????break;????
????????}
????????i += byteCount;
????????if (byteCount == 1)
????????{
???????????????j++;
????????}
????????else
????????{
???????????????j += 2;
????????}
??
???????}
???????APP_PRINT("utf8: [");
???????for (k = 0; k < j; k++)
???????{
??????????????APP_PRINT("%02x ", temp[k]);
???????}
???????APP_PRINT("]\n");
}
二、下面主要談談利用查表法來進行Unicode->GB2312的轉換。首先下載碼表。一般碼表都是將GB2312放在前面,Unicode放在後面,這樣對我們來說不方便使用,所以我轉換了一下,將Unicode放在前面,並且按照從小到大排好序。(這裡只需要考慮都為兩個字節的情況,因為前面的UTF8->Unicode並沒有將單字節的ASCII轉換成Unicode)
(1)做表:(可以到這裡下載:http://blog.91bs.com/?action=show&id=20,這裡謝謝渣渣的豬窩)
這個是原來的樣子:
0x8140 0x4E02 #CJK UNIFIED IDEOGRAPH
0x8141 0x4E04 #CJK UNIFIED IDEOGRAPH
0x8142 0x4E05 #CJK UNIFIED IDEOGRAPH
先弄成下面的形式(這個可以寫個小程序來做,我就是在VC上做的,如果需要可以聯繫我):
{ 0x4E02, 0x8140 }, //CJK UNIFIED IDEOGRAPH
{ 0x4E04, 0x8141 }, //CJK UNIFIED IDEOGRAPH
{ 0x4E05, 0x8142 }, //CJK UNIFIED IDEOGRAPH
這樣就可以把這些放在.h文件中了,下面是我的定義:
/* One row of the conversion table: a 16-bit Unicode value paired with the
 * 16-bit GB code it maps to. */
typedef struct unicode_gb
{
    unsigned short unicode;   /* lookup key: Unicode value         */
    unsigned short gb;        /* value: corresponding GB code      */
} UNICODE_GB;
????????????UNICODE_GB code_table[] =?
????????????{
??????????????????{?0x4E02, 0x8140?},?? //CJK UNIFIED IDEOGRAPH
??????????????????{?0x4E04, 0x8141?},? //CJK UNIFIED IDEOGRAPH
??????????????????{?0x4E05, 0x8142?},? //CJK UNIFIED IDEOGRAPH
??????????????????。。。。。。省略
下面這一步也很簡單:在VC中用冒泡排序法,對這個數組按照unicode值進行排序(如果需要可以聯繫我),把最終結果打印出來。在cmd下運行 name > 1.txt 就可以輸出到文件,這樣就有了一個按照unicode排好序的unicode->gb2312碼表。
???int main(int argc, char *argv[])
{
????int num = 0;
????UNICODE_GB temp;
????int i = 0;
????int j = 0;
????num = sizeof(code_table) / sizeof(UNICODE_GB);
????printf("struct size: %d | total size: %d | num is: %d \n",?
????sizeof(UNICODE_GB), sizeof(code_table), num);
????for (i = 0; i < num; i++)
????{
????????for (j = 1; j < num - i; j++)
????????{
????????????if (code_table[j - 1].unicode > code_table[j].unicode)
????????????{
????????????????temp.unicode = code_table[j - 1].unicode;
????????????????temp.gb = code_table[j - 1].gb;
????????????????code_table[j - 1].unicode = code_table[j].unicode;
????????????????code_table[j - 1].gb = code_table[j].gb;
????????????????code_table[j].unicode = temp.unicode;
????????????????code_table[j].gb = temp.gb;
????????????}
????????}
????}
????printf("here is the code table sorted by unicode\n\n");
????for (i = 0; i < num; i++)
????{
????????printf("{\t0x%04X,\t0x%04X\t},\t\n", code_table[i].unicode, code_table[i].gb);
?????}
???????printf("\n\n print over!\n");
???//以下注釋掉的其實就是我用來對原來的碼表添加,{,}等用的
???/*
????char buff[100];
????char buff_1[100];?
?
????FILE* fp = NULL;
????FILE *fp_1 = NULL;
????memset(buff, 0, 100);
????memset(buff_1, 0, 100);
?
????fp = fopen("table.txt", "rw");
????fp_1 = fopen("table_1.txt", "a+");
????if ((fp == NULL) || (fp_1 == NULL))
????{
????????printf("open file error!\n");
????????return 1;
????}
????while (fgets(buff, 100, fp) != NULL)
????{
????????buff[8] = ',';
????????fputs(buff, fp_1);
????}
?*/
????return 0;
}
最後就是搜索算法了。前面已經排好序了,現在我們把排好序的碼表放在我們真正需要的.h文件中。大家應該猜到我用什麼算法搜索了吧——二分法。
/* Number of entries in code_table. The table is fixed, so the length is
 * hard-coded instead of being computed each time; it must be updated by hand
 * if the table ever changes. */
#define CODE_TABLE_SIZE 21791

/**
 * Look up the GB code for a Unicode value by binary search.
 *
 * code_table must be sorted by ascending `unicode` and contain
 * CODE_TABLE_SIZE entries.
 *
 * @param unicodeKey  16-bit Unicode value to look up
 * @return the matching entry's `gb` value, or 0 when the key is not present.
 */
u16 SearchCodeTable(u16 unicodeKey)
{
    int first = 0;
    int end = CODE_TABLE_SIZE - 1;

    while (first <= end)
    {
        /* Midpoint written so that it cannot overflow for any table size. */
        int mid = first + (end - first) / 2;

        if (code_table[mid].unicode == unicodeKey)
        {
            return code_table[mid].gb;
        }

        if (code_table[mid].unicode > unicodeKey)
        {
            end = mid - 1;      /* key, if present, is in the lower half */
        }
        else
        {
            first = mid + 1;    /* key, if present, is in the upper half */
        }
    }
    return 0;
}
到此,已經能夠將UTF8串轉換成GB2312了。是一長串哦,而不是單個漢字的編碼轉換。
轉載于:https://www.cnblogs.com/alan666/p/8311940.html
總結
以上是生活随笔為你收集整理的UTF-8, Unicode, GB2312格式串转换之C语言版的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 2017/06/23 linu
- 下一篇: Linux时间子系统之(十二):peri