禁用字检测
utf8編碼的數據可直接使用下面的代碼
最關鍵的步驟就是把字符串拆成單個字,UTF-8編碼的字,如果只有一個字節則其最高二進制位為0;如果是多字節,其第一個字節從最高位開始,連續的二進制位值為1的個數決定了其編碼的位數,其余各字節均以10開頭。
UTF-8最多可用到6個字節(現行標準 RFC 3629 只允許1到4個字節,5、6字節形式僅見於早期定義)。
1字節 0xxxxxxx
2字節 110xxxxx 10xxxxxx
3字節 1110xxxx 10xxxxxx 10xxxxxx
4字節 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字節 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字節 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
其它就很簡單了
1、禁用字處理:禁用字拆分后以第一個字為key保存
2、待測試字符串
   a)、拆分成單字
   b)、大寫轉小寫,字母和空格全角轉半角,去掉多余空格(英文字母后最多只會有一個空格,中文后不應該有空格)
   c)、遍歷字符串的所有字,檢測每個字對應的禁用字組是否在待測字符串中
#include <string>
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <sstream>
#include <string.h>
#include <stdio.h>

// Forbidden-word detector for UTF-8 text.
//
// Forbidden words are indexed by their first UTF-8 unit (one encoded
// character). A candidate string is split into units, folded (upper case to
// lower case, full-width letters/space to their ASCII forms), and then every
// distinct unit of the string is used to look up the forbidden words that
// start with it; each of those is searched for in the folded string and in a
// variant of it with redundant spaces removed.
//
// Define DISABLE_WORD_NO_MAIN before this file to leave out the interactive
// main() (for embedding the class elsewhere or for unit tests).
class CDisableWord
{
    struct SDisableWord
    {
        std::string str;   // forbidden word, already folded
    };
    typedef std::vector<SDisableWord> VDW;

private:
    // First unit of a forbidden word -> all forbidden words starting with it.
    std::map<std::string, VDW> m_mapDisableWord;
    // Every folded forbidden word registered so far (duplicate detection).
    std::set<std::string> m_setAllDisableWord;
    // Unit -> replacement unit used for case/width folding.
    std::map<std::string, std::string> m_mapSpecialWord;

private:
    // Splits a UTF-8 byte sequence into single-character units.
    size_t SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output);
    // Returns the folded replacement for a unit, or NULL when it has none.
    const std::string* GetSpecialWord(const std::string& str);
    // Applies GetSpecialWord() to every unit in place.
    void FoldWords(std::vector<std::string>& words);

public:
    CDisableWord();
    // Registers one forbidden word.
    void AddOneDisableWord(const std::string& str);
    // Both overloads return true when the text contains no forbidden word.
    bool CheckStr(const char* pSrc, unsigned int len);
    bool CheckStr(const std::string& str);
};

// Builds the folding table: 'A'..'Z' -> 'a'..'z', full-width letters of either
// case -> ASCII lower case, ideographic space -> ASCII space.
CDisableWord::CDisableWord()
{
    const std::string upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const std::string lower = "abcdefghijklmnopqrstuvwxyz";

    for (int i = 0; i < 26; ++i)
    {
        const std::string target(1, lower[i]);

        // The full-width forms are spelled as explicit UTF-8 bytes so that a
        // re-encoding of this source file cannot silently turn them into
        // their ASCII look-alikes.
        // U+FF21 + i (full-width 'A'..'Z') encodes as EF BC (A1 + i).
        std::string fullUpper("\xEF\xBC");
        fullUpper += static_cast<char>(static_cast<unsigned char>(0xA1 + i));
        // U+FF41 + i (full-width 'a'..'z') encodes as EF BD (81 + i).
        std::string fullLower("\xEF\xBD");
        fullLower += static_cast<char>(static_cast<unsigned char>(0x81 + i));

        m_mapSpecialWord[fullUpper] = target;
        m_mapSpecialWord[fullLower] = target;
        m_mapSpecialWord[std::string(1, upper[i])] = target;
    }

    // U+3000 IDEOGRAPHIC SPACE (full-width space) -> ASCII space.
    m_mapSpecialWord[std::string("\xE3\x80\x80")] = std::string(" ");
}

// Splits pSrc[0, len) into UTF-8 units and appends them to output.
//
// The sequence length is taken from the lead byte (up to the legacy 6-byte
// form). If the sequence is truncated or one of its continuation bytes is not
// of the form 10xxxxxx, the lead byte is emitted as a one-byte unit and
// scanning continues, so malformed bytes can never hide the remainder of the
// input from the caller.
//
// Returns output.size() after appending.
size_t CDisableWord::SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output)
{
    if (pSrc == NULL)
        return output.size();

    unsigned int i = 0;
    while (i < len)
    {
        const unsigned char lead = static_cast<unsigned char>(pSrc[i]);

        unsigned int wlen = 1;
        if (lead >= 0xFC)
            wlen = 6;
        else if (lead >= 0xF8)
            wlen = 5;
        else if (lead >= 0xF0)
            wlen = 4;
        else if (lead >= 0xE0)
            wlen = 3;
        else if (lead >= 0xC0)
            wlen = 2;

        if (wlen > 1)
        {
            // "len - i" cannot underflow because i < len here.
            bool wellFormed = (wlen <= len - i);
            for (unsigned int j = 1; wellFormed && j < wlen; ++j)
                wellFormed = (static_cast<unsigned char>(pSrc[i + j]) & 0xC0) == 0x80;
            if (!wellFormed)
                wlen = 1;
        }

        output.push_back(std::string(pSrc + i, wlen));
        i += wlen;
    }
    return output.size();
}

// Looks up the folded form of one unit; NULL means "keep the unit as is".
const std::string* CDisableWord::GetSpecialWord(const std::string& str)
{
    std::map<std::string, std::string>::const_iterator miter = m_mapSpecialWord.find(str);
    if (miter == m_mapSpecialWord.end())
        return NULL;
    return &(miter->second);
}

// Replaces every unit that has a folded form with that form.
void CDisableWord::FoldWords(std::vector<std::string>& words)
{
    for (size_t i = 0; i < words.size(); ++i)
    {
        const std::string* pStr = GetSpecialWord(words[i]);
        if (pStr)
            words[i] = *pStr;
    }
}

// Registers a forbidden word. Empty words and words already registered are
// ignored. The word is folded the same way CheckStr() folds its input;
// otherwise a forbidden word containing upper-case or full-width letters
// could never match the folded text.
void CDisableWord::AddOneDisableWord(const std::string& str)
{
    std::vector<std::string> units;
    if (SplitWord(str.c_str(), static_cast<unsigned int>(str.size()), units) == 0)
        return;
    FoldWords(units);

    std::string folded;
    for (size_t i = 0; i < units.size(); ++i)
        folded += units[i];

    // insert() reports whether the word was new; this is what keeps the
    // duplicate check working.
    if (!m_setAllDisableWord.insert(folded).second)
        return;

    SDisableWord sdw;
    sdw.str = folded;
    m_mapDisableWord[units[0]].push_back(sdw);
}

// Checks len bytes starting at pSrc. A null pointer or zero length counts as
// empty text and therefore passes.
bool CDisableWord::CheckStr(const char* pSrc, unsigned int len)
{
    if (pSrc == NULL || len == 0)
        return true;
    return CheckStr(std::string(pSrc, len));
}

// Returns true when str contains no forbidden word, false otherwise.
bool CDisableWord::CheckStr(const std::string& str)
{
    if (str.empty())
        return true;

    std::vector<std::string> units;
    if (SplitWord(str.c_str(), static_cast<unsigned int>(str.size()), units) == 0)
        return false;

    // Upper case -> lower case, full-width -> half-width.
    FoldWords(units);

    std::string StrSrc;         // folded text
    std::string StrDelSpace;    // folded text with redundant spaces removed
    std::set<std::string> sonly; // distinct units occurring in the text

    bool haveNonSpace = false;  // a non-space unit has been seen
    size_t lastNonSpace = 0;    // index of the most recent non-space unit

    for (size_t i = 0; i < units.size(); ++i)
    {
        const std::string& unit = units[i];
        sonly.insert(unit);
        StrSrc += unit;

        bool keep = true;
        if (unit == " ")
        {
            // A space at position 0 is kept. Any later space survives only
            // when it directly follows a single-byte (ASCII) unit: one space
            // is allowed after an English character, none after a multi-byte
            // character, and runs of spaces collapse.
            if (i > 0)
                keep = haveNonSpace
                    && units[lastNonSpace].size() == 1
                    && lastNonSpace + 1 == i;
        }
        else
        {
            haveNonSpace = true;
            lastNonSpace = i;
        }

        if (keep)
            StrDelSpace += unit;
    }

    const bool bSame = (StrDelSpace == StrSrc);

    for (std::set<std::string>::const_iterator siter = sonly.begin(); siter != sonly.end(); ++siter)
    {
        std::map<std::string, VDW>::const_iterator miter = m_mapDisableWord.find(*siter);
        if (miter == m_mapDisableWord.end())
            continue;

        const VDW& candidates = miter->second;
        for (size_t j = 0; j < candidates.size(); ++j)
        {
            const std::string& word = candidates[j].str;
            if (StrSrc.find(word) != std::string::npos)
                return false;
            // Skip the second search when both variants are the same text.
            if (!bSame && StrDelSpace.find(word) != std::string::npos)
                return false;
        }
    }
    return true;
}

#ifndef DISABLE_WORD_NO_MAIN
// Interactive demo: reads lines from stdin until EOF and reports whether each
// one contains a forbidden word.
int main()
{
    CDisableWord cdw;

    // Forbidden words used by the demo.
    const std::string strdw[] = {"中文", "英文", "測試", "aabb", "測 試", "cc dd"};
    const size_t count = sizeof(strdw) / sizeof(strdw[0]);
    for (size_t i = 0; i < count; ++i)
        cdw.AddOneDisableWord(strdw[i]);

    // std::getline has no fixed buffer, so long lines neither get cut in the
    // middle of a multi-byte character nor leave the stream in a failed
    // state, and the loop ends cleanly on EOF.
    std::string line;
    while (std::getline(std::cin, line))
    {
        if (cdw.CheckStr(line))
            printf("收到:%s  沒有敏感字\n", line.c_str());
        else
            printf("收到:%s  敏感字 敏感字 敏感字\n", line.c_str());
    }
    return 0;
}
#endif // DISABLE_WORD_NO_MAIN

// Build: g++ -g -o DisableWord DisableWord.cpp
// Reposted from: https://blog.51cto.com/13611395/2301670
總結
- 上一篇: 大数据之Linux早课9.21
- 下一篇: 9月份个人:windows系统的DNS服