trie 树
trie樹找出一堆字符串的前綴機器重復(fù)次數(shù)。
?
重點在insert, travel_node, 析構(gòu)函數(shù)
?
class EmailNameChar { //a-z A-Z 0-9 . - _ 為用戶名字符 //0-9, 用0-9表示 //a-z, A-Z用10-35表示, 不區(qū)分大小寫 //.36 -37 _38char _c; public:EmailNameChar(){_c = 39;}static bool index(char c, char &idx){if(c >= 'A' && c <= 'Z')idx = c - 'A' + 10;else if(c >= 'a' && c <= 'z')idx = c - 'a' + 10;else if(c >= 0 && c <= 9)idx = c;else if(c == '.')idx = 36;else if(c == '-')idx = 37;else if(c == '_')idx = 38;elsereturn false;return true;}static bool idxToc(char idx, char &c){if(idx <= 9)c = idx - 0 + '0';else if(idx <= 35)c = idx - 10 + 'a';else if(idx == 36)c = '.';else if(idx == 37)c = '-';else if(idx == 38)c = '_';elsereturn false;return true;}bool set(char c){if(index(c, _c)){return true;}else{return false;}}bool get(char &c) const{if(idxToc(_c, c))return true;return false;} };const static int num_chars = 39;struct PrefixInfo {int prefixlen;int matchtimes;string result; //prefixPrefixInfo(): prefixlen(0), matchtimes(0), result(), spamscore(-1) {}PrefixInfo(int dep, int c, const string &r): prefixlen(dep), matchtimes(c), result(r),spamscore(-1) {}double spamicity(int tocount, unsigned int maxtolen){if(spamscore < 0){if(tocount < PrefixInfo::MINTOCOUNT)spamscore = 0;else if(matchtimes >= PrefixInfo::MINREPEAT && prefixlen >= PrefixInfo::MINPREFIXLEN)spamscore = 1;else spamscore = 1 - (1 - matchtimes / (tocount + 0.0)) * (1 - prefixlen / (maxtolen + 0.0));}return spamscore;}const static int MINTOCOUNT = 4;const static int MINPREFIXLEN = 5;const static int MINREPEAT = 5; private:double spamscore; };struct PrefixInfoFilter {virtual bool operator()(const PrefixInfo & l) const{return true;}virtual ~PrefixInfoFilter(){} };struct PrefixInfoCountFilter: public PrefixInfoFilter {bool operator()(const PrefixInfo & l) const{if(l.matchtimes <= 2)return false;if(l.prefixlen <= 3)return false;return true;} };class Trie { protected:struct Trie_node{ int count;Trie_node *branch[num_chars];Trie_node(){ count = 1;for(int i = 0; i < num_chars; i++)branch[i] = 0;} };void delete_node(Trie_node * node){if(node == 0)return;for(int i = 0; i < num_chars; i++){delete_node(node->branch[i]);}delete node;}public:Trie():root(0), tocount(0), maxtolength(0){}~Trie(){delete_node(root); }//-1 failure//1 uniq wordint insert(const string &to){int np;if((np = to.find('@')) == string::npos){cerr<<"invalid to address: " << to << endl;return -1; //invalid to address}string name = to.substr(0, np);const char *word = name.c_str();int result = 1, position = 0;if(root == 0)root = new Trie_node;char char_code;Trie_node *location = root;while(location != 0 && *word != 0){if(EmailNameChar::index(*word, char_code) == false){cerr<<"invalid char: " << int(*word) <<". in to address: " << to<< endl;return -1;}if(location->branch[char_code] == 0)location->branch[char_code] = new Trie_node;elselocation->branch[char_code]->count++;location = location->branch[char_code];position++;word++;}if(to.length() > maxtolength)maxtolength = to.length();++tocount;return result;}int travel(vector<PrefixInfo> &vpi, const PrefixInfoFilter& f) const{if(root == 0)return -1;char result[maxtolen];for(int i = 0; i < num_chars; i++)travel_node(vpi, i, root->branch[i], 0, f, result);return tocount;}int gettocount() const{return tocount;}int getmaxtolength() const{return maxtolength;} private:void travel_node(vector<PrefixInfo> &vpi, int idx, const Trie_node * const node, int depth, const PrefixInfoFilter& f, char *result = 0) const{if(node == 0)return;if(depth >= maxtolen - 1)return;if(result){char c;if(EmailNameChar::idxToc(idx, c) == false){assert(0);}result[depth] = c, result[depth+1] = 0;}++depth;bool isPrefix = true;bool isTail = true;for(int i = 0; i < num_chars; i++){if(node->branch[i]){isTail = false;if(node->count <= node->branch[i]->count){isPrefix = false;break;}}}PrefixInfo curnode(depth, node->count, result);if(!isTail && isPrefix && f(curnode))vpi.push_back(curnode);for(int i = 0; i < num_chars; i++){travel_node(vpi, i, node->branch[i], depth, f, result);}}protected:Trie_node *root; //根int tocount; //樹中已有用戶個數(shù)unsigned int maxtolength; //樹中已知用戶名的最大長度const static int maxtolen = 200; //一個用戶名的最大長度 };
總結(jié)
- 上一篇: weka: naive bayes
- 下一篇: 特征选择---文本分类:叉方统计量