hash编码
常用的字符串Hash函數還有ELFHash,APHash等等,都是十分簡單有效的方法。這些函數使用位運算使得每一個字符都對最后的函數值產生影響。另外還有以MD5和SHA1為代表的雜湊函數,這些函數幾乎不可能找到碰撞。
常用字符串哈希函數有 BKDRHash,APHash,DJBHash,JSHash,RSHash,SDBMHash,PJWHash,ELFHash等等。對于以上幾種哈希函數,我對其進行了一個小小的評測。
| Hash函數 | 數據1 | 數據2 | 數據3 | 數據4 | 數據1得分 | 數據2得分 | 數據3得分 | 數據4得分 | 平均分 |
| BKDRHash | 2 | 0 | 4774 | 481 | 96.55 | 100 | 90.95 | 82.05 | 92.64 |
| APHash | 2 | 3 | 4754 | 493 | 96.55 | 88.46 | 100 | 51.28 | 86.28 |
| DJBHash | 2 | 2 | 4975 | 474 | 96.55 | 92.31 | 0 | 100 | 83.43 |
| JSHash | 1 | 4 | 4761 | 506 | 100 | 84.62 | 96.83 | 17.95 | 81.94 |
| RSHash | 1 | 0 | 4861 | 505 | 100 | 100 | 51.58 | 20.51 | 75.96 |
| SDBMHash | 3 | 2 | 4849 | 504 | 93.1 | 92.31 | 57.01 | 23.08 | 72.41 |
| PJWHash | 30 | 26 | 4878 | 513 | 0 | 0 | 43.89 | 0 | 21.95 |
| ELFHash | 30 | 26 | 4878 | 513 | 0 | 0 | 43.89 | 0 | 21.95 |
其中數據1為100000個字母和數字組成的隨機串哈希沖突個數。數據2為100000個有意義的英文句子哈希沖突個數。數據3為數據1的哈希值與 1000003(大素數)求模后存儲到線性表中沖突的個數。數據4為數據1的哈希值與10000019(更大素數)求模后存儲到線性表中沖突的個數。
經過比較,得出以上平均得分。平均數為平方平均數??梢园l現,BKDRHash無論是在實際效果還是編碼實現中,效果都是最突出的。APHash也是較為優秀的算法。DJBHash,JSHash,RSHash與SDBMHash各有千秋。PJWHash與ELFHash效果最差,但得分相似,其算法本質是相似的。
HashFun.h
| unsigned?int?SDBMHash(char?*str) { ????unsigned?int?hash?=?0; ????while?(*str) ????{ ????????//?equivalent?to:?hash?=?65599*hash?+?(*str++); ????????hash?=?(*str++)?+?(hash?<<?6)?+?(hash?<<?16)?-?hash; ????} ????return?(hash?&?0x7FFFFFFF); } //?RS?Hash?Function unsigned?int?RSHash(char?*str) { ????unsigned?int?b?=?378551; ????unsigned?int?a?=?63689; ????unsigned?int?hash?=?0; ????while?(*str) ????{ ????????hash?=?hash?*?a?+?(*str++); ????????a?*=?b; ????} ????return?(hash?&?0x7FFFFFFF); } //?JS?Hash?Function unsigned?int?JSHash(char?*str) { ????unsigned?int?hash?=?1315423911; ????while?(*str) ????{ ????????hash?^=?((hash?<<?5)?+?(*str++)?+?(hash?>>?2)); ????} ????return?(hash?&?0x7FFFFFFF); } //?P.?J.?Weinberger?Hash?Function unsigned?int?PJWHash(char?*str) { ????unsigned?int?BitsInUnignedInt?=?(unsigned?int)(sizeof(unsigned?int)?*?8); ????unsigned?int?ThreeQuarters????=?(unsigned?int)((BitsInUnignedInt??*?3)?/?4); ????unsigned?int?OneEighth????????=?(unsigned?int)(BitsInUnignedInt?/?8); ????unsigned?int?HighBits?????????=?(unsigned?int)(0xFFFFFFFF)?<<?(BitsInUnignedInt?-?OneEighth); ????unsigned?int?hash?????????????=?0; ????unsigned?int?test?????????????=?0; ????while?(*str) ????{ ????????hash?=?(hash?<<?OneEighth)?+?(*str++); ????????if?((test?=?hash?&?HighBits)?!=?0) ????????{ ????????????hash?=?((hash?^?(test?>>?ThreeQuarters))?&?(~HighBits)); ????????} ????} ????return?(hash?&?0x7FFFFFFF); } //?ELF?Hash?Function unsigned?int?ELFHash(char?*str) { ????unsigned?int?hash?=?0; ????unsigned?int?x????=?0; ????while?(*str) ????{ ????????hash?=?(hash?<<?4)?+?(*str++); ????????if?((x?=?hash?&?0xF0000000L)?!=?0) ????????{ ????????????hash?^=?(x?>>?24); ????????????hash?&=?~x; ????????} ????} ????return?(hash?&?0x7FFFFFFF); } //?BKDR?Hash?Function unsigned?int?BKDRHash(char?*str) { ????unsigned?int?seed?=?131;?//?31?131?1313?13131?131313?etc.. ????unsigned?int?hash?=?0; ????while?(*str) ????{ ????????hash?=?hash?*?seed?+?(*str++); ????} ????return?(hash?&?0x7FFFFFFF); } //?DJB?Hash?Function unsigned?int?DJBHash(char?*str) { ????unsigned?int?hash?=?5381; ????while?(*str) ????{ ????????hash?+=?(hash?<<?5)?+?(*str++); ????} ????return?(hash?&?0x7FFFFFFF); } //?AP?Hash?Function unsigned?int?APHash(char?*str) { ????unsigned?int?hash?=?0; ????int?i; ????for?(i=0;?*str;?i++) ????{ ????????if?((i?&?1)?==?0) ????????{ ????????????hash?^=?((hash?<<?7)?^?(*str++)?^?(hash?>>?3)); ????????} ????????else ????????{ ????????????hash?^=?(~((hash?<<?11)?^?(*str++)?^?(hash?>>?5))); ????????} ????} ????return?(hash?&?0x7FFFFFFF); } |
以上為轉載內容 以下是自己測試Hash算法效果的函數。
main.cpp
| #include<iostream> #include<fstream> #include<cstdlib> #include<cstring> #include<ctime> #include"HashFun.h" using?namespace?std; /* ??從Dict.txt讀入英文單詞表,總共24018個單詞 ??測試哈希函數的分布情況 */ void?ReadDict(char?**?dict,int&?len) { ??fstream?dictFile; ??dictFile.open("Dict.txt",ios::in); ??if(!dictFile)??{??cout<<"OpenFailed."<<endl;??exit(-1);} ??char?tmpString[50]; ??len=0; ??int?sl; ??while(!dictFile.eof()) ??{ ????dictFile.getline(tmpString,50); ????sl=strlen(tmpString); ????if(sl>0?&&?tmpString[sl-1]=='\r') ????????????{ ????????????????tmpString[sl-1]='\0'; ????????????????--sl; ????????????} ????if(sl>0) ????{ ????????dict[len]=new?char[sl]; ????????????if(dict[len]==NULL)?{???cout<<"Memory?fail."<<endl;exit(-1);} ????????strcpy(dict[len],tmpString); ????????//cout<<dict[len]<<endl; ????????len++; ????} ??} ??dictFile.close(); } const?int?MaxHashLen=49999; void?Analise(unsigned?int?HashHeight[]) { ????const?unsigned?int?AreaNum=20; ????unsigned?int?AreaHeight[AreaNum]; ????const?unsigned?int?MaxRecordHeight=10;???????//記錄各自出現高度超過10的放在一起統計 ????unsigned??int?NumOfHeight[MaxRecordHeight+1]; ????memset(AreaHeight,0,sizeof(AreaHeight)); ????memset(NumOfHeight,0,?sizeof(NumOfHeight)); ????unsigned?int?step?=?MaxHashLen/AreaNum+1,wordNum=0,MaxHeight=0; ????for(int?i=0;i<MaxHashLen;i++) ????{ ????????wordNum+=HashHeight[i]; ????????if(?HashHeight[i]>=MaxRecordHeight?) ????????????NumOfHeight[MaxRecordHeight]++; ????????else ????????????NumOfHeight[HashHeight[i]]++; ????????if(HashHeight[i]>MaxHeight) ????????????MaxHeight=HashHeight[i]; ????????AreaHeight[i/step]+=HashHeight[i]; ????} ????int?searchtime=0; ????for(unsigned?int?i=1;i<MaxRecordHeight;i++) ????????searchtime+=(i+1)*i*NumOfHeight[i];?//有i*NumOfHeight[i]個的單詞在高度為i的鏈表里 ????searchtime+=((MaxRecordHeight+MaxHeight)/2+1)*(MaxRecordHeight+MaxHeight)/2*NumOfHeight[MaxRecordHeight]; ????searchtime/=2; ????cout<<"--------Hash分布如下"<<endl; ????for(unsigned?int?i=0;i<AreaNum;i++) ????????cout<<"[?"<<i*step<<"?,?"<<(i+1)*?step?<<"?]:?"<<AreaHeight[i]<<endl; ????cout<<"--------鏈表高度情況"<<endl; ????for(unsigned?int?i=0;i<MaxRecordHeight;i++) ????????cout<<"高度為:?"<<i<<?"的鏈表個數為?"?<<NumOfHeight[i]<<endl; ????cout<<"高度大于?"<<MaxRecordHeight<<?"的鏈表個數為?"?<<NumOfHeight[MaxRecordHeight]<<endl; ????cout<<"--------鏈表最高高度:"<<MaxHeight<<endl; ????cout<<"--------單詞表個數為:"<<wordNum<<endl; ????cout<<"--------平均鏈表高度:"<<float(wordNum)/(MaxHashLen-NumOfHeight[0])<<endl; ????cout<<"--------鏈表的利用率:"<<1-float(NumOfHeight[0])/MaxHashLen<<endl; ????cout<<"--------平均查找次數:"<<1.0*searchtime/wordNum<<endl; } int?main() { ????char?*dict[24100];????//存儲單詞表 ??int?wordNum; ??unsigned?int?HashHeight[MaxHashLen],hashvalue; ??memset(HashHeight,0,sizeof(HashHeight)); ??ReadDict(dict,wordNum); ????unsigned?int?startTime,curTime; ????startTime=clock(); ????for(int?i=0;i<wordNum;i++) ????{ ????????hashvalue?=?SDBMHash(?dict[i]?)?%?MaxHashLen; //????????hashvalue?=?RSHash(?dict[i]?)?%?MaxHashLen; //????????hashvalue?=?JSHash(?dict[i]?)?%?MaxHashLen; //????????hashvalue?=?PJWHash(?dict[i]?)?%?MaxHashLen; //????????hashvalue?=?ELFHash(?dict[i]?)?%?MaxHashLen; //????????hashvalue?=?BKDRHash(?dict[i]?)?%?MaxHashLen; //????????hashvalue?=?DJBHash(?dict[i]?)?%?MaxHashLen; //????????hashvalue?=?APHash(?dict[i]?)?%?MaxHashLen; ????????HashHeight[?hashvalue?]++; ????????//cout<<hashvalue<<endl; ????} ????curTime=clock(); ????Analise(HashHeight); ????cout<<"cost?time:"<<(curTime-startTime)/1000?<<"ms"<<endl; ????//for(int?i=0;i<MaxHashLen;i++) ??????//??cout<<HashHeight[i]<<endl; ??return?0; } |
我的測試說明
測試的內容,是從英漢詞典Dict.txt讀取24018個單詞,將哈希值對49999求余,觀察Hash算法的散列情況
英漢詞典Dict.txt在這里下載:http://wenku.baidu.com/view/4ff90a9851e79b8968022695.html
測試結果如下:
| 方法 | 最高高度 | 平均查找次數 |
| BKDRHash | 6 | 1.2383 |
| SDBMHash | 5 | 1.2387 |
| JSHash | 6 | 1.2397 |
| DJBHash | 5 | 1.2403 |
| RSHash | 5 | 1.2417 |
| PJWHash | 5 | 1.2427 |
| ELFHash | 5 | 1.2427 |
| APHash | 6 | 1.2436 |
轉自:
http://apps.hi.baidu.com/share/detail/39630021
http://www.cnblogs.com/atlantis13579/archive/2010/02/06/1664792.html
轉載于:https://www.cnblogs.com/dongtian0359/archive/2011/07/10/2102192.html
總結
- 上一篇: Java2实用教程(第二版)程序代码——
- 下一篇: 如何备份linux系统(转)