hash算法比较
常用的hash算法有BKDRHash,APHash,DJBHash,JSHash,RSHash,SDBMHash,PJWHash,ELFHash等。
C語言版
unsigned int SDBMHash(char *str) {unsigned int hash = 0;while (*str){// equivalent to: hash = 65599*hash + (*str++);hash = (*str++) + (hash << 6) + (hash << 16) - hash;}return (hash & 0x7FFFFFFF); }// RS Hash Function unsigned int RSHash(char *str) {unsigned int b = 378551;unsigned int a = 63689;unsigned int hash = 0;while (*str){hash = hash * a + (*str++);a *= b;}return (hash & 0x7FFFFFFF); }// JS Hash Function unsigned int JSHash(char *str) {unsigned int hash = 1315423911;while (*str){hash ^= ((hash << 5) + (*str++) + (hash >> 2));}return (hash & 0x7FFFFFFF); }// P. J. Weinberger Hash Function unsigned int PJWHash(char *str) {unsigned int BitsInUnignedInt = (unsigned int)(sizeof(unsigned int) * 8);unsigned int ThreeQuarters = (unsigned int)((BitsInUnignedInt * 3) / 4);unsigned int OneEighth = (unsigned int)(BitsInUnignedInt / 8);unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnignedInt - OneEighth);unsigned int hash = 0;unsigned int test = 0;while (*str){hash = (hash << OneEighth) + (*str++);if ((test = hash & HighBits) != 0){hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));}}return (hash & 0x7FFFFFFF); }// ELF Hash Function unsigned int ELFHash(char *str) {unsigned int hash = 0;unsigned int x = 0;while (*str){hash = (hash << 4) + (*str++);if ((x = hash & 0xF0000000L) != 0){hash ^= (x >> 24);hash &= ~x;}}return (hash & 0x7FFFFFFF); }// BKDR Hash Function unsigned int BKDRHash(char *str) {unsigned int seed = 131; // 31 131 1313 13131 131313 etc..unsigned int hash = 0;while (*str){hash = hash * seed + (*str++);}return (hash & 0x7FFFFFFF); }// DJB Hash Function unsigned int DJBHash(char *str) {unsigned int hash = 5381;while (*str){hash += (hash << 5) + (*str++);}return (hash & 0x7FFFFFFF); }// AP Hash Function unsigned int APHash(char *str) {unsigned int hash = 0;int i;for (i=0; *str; i++){if ((i & 1) == 0){hash ^= ((hash << 7) ^ (*str++) ^ (hash >> 3));}else{hash ^= (~((hash << 11) ^ (*str++) ^ (hash >> 5)));}}return (hash & 0x7FFFFFFF); } //編程珠璣中的一個hash函數(shù) //用跟元素個數(shù)最接近的質(zhì)數(shù)作為散列表的大小 #define NHASH 29989 #define MULT 31unsigned in hash(char *p) {unsigned int h = 0;for (; *p; p++)h = MULT *h + *p;return h % NHASH; }C++版
使用了模板支持寬字符串,來源:http://blog.csdn.net/icefireelf/article/details/5796529
// BKDR Hash算法由于在Brian Kernighan與Dennis Ritchie的《The C Programming Language》一書被展示而得名,是一種簡單快捷的hash算法,也是Java目前采用的字符串的Hash算法(累乘因子為31)。 template<class T> size_t BKDRHash(const T *str) {register size_t hash = 0;while (size_t ch = (size_t)*str++){ hash = hash * 131 + ch; // 也可以乘以31、131、1313、13131、131313..// 有人說將乘法分解為位運(yùn)算及加減法可以提高效率,如將上式表達(dá)為:hash = (hash << 7) + (hash << 1) + hash + ch;// 但其實在Intel平臺上,CPU內(nèi)部對二者的處理效率都是差不多的,// 我分別進(jìn)行了100億次的上述兩種運(yùn)算,發(fā)現(xiàn)二者時間差距基本為0(如果是Debug版,分解成位運(yùn)算后的耗時還要高1/3);// 在ARM這類RISC系統(tǒng)上沒有測試過,由于ARM內(nèi)部使用Booth's Algorithm來模擬32位整數(shù)乘法運(yùn)算,它的效率與乘數(shù)有關(guān):// 當(dāng)乘數(shù)8-31位都為1或0時,需要1個時鐘周期// 當(dāng)乘數(shù)16-31位都為1或0時,需要2個時鐘周期// 當(dāng)乘數(shù)24-31位都為1或0時,需要3個時鐘周期// 否則,需要4個時鐘周期// 因此,雖然我沒有實際測試,但是我依然認(rèn)為二者效率上差別不大 }return hash; } // SDBM Hash算法是由于在開源項目SDBM(一種簡單的數(shù)據(jù)庫引擎)中被應(yīng)用而得名,它與BKDRHash思想一致,只是種子不同而已。 template<class T> size_t SDBMHash(const T *str) {register size_t hash = 0;while (size_t ch = (size_t)*str++){//hash = 65599 * hash + ch; hash = (size_t)ch + (hash << 6) + (hash << 16) - hash;}return hash; } // RS Hash 因Robert Sedgwicks在其《Algorithms in C》一書中展示而得名。 template<class T> size_t RSHash(const T *str) {register size_t hash = 0;size_t magic = 63689; while (size_t ch = (size_t)*str++){hash = hash * magic + ch;magic *= 378551;}return hash; } // AP Hash 由Arash Partow發(fā)明的一種hash算法。 template<class T> size_t APHash(const T *str) {register size_t hash = 0;size_t ch;for (long i = 0; ch = (size_t)*str++; i++){if ((i & 1) == 0){hash ^= ((hash << 7) ^ ch ^ (hash >> 3));}else{hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));}}return hash; } // JS Hash 由Justin Sobel發(fā)明的一種hash算法。 template<class T> size_t JSHash(const T *str) {if(!*str) // 這是由本人添加,以保證空字符串返回哈希值0return 0;register size_t hash = 1315423911;while (size_t ch = (size_t)*str++){hash ^= ((hash << 5) + ch + (hash >> 2));}return hash; } // DEK hash算法是由于Donald E. Knuth在《Art Of Computer Programming Volume 3》中展示而得名。 template<class T> size_t DEKHash(const T* str) {if(!*str) // 這是由本人添加,以保證空字符串返回哈希值0return 0;register size_t hash = 1315423911;while (size_t ch = (size_t)*str++){hash = ((hash << 5) ^ (hash >> 27)) ^ ch;}return hash; } // FNV Hash Unix system系統(tǒng)中使用的一種著名hash算法,后來微軟也在其hash_map中實現(xiàn)。 template<class T> size_t FNVHash(const T* str) {if(!*str) // 這是由本人添加,以保證空字符串返回哈希值0return 0;register size_t hash = 2166136261;while (size_t ch = (size_t)*str++){hash *= 16777619;hash ^= ch;}return hash; } // DJB Hash 由Daniel J. Bernstein教授發(fā)明的一種hash算法。 template<class T> size_t DJBHash(const T *str) {if(!*str) // 這是由本人添加,以保證空字符串返回哈希值0return 0;register size_t hash = 5381;while (size_t ch = (size_t)*str++){hash += (hash << 5) + ch;}return hash; } // DJB2 Hash 由Daniel J. Bernstein 發(fā)明的另一種hash算法。 template<class T> size_t DJB2Hash(const T *str) {if(!*str) // 這是由本人添加,以保證空字符串返回哈希值0return 0;register size_t hash = 5381;while (size_t ch = (size_t)*str++){hash = hash * 33 ^ ch;}return hash; } // PJW Hash算法是基于AT&T貝爾實驗室的Peter J. Weinberger的論文而發(fā)明的一種hash算法。 template<class T> size_t PJWHash(const T *str) {static const size_t TotalBits = sizeof(size_t) * 8;static const size_t ThreeQuarters = (TotalBits * 3) / 4;static const size_t OneEighth = TotalBits / 8;static const size_t HighBits = ((size_t)-1) << (TotalBits - OneEighth); register size_t hash = 0;size_t magic = 0; while (size_t ch = (size_t)*str++){hash = (hash << OneEighth) + ch;if ((magic = hash & HighBits) != 0){hash = ((hash ^ (magic >> ThreeQuarters)) & (~HighBits));}}return hash; } // ELF Hash 由于在Unix的Extended Library Function被附帶而得名的一種hash算法,它其實就是PJW Hash的變形。 template<class T> size_t ELFHash(const T *str) {static const size_t TotalBits = sizeof(size_t) * 8;static const size_t ThreeQuarters = (TotalBits * 3) / 4;static const size_t OneEighth = TotalBits / 8;static const size_t HighBits = ((size_t)-1) << (TotalBits - OneEighth); register size_t hash = 0;size_t magic = 0;while (size_t ch = (size_t)*str++){hash = (hash << OneEighth) + ch;if ((magic = hash & HighBits) != 0){hash ^= (magic >> ThreeQuarters);hash &= ~magic;} }return hash; }一些測試數(shù)據(jù):
- http://blog.csdn.net/alburthoffman/article/details/19641123
使用網(wǎng)上提供的一份英語單詞文件:http://www.cs.duke.edu/~ola/ap/linuxwords,共45402個單詞,分別比較上面每一個算法在哈希表長度為100,1000和10000時的最大沖突數(shù),理論上平均為455,46和5。結(jié)果如下:
| bkdrhash | 509 | 72 | 14 |
| aphash | 519 | 72 | 15 |
| jshash | 494 | 66 | 15 |
| rshash | 505 | 74 | 15 |
| sdbmhash | 518 | 67 | 15 |
| pjwhash | 756 | 131 | 34 |
| elfhash | 801 | 158 | 91 |
| djbhash | 512 | 64 | 17 |
| dekhash | 536 | 75 | 22 |
| bphash | 1391 | 696 | 690 |
| fnvhash | 516 | 65 | 14 |
| javahash | 523 | 69 | 16 |
結(jié)論
從上面的統(tǒng)計數(shù)據(jù)可以看出對英文單詞集而言,jshash,djbhash和fnvhash都有很好地分散性。
- http://blog.csdn.net/icefireelf/article/details/5796529
測試1:對100000個由大小寫字母與數(shù)字隨機(jī)的ANSI字符串(無重復(fù),每個字符串最大長度不超過64字符)進(jìn)行散列:
| BKDRHash | 0 | 4826 |
| SDBMHash | 2 | 4814 |
| RSHash | 2 | 4886 |
| APHash | 0 | 4846 |
| ELFHash | 1515 | 6120 |
| JSHash | 779 | 5587 |
| DEKHash | 863 | 5643 |
| FNVHash | 2 | 4872 |
| DJBHash | 832 | 5645 |
| DJB2Hash | 695 | 5309 |
| PJWHash | 1515 | 6120 |
測試2:對100000個由任意UNICODE組成隨機(jī)字符串(無重復(fù),每個字符串最大長度不超過64字符)進(jìn)行散列:
| BKDRHash | 3 | 4710 |
| SDBMHash | 3 | 4904 |
| RSHash | 3 | 4822 |
| APHash | 2 | 4891 |
| ELFHash | 16 | 4869 |
| JSHash | 3 | 4812 |
| DEKHash | 1 | 4755 |
| FNVHash | 1 | 4803 |
| DJBHash | 1 | 4749 |
| DJB2Hash | 2 | 4817 |
| PJWHash | 16 | 4869 |
測試3:對1000000個隨機(jī)ANSI字符串(無重復(fù),每個字符串最大長度不超過64字符)進(jìn)行散列:
| BKDRHash | 109 |
| SDBMHash | 109 |
| RSHash | 124 |
| APHash | 187 |
| ELFHash | 249 |
| JSHash | 172 |
| DEKHash | 140 |
| FNVHash | 125 |
| DJBHash | 125 |
| DJB2Hash | 125 |
| PJWHash | 234 |
結(jié)論:也許是我的樣本存在一些特殊性,在對ASCII碼字符串進(jìn)行散列時,PJW與ELF Hash(它們其實是同一種算法)無論是質(zhì)量還是效率,都相當(dāng)糟糕;例如:”b5”與“aE”,這兩個字符串按照PJW散列出來的hash值就是一樣的。另外,其它幾種依靠異或來散列的哈希函數(shù),如:JS/DEK/DJB Hash,在對字母與數(shù)字組成的字符串的散列效果也不怎么好。相對而言,還是BKDR與SDBM這類簡單的Hash效率與效果更好。
- http://blog.chinaunix.net/uid-15014334-id-5612796.html
| BKDRHash | 2 | 0 | 4774 | 481 | 96.55 | 100 | 90.95 | 82.05 | 92.64 |
| APHash | 2 | 3 | 4754 | 493 | 96.55 | 88.46 | 100 | 51.28 | 86.28 |
| DJBHash | 2 | 2 | 4975 | 474 | 96.55 | 92.31 | 0 | 100 | 83.43 |
| JSHash | 1 | 4 | 4761 | 506 | 100 | 84.62 | 96.83 | 17.95 | 81.94 |
| RSHash | 1 | 0 | 4861 | 505 | 100 | 100 | 51.58 | 20.51 | 75.96 |
| SDBMHash | 3 | 2 | 4849 | 504 | 93.1 | 92.31 | 57.01 | 23.08 | 72.41 |
| PJWHash | 30 | 26 | 4878 | 513 | 0 | 0 | 43.89 | 0 | 21.95 |
| ELFHash | 30 | 26 | 4878 | 513 | 0 | 0 | 43.89 | 0 | 21.95 |
其中數(shù)據(jù)1為100000個字母和數(shù)字組成的隨機(jī)串哈希沖突個數(shù)。數(shù)據(jù)2為100000個有意義的英文句子哈希沖突個數(shù)。數(shù)據(jù)3為數(shù)據(jù)1的哈希值與 1000003(大素數(shù))求模后存儲到線性表中沖突的個數(shù)。數(shù)據(jù)4為數(shù)據(jù)1的哈希值與10000019(更大素數(shù))求模后存儲到線性表中沖突的個數(shù)。
經(jīng)過比較,得出以上平均得分。平均數(shù)為平方平均數(shù)??梢园l(fā)現(xiàn),BKDRHash無論是在實際效果還是編碼實現(xiàn)中,效果都是最突出的。APHash也是較為優(yōu)秀的算法。DJBHash,JSHash,RSHash與SDBMHash各有千秋。PJWHash與ELFHash效果最差,但得分相似,其算法本質(zhì)是相似的。
資料:
- http://blog.csdn.net/icefireelf/article/details/5796529
- http://blog.csdn.net/alburthoffman/article/details/19641123
- http://blog.chinaunix.net/uid-15014334-id-5612796.html
總結(jié)
- 上一篇: ue编辑器漏洞_7. 编辑器漏洞整理
- 下一篇: GPS 车辆追踪软件 GIS系统和分布式