當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

hash编码

發布時間：2023/12/10 编程问答 22 豆豆

生活随笔收集整理的這篇文章主要介紹了 hash编码小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

常用的字符串Hash函數還有ELFHash，APHash等等，都是十分簡單有效的方法。這些函數使用位運算使得每一個字符都對最后的函數值產生影響。另外還有以MD5和SHA1為代表的雜湊函數，這些函數幾乎不可能找到碰撞。

常用字符串哈希函數有 BKDRHash，APHash，DJBHash，JSHash，RSHash，SDBMHash，PJWHash，ELFHash等等。對于以上幾種哈希函數，我對其進行了一個小小的評測。

Hash函數	數據1	數據2	數據3	數據4	數據1得分	數據2得分	數據3得分	數據4得分	平均分
BKDRHash	2	0	4774	481	96.55	100	90.95	82.05	92.64
APHash	2	3	4754	493	96.55	88.46	100	51.28	86.28
DJBHash	2	2	4975	474	96.55	92.31	0	100	83.43
JSHash	1	4	4761	506	100	84.62	96.83	17.95	81.94
RSHash	1	0	4861	505	100	100	51.58	20.51	75.96
SDBMHash	3	2	4849	504	93.1	92.31	57.01	23.08	72.41
PJWHash	30	26	4878	513	0	0	43.89	0	21.95
ELFHash	30	26	4878	513	0	0	43.89	0	21.95

其中數據1為100000個字母和數字組成的隨機串哈希沖突個數。數據2為100000個有意義的英文句子哈希沖突個數。數據3為數據1的哈希值與 1000003(大素數)求模后存儲到線性表中沖突的個數。數據4為數據1的哈希值與10000019(更大素數)求模后存儲到線性表中沖突的個數。

經過比較，得出以上平均得分。平均數為平方平均數?？梢园l現，BKDRHash無論是在實際效果還是編碼實現中，效果都是最突出的。APHash也是較為優秀的算法。DJBHash,JSHash,RSHash與SDBMHash各有千秋。PJWHash與ELFHash效果最差，但得分相似，其算法本質是相似的。

HashFun.h

unsigned?int?SDBMHash(char?*str)
{
????unsigned?int?hash?=?0;

????while?(*str)
????{
????????//?equivalent?to:?hash?=?65599*hash?+?(*str++);
????????hash?=?(*str++)?+?(hash?<<?6)?+?(hash?<<?16)?-?hash;
????}

????return?(hash?&?0x7FFFFFFF);
}

//?RS?Hash?Function
unsigned?int?RSHash(char?*str)
{
????unsigned?int?b?=?378551;
????unsigned?int?a?=?63689;
????unsigned?int?hash?=?0;

????while?(*str)
????{
????????hash?=?hash?*?a?+?(*str++);
????????a?*=?b;
????}

????return?(hash?&?0x7FFFFFFF);
}

//?JS?Hash?Function
unsigned?int?JSHash(char?*str)
{
????unsigned?int?hash?=?1315423911;

????while?(*str)
????{
????????hash?^=?((hash?<<?5)?+?(*str++)?+?(hash?>>?2));
????}

????return?(hash?&?0x7FFFFFFF);
}

//?P.?J.?Weinberger?Hash?Function
unsigned?int?PJWHash(char?*str)
{
????unsigned?int?BitsInUnignedInt?=?(unsigned?int)(sizeof(unsigned?int)?*?8);
????unsigned?int?ThreeQuarters????=?(unsigned?int)((BitsInUnignedInt??*?3)?/?4);
????unsigned?int?OneEighth????????=?(unsigned?int)(BitsInUnignedInt?/?8);
????unsigned?int?HighBits?????????=?(unsigned?int)(0xFFFFFFFF)?<<?(BitsInUnignedInt?-?OneEighth);
????unsigned?int?hash?????????????=?0;
????unsigned?int?test?????????????=?0;

????while?(*str)
????{
????????hash?=?(hash?<<?OneEighth)?+?(*str++);
????????if?((test?=?hash?&?HighBits)?!=?0)
????????{
????????????hash?=?((hash?^?(test?>>?ThreeQuarters))?&?(~HighBits));
????????}
????}

????return?(hash?&?0x7FFFFFFF);
}

//?ELF?Hash?Function
unsigned?int?ELFHash(char?*str)
{
????unsigned?int?hash?=?0;
????unsigned?int?x????=?0;

????while?(*str)
????{
????????hash?=?(hash?<<?4)?+?(*str++);
????????if?((x?=?hash?&?0xF0000000L)?!=?0)
????????{
????????????hash?^=?(x?>>?24);
????????????hash?&=?~x;
????????}
????}

????return?(hash?&?0x7FFFFFFF);
}

//?BKDR?Hash?Function
unsigned?int?BKDRHash(char?*str)
{
????unsigned?int?seed?=?131;?//?31?131?1313?13131?131313?etc..
????unsigned?int?hash?=?0;

????while?(*str)
????{
????????hash?=?hash?*?seed?+?(*str++);
????}

????return?(hash?&?0x7FFFFFFF);
}

//?DJB?Hash?Function
unsigned?int?DJBHash(char?*str)
{
????unsigned?int?hash?=?5381;

????while?(*str)
????{
????????hash?+=?(hash?<<?5)?+?(*str++);
????}

????return?(hash?&?0x7FFFFFFF);
}

//?AP?Hash?Function
unsigned?int?APHash(char?*str)
{
????unsigned?int?hash?=?0;
????int?i;

????for?(i=0;?*str;?i++)
????{
????????if?((i?&?1)?==?0)
????????{
????????????hash?^=?((hash?<<?7)?^?(*str++)?^?(hash?>>?3));
????????}
????????else
????????{
????????????hash?^=?(~((hash?<<?11)?^?(*str++)?^?(hash?>>?5)));
????????}
????}
????return?(hash?&?0x7FFFFFFF);
}

以上為轉載內容以下是自己測試Hash算法效果的函數。

main.cpp

#include<iostream>
#include<fstream>
#include<cstdlib>
#include<cstring>
#include<ctime>
#include"HashFun.h"
using?namespace?std;
/*
??從Dict.txt讀入英文單詞表，總共24018個單詞
??測試哈希函數的分布情況

*/
void?ReadDict(char?**?dict,int&?len)
{
??fstream?dictFile;
??dictFile.open("Dict.txt",ios::in);
??if(!dictFile)??{??cout<<"OpenFailed."<<endl;??exit(-1);}
??char?tmpString[50];
??len=0;
??int?sl;
??while(!dictFile.eof())
??{
????dictFile.getline(tmpString,50);

????sl=strlen(tmpString);
????if(sl>0?&&?tmpString[sl-1]=='\r')
????????????{
????????????????tmpString[sl-1]='\0';
????????????????--sl;
????????????}
????if(sl>0)
????{
????????dict[len]=new?char[sl];
????????????if(dict[len]==NULL)?{???cout<<"Memory?fail."<<endl;exit(-1);}
????????strcpy(dict[len],tmpString);
????????//cout<<dict[len]<<endl;
????????len++;
????}
??}
??dictFile.close();
}

const?int?MaxHashLen=49999;
void?Analise(unsigned?int?HashHeight[])
{
????const?unsigned?int?AreaNum=20;
????unsigned?int?AreaHeight[AreaNum];
????const?unsigned?int?MaxRecordHeight=10;???????//記錄各自出現高度超過10的放在一起統計
????unsigned??int?NumOfHeight[MaxRecordHeight+1];
????memset(AreaHeight,0,sizeof(AreaHeight));
????memset(NumOfHeight,0,?sizeof(NumOfHeight));
????unsigned?int?step?=?MaxHashLen/AreaNum+1,wordNum=0,MaxHeight=0;

????for(int?i=0;i<MaxHashLen;i++)
????{
????????wordNum+=HashHeight[i];
????????if(?HashHeight[i]>=MaxRecordHeight?)
????????????NumOfHeight[MaxRecordHeight]++;
????????else
????????????NumOfHeight[HashHeight[i]]++;
????????if(HashHeight[i]>MaxHeight)
????????????MaxHeight=HashHeight[i];

????????AreaHeight[i/step]+=HashHeight[i];

????}
????int?searchtime=0;
????for(unsigned?int?i=1;i<MaxRecordHeight;i++)
????????searchtime+=(i+1)*i*NumOfHeight[i];?//有i*NumOfHeight[i]個的單詞在高度為i的鏈表里
????searchtime+=((MaxRecordHeight+MaxHeight)/2+1)*(MaxRecordHeight+MaxHeight)/2*NumOfHeight[MaxRecordHeight];
????searchtime/=2;
????cout<<"--------Hash分布如下"<<endl;
????for(unsigned?int?i=0;i<AreaNum;i++)
????????cout<<"[?"<<i*step<<"?,?"<<(i+1)*?step?<<"?]:?"<<AreaHeight[i]<<endl;

????cout<<"--------鏈表高度情況"<<endl;
????for(unsigned?int?i=0;i<MaxRecordHeight;i++)
????????cout<<"高度為:?"<<i<<?"的鏈表個數為?"?<<NumOfHeight[i]<<endl;
????cout<<"高度大于?"<<MaxRecordHeight<<?"的鏈表個數為?"?<<NumOfHeight[MaxRecordHeight]<<endl;

????cout<<"--------鏈表最高高度："<<MaxHeight<<endl;
????cout<<"--------單詞表個數為："<<wordNum<<endl;
????cout<<"--------平均鏈表高度："<<float(wordNum)/(MaxHashLen-NumOfHeight[0])<<endl;
????cout<<"--------鏈表的利用率："<<1-float(NumOfHeight[0])/MaxHashLen<<endl;
????cout<<"--------平均查找次數："<<1.0*searchtime/wordNum<<endl;
}

int?main()
{
????char?*dict[24100];????//存儲單詞表
??int?wordNum;
??unsigned?int?HashHeight[MaxHashLen],hashvalue;
??memset(HashHeight,0,sizeof(HashHeight));
??ReadDict(dict,wordNum);
????unsigned?int?startTime,curTime;
????startTime=clock();
????for(int?i=0;i<wordNum;i++)
????{
????????hashvalue?=?SDBMHash(?dict[i]?)?%?MaxHashLen;
//????????hashvalue?=?RSHash(?dict[i]?)?%?MaxHashLen;
//????????hashvalue?=?JSHash(?dict[i]?)?%?MaxHashLen;
//????????hashvalue?=?PJWHash(?dict[i]?)?%?MaxHashLen;
//????????hashvalue?=?ELFHash(?dict[i]?)?%?MaxHashLen;
//????????hashvalue?=?BKDRHash(?dict[i]?)?%?MaxHashLen;
//????????hashvalue?=?DJBHash(?dict[i]?)?%?MaxHashLen;
//????????hashvalue?=?APHash(?dict[i]?)?%?MaxHashLen;
????????HashHeight[?hashvalue?]++;
????????//cout<<hashvalue<<endl;
????}
????curTime=clock();
????Analise(HashHeight);
????cout<<"cost?time："<<(curTime-startTime)/1000?<<"ms"<<endl;
????//for(int?i=0;i<MaxHashLen;i++)
??????//??cout<<HashHeight[i]<<endl;
??return?0;
}

我的測試說明

測試的內容，是從英漢詞典Dict.txt讀取24018個單詞，將哈希值對49999求余，觀察Hash算法的散列情況

英漢詞典Dict.txt在這里下載：http://wenku.baidu.com/view/4ff90a9851e79b8968022695.html

測試結果如下：

方法	最高高度	平均查找次數
BKDRHash	6	1.2383
SDBMHash	5	1.2387
JSHash	6	1.2397
DJBHash	5	1.2403
RSHash	5	1.2417
PJWHash	5	1.2427
ELFHash	5	1.2427
APHash	6	1.2436

轉自：

http://apps.hi.baidu.com/share/detail/39630021

http://www.cnblogs.com/atlantis13579/archive/2010/02/06/1664792.html

轉載于:https://www.cnblogs.com/dongtian0359/archive/2011/07/10/2102192.html

總結

以上是生活随笔為你收集整理的hash编码的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

Hash

上一篇： Java2实用教程（第二版）程序代码——
下一篇：如何备份linux系统（转）