當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

kl距离 java_KL距离的计算

發布時間：2023/12/10 编程问答 36 豆豆

生活随笔收集整理的這篇文章主要介紹了「kl距离 java_KL距离的计算」，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

之前介紹過 Kullback-Leibler：KL距離是 Kullback-Leibler 差異（Kullback-Leibler Divergence）的簡稱，也叫做相對熵（Relative Entropy），對兩個離散分布 $P$、$Q$ 定義為 $D_{KL}(P\|Q)=\sum_i p_i \log\frac{p_i}{q_i}$。今天先用 Java 簡單地實現了兩段文字的 KL 距離計算，Java 代碼如下：

1 importjava.io.BufferedReader;2 importjava.io.FileInputStream;3 importjava.io.FileNotFoundException;4 importjava.io.IOException;5 importjava.io.InputStreamReader;6 importjava.util.ArrayList;7 importjava.util.regex.Matcher;8 importjava.util.regex.Pattern;9 importjeasy.analysis.MMAnalyzer;10

// NOTE(review): this listing was flattened by text extraction. The original
// source line numbers ("11", "12", ...) are fused into the code, spaces after
// keywords are missing (e.g. "classEntity", "newArrayList"), the ArrayList
// declarations carry no type arguments, and several loop headers stop at
// "for(int i=0;i". It will not compile as shown.
//
// Entity pairs a piece of text (word) with a float value (pValue) and hosts
// static helpers that read a file, split its text into tokens, build a list of
// Entity objects from those tokens, and accumulate p*log(p/q) over two lists.
11 public classEntity {12 String word;//stores the character

13 float pValue;//stores the probability value associated with that character

14 public Entity()//class constructor

15 {16 pValue=0;17 word="";18 }19

20 //read a file

// Reads the file at `path` as UTF-8, line by line, and returns all lines
// concatenated, each followed by the one-character separator literal below.
// The InputStreamReader is closed before the BufferedReader that wraps it, and
// neither close call runs if readLine throws.
21 public static String GetFileText(String path) throwsFileNotFoundException,IOException22 {23 InputStreamReader inStreamReader=new InputStreamReader(new FileInputStream(path),"UTF-8");24 //String strFile1=

25 BufferedReader bufReader=newBufferedReader(inStreamReader);26 String line;27 StringBuilder sb=newStringBuilder();28 while((line=bufReader.readLine())!=null)29 {30 sb.append(line+"　");31 }32 inStreamReader.close();33 bufReader.close();34 String strFile=sb.toString();35 returnstrFile;36 }37

38 //split into characters 39 //word segmentation

// Passes the file's text to MMAnalyzer.segment with "|" as the separator
// argument and returns its result. An IOException thrown by segment is only
// printed, in which case the method returns null.
40 public static String CutText(String path)throwsFileNotFoundException,IOException41 {42

43 String fileText=GetFileText(path);44 MMAnalyzer analyzer=newMMAnalyzer();45 String result =null;46 String spliter="|";47 try

48 {49 result =analyzer.segment(fileText, spliter);50 }51 catch(IOException e)52 {53 e.printStackTrace();54 }55 //System.out.print(result);

56 returnresult;57 }58 //split into single characters

// Finds every match of the character class [\u4E00-\u9FA5\uF900-\uFA2D] in the
// file's text and returns the matches concatenated, each followed by "|".
59 public static String CutTextSingleCharacter(String path)throwsFileNotFoundException,IOException60 { String text=GetFileText(path);61 String proText=null;62 Pattern pattern=Pattern.compile("[\\u4E00-\\u9FA5\\uF900-\\uFA2D]");63 Matcher m=pattern.matcher(text);64 StringBuffer sb=newStringBuffer();65 Boolean flag=m.find();66 while(flag)67 {68 int start=m.start();69 int end=m.end();70 sb.append(text.substring(start, end)+"|");71 //System.out.println(text.substring(start,end));

72 flag=m.find();73 }74 proText=sb.toString();75 returnproText;76 }77

78 //compute the probability of each character

// Splits the output of CutTextSingleCharacter on "|" and creates one Entity per
// piece (trimmed word, pValue 1); `total` is the size of that initial list.
// NOTE(review): the loops that follow are truncated in this listing (source
// lines 97-99, 102-113 and 118-124 are partly or wholly missing). The surviving
// fragments show entries with pValue<1.0 being removed while iterating
// backwards. Presumably duplicates are merged and each pValue is divided by
// `total` - this cannot be confirmed from the text shown here.
79 public static ArrayList CalcuP(String path) throwsIOException80 { //compute relative entropy per word 81 //String result=CutText(path);82 //compute relative entropy per single character

83 String result=CutTextSingleCharacter(path);84 String []words=result.split("\\|");85

86 ArrayList enList=newArrayList();87 for(String w: words)88 { w=w.trim();89 Entity en=newEntity();90 en.word=w;91 en.pValue=1;92 enList.add(en);93 //System.out.println(w);

94 }95

96 float total=enList.size();97 for(int i=0;i

100 if(!enList.get(i).word.isEmpty())101 {102 for(int j=i+1;j=0;i--)114 {115 if(enList.get(i).pValue<1.0)116 enList.remove(i);117 }118 for(int i=0;i

125 //compute relative entropy

126 /*computes the relative entropy between two texts*/

// Accumulates a sum into `kl`: when an entry of p and an entry of q have equal
// `word`, the term is p.pValue*log(p.pValue/q.pValue); when no match updates
// `accretion`, its initial value `infinity` (10000000) is added instead.
// NOTE(review): the outer loop header and whatever opens the second brace level
// (source lines 133-136) are missing from this listing, so the exact iteration
// bounds and any guard condition are unknown.
127 public static float CalKL(ArrayListp,ArrayListq)128 {129 float kl=0;130 float infinity=10000000;//stand-in for infinity

131 double accretion=infinity;//the entropy increment starts out as "infinity". 132 //look up in q the probability of the word that corresponds to the one in p; if found, update accretion and add it to the relative entropy; if not found, the increment stays at "infinity"

133 for(int i=0;i

=0;j--)137 {138 if(p.get(i).word.equals(q.get(j).word))139 { accretion=p.get(i).pValue*Math.log(p.get(i).pValue/q.get(j).pValue);140 //q.remove(j);

141 break;142 }143 }144 kl+=accretion;145 accretion=infinity;146 }147 }148 returnkl;149 }150

151 //result analysis 152 //main function

// Builds the Entity lists for three hard-coded text files with CalcuP and prints
// CalKL for all six ordered pairs of those lists.
153 public static void main(String[] args) throwsFileNotFoundException,154 IOException{155 //TODO Auto-generated method stub156 //TODO Auto-generated method stub;

157 ArrayList enList1=new ArrayList();158 enList1=CalcuP("D:/JavaDemo/KL資料/zhangailing.txt");159 ArrayList enList2=new ArrayList();160 enList2=CalcuP("D:/JavaDemo/KL資料/zhangailing2.txt");161 ArrayListenList3=new ArrayList();162 enList3=CalcuP("D:/JavaDemo/KL資料/maozedong.txt");163 double f1=CalKL(enList1,enList2);164 double f2=CalKL(enList2,enList1);165 double f3=CalKL(enList1,enList3);166 double f4=CalKL(enList3,enList1);167 double f5=CalKL(enList2,enList3);168 double f6=CalKL(enList3,enList2);169 System.out.println("《《小團圓》究竟泄了張愛玲什么“秘密”？》與《《小團圓》：張愛玲的一個夢》的KL距離： "+f1);170 System.out.println("《《小團圓》：張愛玲的一個夢》與《《小團圓》究竟泄了張愛玲什么“秘密”？》的KL距離: "+f2);171 System.out.println("《《小團圓》究竟泄了張愛玲什么“秘密”？》與《1945年毛和蔣介石在重慶談判前的秘密情報戰》的KL距離: "+f3);172 System.out.println("《1945年毛和蔣介石在重慶談判前的秘密情報戰》與《《小團圓》究竟泄了張愛玲什么“秘密”？》的KL距離: "+f4);173 System.out.println("《“小團圓”張愛玲的一個夢》與《1945年毛和蔣介石在重慶談判前的秘密情報戰》的KL距離: "+f5);174 System.out.println("《1945年毛和蔣介石在重慶談判前的秘密情報戰》與《“小團圓”張愛玲的一個夢》的KL距離: "+f6);175 }176 }

下面是結果：

其中第 9 行（import jeasy.analysis.MMAnalyzer;）和第 44 行（MMAnalyzer analyzer = new MMAnalyzer();）的代碼在 Eclipse 中會提示錯誤，暫時還沒有解決；不過這並不影響程序運行，運行時也不報錯。在網上查了一下，有人說是缺少導入包，但是我導入所需要的包之后還是顯示錯誤，可能是包的版本不對（我用的是 3.6.0 版本），這個問題還有待解決。代碼來源于 http://finallyliuyu.iteye.com/blog/609462 。

另外，關于 MMAnalyzer 中文分詞組件：這個組件支持英文、數字、中文（簡體）的混合分詞，支持常用的數量和人名的匹配，帶有整理過的超過 22 萬詞的詞庫，並實現了正向最大匹配算法。

有關MMAnalyzer的使用如下：

//采用正向最大匹配的中文分詞算法，相當于分詞粒度等于0

MMAnalyzer analyzer = new MMAnalyzer();

//參數為分詞粒度：當字數等于或超過該參數，且能成詞，該詞就被切分出來

MMAnalyzer analyzer = new MMAnalyzer(2);

//增加一個新詞典，采用每行一個詞的讀取方式

MMAnalyzer.addDictionary(reader);

//增加一個新詞

MMAnalyzer.addWord(newWord);

//刪除詞庫中的全部詞語(注意：非常危險的操作，在沒有加載新的詞庫前所有的分詞都將失效)

MMAnalyzer.clear();

//詞庫中是否包含該詞

MMAnalyzer.contains(String word);

//從詞庫中移除該詞

MMAnalyzer.removeWord(String word);

//當前詞庫中包含的詞語總數

MMAnalyzer.size();

另外，對于計算KL距離的matlab代碼如下：

% Compare density estimates of a two-Gaussian sample against the fitted
% mixture: draw 30 + 30 points, evaluate the mixture pdf, two kernel density
% estimates and a 20-bin histogram, then compute sum(p .* log(p ./ q)) for each
% estimate and show the value in the title of the corresponding figure.
% Requires normpdf and ksdensity (Statistics and Machine Learning Toolbox).
%
% NOTE(review): p_mix, p_mix_20 and the ksdensity outputs are density values
% sampled on a grid, while p_hist holds per-bin fractions; none is rescaled to
% sum to 1, so the sums below are unnormalised - confirm this is intended.
clearvars

% generate random data
class_a = randn(30,1);
class_b = 5 + randn(30,1);
x = [class_a; class_b];

% calculate the params for normpdf
mu_a = mean(class_a);
mu_b = mean(class_b);
sig_a = std(class_a);
sig_b = std(class_b);
testpoints = linspace(min(x), max(x));

% generate mix gaussians
p_mix = normpdf(testpoints,mu_a,sig_a)/2 + normpdf(testpoints,mu_b,sig_b)/2;

% calculate two kernel density
[p_ks_default,dum,width_default] = ksdensity(x,testpoints);
p_ks_half_default = ksdensity(x,testpoints,'bandwidth',width_default/2);

% calculate histogram probability
[c_hist,centers_hist] = hist(x,20);
p_hist = c_hist/numel(x);   % divide by the number of data instances (60 here)
p_hist = p_hist + 0.00001;  % avoid all the zeros

% we need to generate true distribution vector over 20 instances provided by histogram
p_mix_20 = normpdf(centers_hist,mu_a,sig_a)/2 + normpdf(centers_hist,mu_b,sig_b)/2;

kld_ks_default = sum(p_mix .* log(p_mix ./ p_ks_default));
kld_ks_half_default = sum(p_mix .* log(p_mix ./ p_ks_half_default));
kld_histo = sum(p_mix_20 .* log(p_mix_20 ./ p_hist));

figure
plot(testpoints,p_mix);
title('True distribution');

figure
plot(testpoints,p_ks_default);
% a space must separate the char literal from num2str inside the brackets
title(['Kernel density (default width), KLD = ' num2str(kld_ks_default)]);

figure
plot(testpoints,p_ks_half_default);
title(['Kernel density (half default width), KLD = ' num2str(kld_ks_half_default)]);

figure
hold on
hist(x,20);
%plot(centers_hist,p_mix_20);
%plot(centers_hist,p_hist);
title(['Histogram (20 bins), KLD = ' num2str(kld_histo)]);

效果如下：

真實分布是用normpdf計算出來的

Kernel Density, 默認寬度

Kernel Density, 默認寬度/2

Histogram

它們的原理是先生成兩個分布,并且生成它們的ksdensity和histogram, 最后計算ksdensity 和 histogram與真實分布的KL距離。

總結

以上是生活随笔為你收集整理的kl距离 java_KL距离的计算的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： CCIR601和CCIR656标准的区别
下一篇： python-pass