java实现Fmeasure计算_聚类结果的评估指标及其JAVA实现
一. 前言
又GET了一項(xiàng)技能。在做聚類算法的時候,由于要評估所提出的聚類算法的好壞,于是需要與一些已知的算法對比,或者用一些人工標(biāo)注的標(biāo)簽來比較,于是用到了聚類結(jié)果的評估指標(biāo)。我了解了以下幾項(xiàng)。
TP:是指被聚在一類的兩個量被正確的分類了(即在標(biāo)準(zhǔn)標(biāo)注里屬于一類的兩個對象被聚在一類)
TN:是指不應(yīng)該被聚在一類的兩個對象被正確地分開了(即在標(biāo)準(zhǔn)標(biāo)注里不是一類的兩個對象在待測結(jié)果也沒聚在一類)
FP:指不應(yīng)該放在一類的對象被錯誤的放在了一類。(即在標(biāo)準(zhǔn)標(biāo)注里不是一類,但在待測結(jié)果里聚在一類)
FN:指不應(yīng)該分開的對象被錯誤的分開了。(即在標(biāo)準(zhǔn)標(biāo)注里是一類,但在待測結(jié)果里沒聚在一類)
P = TP + FP
N = TN + FN
1.準(zhǔn)確率、識別率:(rank Index) ?RI
accuracy = (TP + TN)/(P + N)
2.錯誤率、誤分類率
error rate = (FP + FN)/(P + N)
3.敏感度
sensitivity = TP / P
4.特效性
specificity = TN / N
5.精度
precision = TP ?/ ? (TP + FP)
6.召回率
recall ?= ?TP ?/ ? (TP ?+ FN)
7.RI ?其實(shí)就是 ?1 ?的 accuracy
8.F度量
P為precision
R為recall
9.NMI(normalized mutual information)
10 Jaccard
J = TP ?/ (TP + F)
二、JAVA實(shí)現(xiàn)(未優(yōu)化)
其中很多重復(fù)代碼,還沒有優(yōu)化。。。
package others;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import javax.rmi.CORBA.Util;
import org.graphstream.algorithm.measure.NormalizedMutualInformation;
/*function:常用的聚類評價指標(biāo)有purity, precision, recall, RI 和 F-score,jaccard
* @param:
* @author:Wenbao Li
* @Data:2015-07-13
*/
public class ClusterEvaluation {
public static void main(String[] args){
int[] A = {1,3,3,3,3,3,3,2,1,0,2,0,2,0,2,1,1,0,1,1};
int[] B = {2,2,0,0,0,3,2,2,3,1,3,1,0,1,2,1,0,1,3,3};
double purity = Purity(A,B);
System.out.println("purity\t\t"+purity);
System.out.println("Pre\t\t"+Precision(A,B));
System.out.println("Recall\t\t"+Recall(A,B));
System.out.println("RI(Accuracy)\t\t"+RI(A,B));
System.out.println("Fvalue\t\t"+F_score(A,B));
System.out.println("NMI\t\t"+NMI(A,B));
}
/*
* 計算一個聚類結(jié)果的簇的個數(shù),以及每一簇中的對象個數(shù),
*/
public static Map> clusterDistri(int[] A){
Map> clusterD = new HashMap>();
int max = -1;
for(int i = 0;i< A.length;i++){
if(max < A[i]){
max = A[i];
}
}
for(int i = 0;i< A.length;i++){
int temp = A[i];
if(temp < max+1){
if(clusterD.containsKey(temp)){
Set set = clusterD.get(temp);
set.add(i+1);
clusterD.put(temp, set);
}else{
Set set = new HashSet();
set.add(i+1);
clusterD.put(temp, set);
}
}
}
return clusterD;
}
public static double ClusEvaluate(String method,int[] A,int[] B){
switch(method){
case "Purity":
return Purity(A,B);
case "Precision":
return Precision(A,B);
case "Recall":
return Recall(A,B);
case "RI":
return RI(A,B);
case "F_score":
return F_score(A,B);
case "NMI":
return NMI(A,B);
case "Jaccard":
return Jaccard(A,B);
default:
return -1.0;
}
}
public static int[] commNum(Map> A,Map> B){
int[] commonNo = new int[A.size()];
int com = 0;
Iterator>> itA = A.entrySet().iterator();
int i = 0;
while(itA.hasNext()){
Entry> entryA = itA.next();
Set setA = entryA.getValue();
Iterator>> itB = B.entrySet().iterator();
int maxComm = -1;
while(itB.hasNext()){
Entry> entryB = itB.next();
Set setB = entryB.getValue();
int lengthA = setA.size();
Set temp = new HashSet(setA);
temp.removeAll(setB);
int lengthCom = lengthA - temp.size();
if(maxComm < lengthCom){
maxComm = lengthCom;
}
}
commonNo[i] = maxComm;
com = com + maxComm;
i++;
}
return commonNo;
}
/*
* 所有簇分配正確的除以總的。其中B是對比的標(biāo)準(zhǔn)標(biāo)簽。
*/
public static double Purity(int[] A,int[] B){
double value;
Map> clusterA = clusterDistri(A);
Map> clusterB = clusterDistri(B);
int[] commonNo = commNum(clusterA,clusterB);
int com = 0;
for(int i = 0;i
com = com + commonNo[i];
}
value = com*1.0/A.length;
return value;
}
/*
* @param A,B
* @return 精度
*/
public static double Precision(int[] A,int[] B){
double value = 0.0;
Map> clusterA = clusterDistri(A);//得到聚類結(jié)果A的類分布
Map> clusterB = clusterDistri(B);//得到聚類B(標(biāo)準(zhǔn))的類分布
int[] commonNo = commNum(clusterA,clusterB);//得到A中每個簇中聚類正確的數(shù)目。
int allP = 0;
int TP = 0;
int FP = 0;
int TN = 0;
int FN = 0;
Iterator>> itA = clusterA.entrySet().iterator();
int i = 0;
while(itA.hasNext()){
Entry> entryA = itA.next();
allP = allP + combination(entryA.getValue().size(),2);
TP = TP + combination(commonNo[i],2);
i++;
}
FP = allP - TP;
itA = clusterA.entrySet().iterator();
while(itA.hasNext()){
Entry> entryA = itA.next();
Iterator>> itA2 = clusterA.entrySet().iterator();
while(itA2.hasNext()){
Entry> entryA2 = itA2.next();
if(entryA != entryA2){
Set s1 = entryA.getValue();
Set s2 = entryA2.getValue();
for(Integer i1 :s1){
for(Integer i2:s2){
if(B[i1-1] != B[i2-1]){
TN++;
}else{
FN++;
}
}
}
}
}
}
double P = TP*1.0/(TP + FP);
return P;
}
/*
* @param A,B
* @return recal召回率
*/
public static double Recall(int[] A,int[] B){
double value = 0.0;
Map> clusterA = clusterDistri(A);//得到聚類結(jié)果A的類分布
Map> clusterB = clusterDistri(B);//得到聚類B(標(biāo)準(zhǔn))的類分布
int[] commonNo = commNum(clusterA,clusterB);//得到A中每個簇中聚類正確的數(shù)目。
int allP = 0;
int TP = 0;
int FP = 0;
int TN = 0;
int FN = 0;
Iterator>> itA = clusterA.entrySet().iterator();
int i = 0;
while(itA.hasNext()){
Entry> entryA = itA.next();
allP = allP + combination(entryA.getValue().size(),2);
TP = TP + combination(commonNo[i],2);
i++;
}
FP = allP - TP;
itA = clusterA.entrySet().iterator();
while(itA.hasNext()){
Entry> entryA = itA.next();
Iterator>> itA2 = clusterA.entrySet().iterator();
while(itA2.hasNext()){
Entry> entryA2 = itA2.next();
if(entryA != entryA2){
Set s1 = entryA.getValue();
Set s2 = entryA2.getValue();
for(Integer i1 :s1){
for(Integer i2:s2){
if(B[i1-1] != B[i2-1]){
TN++;
}else{
FN++;
}
}
}
}
}
}
double R = TP * 1.0/(TP + FN);
return R;
}
/*
* @param A,B
* @return RankIndex
*/
public static double RI(int[] A,int[] B){
double value = 0.0;
Map> clusterA = clusterDistri(A);//得到聚類結(jié)果A的類分布
Map> clusterB = clusterDistri(B);//得到聚類B(標(biāo)準(zhǔn))的類分布
int[] commonNo = commNum(clusterA,clusterB);//得到A中每個簇中聚類正確的數(shù)目。
int P = 0;
int TP = 0;
int FP = 0;
int TN = 0;
int FN = 0;
Iterator>> itA = clusterA.entrySet().iterator();
int i = 0;
while(itA.hasNext()){
Entry> entryA = itA.next();
P = P + combination(entryA.getValue().size(),2);
TP = TP + combination(commonNo[i],2);
i++;
}
FP = P - TP;
itA = clusterA.entrySet().iterator();
while(itA.hasNext()){
Entry> entryA = itA.next();
Iterator>> itA2 = clusterA.entrySet().iterator();
while(itA2.hasNext()){
Entry> entryA2 = itA2.next();
if(entryA != entryA2){
Set s1 = entryA.getValue();
Set s2 = entryA2.getValue();
for(Integer i1 :s1){
for(Integer i2:s2){
if(B[i1-1] != B[i2-1]){
TN++;
}else{
FN++;
}
}
}
}
}
}
value = (TP + TN)*1.0/(TP + FP + FN + TN);
return value;
}
/*
* F值,是對精度和召回率的平衡,
* @param A:評估對象。B:評估標(biāo)準(zhǔn);beta:均衡參數(shù)
* @return F值
*/
public static double F_score(int[] A,int[] B){
double beta = 1.0;
double value = 0.0;
Map> clusterA = clusterDistri(A);//得到聚類結(jié)果A的類分布
Map> clusterB = clusterDistri(B);//得到聚類B(標(biāo)準(zhǔn))的類分布
int[] commonNo = commNum(clusterA,clusterB);//得到A中每個簇中聚類正確的數(shù)目。
int allP = 0;
int TP = 0;
int FP = 0;
int TN = 0;
int FN = 0;
Iterator>> itA = clusterA.entrySet().iterator();
int i = 0;
while(itA.hasNext()){
Entry> entryA = itA.next();
allP = allP + combination(entryA.getValue().size(),2);
TP = TP + combination(commonNo[i],2);
i++;
}
FP = allP - TP;
itA = clusterA.entrySet().iterator();
while(itA.hasNext()){
Entry> entryA = itA.next();
Iterator>> itA2 = clusterA.entrySet().iterator();
while(itA2.hasNext()){
Entry> entryA2 = itA2.next();
if(entryA != entryA2){
Set s1 = entryA.getValue();
Set s2 = entryA2.getValue();
for(Integer i1 :s1){
for(Integer i2:s2){
if(B[i1-1] != B[i2-1]){
TN++;
}else{
FN++;
}
}
}
}
}
}
double P = TP*1.0/(TP + FP);
double R = TP * 1.0/(TP + FN);
value = (beta*beta + 1)*P * R/(beta*beta*P + R);
return value;
}
public static double Jaccard(int[] A,int[] B){
double value = 0.0;
Map> clusterA = clusterDistri(A);//得到聚類結(jié)果A的類分布
Map> clusterB = clusterDistri(B);//得到聚類B(標(biāo)準(zhǔn))的類分布
int[] commonNo = commNum(clusterA,clusterB);//得到A中每個簇中聚類正確的數(shù)目。
int allP = 0;
int TP = 0;
int FP = 0;
int TN = 0;
int FN = 0;
Iterator>> itA = clusterA.entrySet().iterator();
int i = 0;
while(itA.hasNext()){
Entry> entryA = itA.next();
allP = allP + combination(entryA.getValue().size(),2);
TP = TP + combination(commonNo[i],2);
i++;
}
FP = allP - TP;
itA = clusterA.entrySet().iterator();
while(itA.hasNext()){
Entry> entryA = itA.next();
Iterator>> itA2 = clusterA.entrySet().iterator();
while(itA2.hasNext()){
Entry> entryA2 = itA2.next();
if(entryA != entryA2){
Set s1 = entryA.getValue();
Set s2 = entryA2.getValue();
for(Integer i1 :s1){
for(Integer i2:s2){
if(B[i1-1] != B[i2-1]){
TN++;
}else{
FN++;
}
}
}
}
}
}
value = TP * 1.0 / (TP + FP + FN);
return value;
}
public static double NMI(int[] A,int[] B){
Map> clusterA = clusterDistri(A);//得到聚類結(jié)果A的類分布
Map> clusterB = clusterDistri(B);//得到聚類B(標(biāo)準(zhǔn))的類分布
Iterator>> itA = clusterA.entrySet().iterator();
Iterator>> itB = clusterB.entrySet().iterator();
Set> partitionF = new HashSet>();
Set> partitionR = new HashSet>();
int nodeCount = B.length;
while(itA.hasNext()){
Entry> entryA = itA.next();
Set setA = entryA.getValue();
partitionF.add(setA);
setA = null;
entryA = null;
}
while(itB.hasNext()){
Entry> entryB = itB.next();
Set setB = entryB.getValue();
partitionR.add(setB);
setB = null;
entryB = null;
}
return computeNMI(partitionF,partitionR,nodeCount);
}
public static double computeNMI(Set> partitionF,
Set> partitionR,int nodeCount) {
int[][] XY = new int[partitionR.size()][partitionF.size()];
int[] X = new int[partitionR.size()];
int[] Y = new int[partitionF.size()];
int i = 0;
int j = 0;
for (Set com1 : partitionR) {
j = 0;
for (Set com2 : partitionF) {
XY[i][j] = intersect(com1, com2);//待測結(jié)果第i個簇和標(biāo)準(zhǔn)結(jié)果第j個簇的共有元素個數(shù)
X[i] += XY[i][j];//待測結(jié)果第i個簇與所有標(biāo)準(zhǔn)結(jié)果簇的公共元素個數(shù)(感覺就是第i個簇的元素個數(shù))
Y[j] += XY[i][j];//標(biāo)準(zhǔn)結(jié)果簇第j個簇的元素個數(shù)()
j++;
}
i++;
}
int N = nodeCount;
double Ixy = 0;
double Ixy2 = 0;
for (i = 0; i < partitionR.size(); i++) {
for (j = 0; j < partitionF.size(); j++) {
if (XY[i][j] > 0) {
Ixy += ((double) XY[i][j] / N)
* (Math.log((double) XY[i][j] * N / (X[i] * Y[j])) / Math
.log(2.0));
//Ixy2 = (float) (Ixy2 + -2.0D * XY[i][j]
//* Math.log(XY[i][j] * N / X[i] * Y[j]));
}
}
}
//System.out.println(Ixy2);
//double denom = 0.0F;
//for (int ii = 0; ii < X.length; ++ii)
//denom = (double) (denom + X[ii] * Math.log(X[ii] / N));
//for (int jj = 0; jj < Y.length; ++jj) {
//denom = (double) (denom + Y[jj] * Math.log(Y[jj] / N));
//}
//
//System.out.println(denom);
//double M = (Ixy / denom);
//
//return M;
double Hx = 0;
double Hy = 0;
for (i = 0; i < partitionR.size(); i++) {
if (X[i] > 0)
Hx += h((double) X[i] / N);
}
for (j = 0; j < partitionF.size(); j++) {
if (Y[j] > 0)
Hy += h((double) Y[j] / N);
}
double InormXY = Ixy / Math.sqrt(Hx * Hy);
return InormXY;
}
private static double h(double p) {
return -p * (Math.log(p) / Math.log(2.0));
}
/*
* 兩個集合的公共元素個數(shù)
*/
private static int intersect(Set com1, Set com2) {
int num = 0;
for (Integer v1 : com1) {
if (com2.contains(v1))
num++;
}
return num;
}
/*
* C(m,n)=m取n
*/
public static int combination(int m,int n){
int result = 1;
if(m < n){
return -1;
}
result = factorial(m)/(factorial(n)*factorial(m-n));
return result;
}
public static int factorial(int m){
if((m == 1) || (m == 0)){
return 1;
}else if(m < 0){
return -1;
}else{
return m*factorial(m-1);
}
}
}
總結(jié)
以上是生活随笔為你收集整理的java实现Fmeasure计算_聚类结果的评估指标及其JAVA实现的全部內(nèi)容,希望文章能夠幫你解決所遇到的問題。