當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

聚类(Clustering):hierarchical clustering 层次聚类及其应用

發布時間：2024/7/5 编程问答 34 豆豆

生活随笔收集整理的這篇文章主要介紹了聚类(Clustering):hierarchical clustering 层次聚类及其应用，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

聚類(Clustering):hierarchical clustering 層次聚類及其應用

clustering實現：

import math

import numpy as np
from numpy import *  # noqa: F401,F403 - kept so code relying on the star import still works


class cluster_node:
    """One node of the hierarchical-clustering tree (dendrogram).

    Attributes:
        vec: numpy array holding the node's feature vector (one data row for
            a leaf, the averaged vector for a merged node).
        left: left child node, or None for a leaf.
        right: right child node, or None for a leaf.
        distance: distance between the two children that were merged to form
            this node (0.0 for a leaf).
        id: node identifier; ``hcluster`` gives leaves their row index (>= 0)
            and merged nodes negative ids.
        count: node counter, only meant for weighted averaging; ``hcluster``
            does not update it.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):
        self.vec = vec
        self.left = left
        self.right = right
        self.distance = distance
        self.id = id
        self.count = count  # only used for weighted average


def L2dist(v1, v2):
    """Return the Euclidean (straight-line) distance between two numpy vectors."""
    return math.sqrt(np.sum((v1 - v2) ** 2))


def L1dist(v1, v2):
    """Return the L1 (sum of absolute differences) distance between two numpy vectors."""
    return np.sum(np.abs(v1 - v2))


# def Chi2dist(v1, v2):
#     return sqrt(sum((v1 - v2) ** 2))


def hcluster(features, distance=L2dist):
    """Build a hierarchical-clustering tree by repeatedly merging the closest pair.

    Args:
        features: sequence of rows (e.g. a 2-D numpy array); each row becomes
            one leaf cluster whose id is its row index.
        distance: callable ``distance(v1, v2)`` used to compare two cluster
            vectors. Defaults to ``L2dist``.

    Returns:
        The root ``cluster_node`` of the tree. Merged nodes get ids -1, -2, ...
        in creation order, and their vector is the plain average of the two
        children.

    Raises:
        IndexError: if ``features`` is empty.
    """
    distances = {}  # cache of pairwise distances keyed by (id, id); not the `distance` callable
    currentid = -1

    # Clusters are initially just the individual rows.
    clust = [cluster_node(np.array(row), id=i) for i, row in enumerate(features)]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        # Loop through every pair looking for the smallest distance.
        # j starts at i + 1 so a cluster is never compared with itself
        # (distance 0 would otherwise always win and merge a node with itself).
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)
                d = distances[key]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)

        first = clust[lowestpair[0]]
        second = clust[lowestpair[1]]

        # The merged cluster is represented by the average of the two vectors.
        mergevec = [
            (first.vec[k] + second.vec[k]) / 2.0 for k in range(len(first.vec))
        ]

        newcluster = cluster_node(
            np.array(mergevec),
            left=first,
            right=second,
            distance=closest,
            id=currentid,
        )

        # Cluster ids that weren't in the original set are negative.
        currentid -= 1

        # Remove the higher index first so the lower index is still valid.
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]


def extract_cluster(clust, dist):
    """Return the list of sub-trees of ``clust`` whose merge distance is below ``dist``.

    The tree is walked top-down; the first node on each path with
    ``distance < dist`` is returned and its children are not visited.
    A leaf that does not satisfy ``distance < dist`` contributes nothing.
    """
    if clust.distance < dist:
        # We have found a cluster sub-tree.
        return [clust]

    # Otherwise check the left and right branches.
    cl = []
    cr = []
    if clust.left is not None:
        cl = extract_cluster(clust.left, dist=dist)
    if clust.right is not None:
        cr = extract_cluster(clust.right, dist=dist)
    return cl + cr


def get_cluster_elements(clust):
    """Return the list of leaf ids contained in the sub-tree rooted at ``clust``."""
    if clust.id >= 0:
        # A non-negative id means this is a leaf; wrap it in a list so the
        # results of the recursive calls can be concatenated.
        return [clust.id]

    # Otherwise collect the ids of the left and right branches.
    cl = []
    cr = []
    if clust.left is not None:
        cl = get_cluster_elements(clust.left)
    if clust.right is not None:
        cr = get_cluster_elements(clust.right)
    return cl + cr


def printclust(clust, labels=None, n=0):
    """Print the tree rooted at ``clust`` as an indented outline.

    Args:
        clust: root node of the (sub-)tree to print.
        labels: optional sequence indexed by leaf id; when given, the label is
            printed instead of the id.
        n: indentation depth (number of leading spaces) for this node.
    """
    # Indent to show the hierarchy level; stay on the same line.
    print(' ' * n, end='')
    if clust.id < 0:
        # Negative id means that this is a branch.
        print('-')
    else:
        # Non-negative id means that this is an endpoint.
        if labels is None:
            print(clust.id)
        else:
            print(labels[clust.id])

    if clust.left is not None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right is not None:
        printclust(clust.right, labels=labels, n=n + 1)


def getheight(clust):
    """Return the number of leaves below ``clust`` (1 for a leaf)."""
    # Is this an endpoint? Then the height is just 1.
    if clust.left is None and clust.right is None:
        return 1
    # Otherwise the height is the sum of the heights of each branch.
    return getheight(clust.left) + getheight(clust.right)


def getdepth(clust):
    """Return the accumulated merge distance from ``clust`` down to its deepest leaf."""
    # The distance of an endpoint is 0.
    if clust.left is None and clust.right is None:
        return 0
    # The distance of a branch is the greater of its two sides plus its own distance.
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance

clustering代碼應用:(借用鏈接：https://blog.csdn.net/weixin_41790863/article/details/81412564 )

from PIL import ImageDraw, Image
import numpy as np
import os
import sys

nodeList = []  # every node created so far: image leaves and merged clusters
distance = {}  # pairwise distances, keyed by (earlier_node, later_node) -> float


class node:
    """An image leaf or a merged cluster in the hierarchical-clustering tree.

    Creating a node has module-level side effects: the distance from the new
    node to every node already in ``nodeList`` is stored in ``distance``, and
    the node is appended to ``nodeList``.
    """

    def __init__(self, data):
        """Create a leaf from an image path, or a merged node from two children.

        Args:
            data: either a ``str`` (path of an image file; the node is an
                image leaf) or a pair ``(leftNode, rightNode)`` of existing
                nodes that this new node merges.
        """
        self.id = len(nodeList)  # position in nodeList; mostly useful when debugging
        self.parent = None  # set when this node is merged into a new node
        self.pos = None  # (x, y, w, h) used when drawing the tree

        if isinstance(data, str):
            # Image leaf.
            self.imgData = Image.open(data)
            self.left = None
            self.right = None
            self.level = 0  # all images sit on level 0 of the drawing
            # Flatten to (height * width, 3). Converting to RGB first keeps
            # the reshape valid for grayscale / RGBA / palette images.
            pixels = np.array(self.imgData.convert('RGB')).reshape(-1, 3)
            self.feature = pixels.mean(axis=0)  # mean of each colour channel
        else:
            # Merged node.
            self.imgData = None
            self.left = data[0]
            self.right = data[1]
            self.left.parent = self
            self.right.parent = self
            # One level above the higher of the two children.
            self.level = max(self.left.level, self.right.level) + 1
            # The merged feature is the plain average of the two children.
            self.feature = (self.left.feature + self.right.feature) / 2

        # Record the distance from this node to every existing node.
        for other in nodeList:
            distance[(other, self)] = np.sqrt(np.sum((other.feature - self.feature) ** 2))
        nodeList.append(self)

    def drawNode(self, img, draw, vLineLenght):
        """Draw this node onto ``img`` using ``draw``; does nothing if ``pos`` is unset.

        Args:
            img: the PIL image the thumbnails are pasted into.
            draw: the ``ImageDraw`` object used for the connecting lines.
            vLineLenght: vertical distance between two levels, in pixels.
        """
        if self.pos is None:
            return

        x, y, w, h = self.pos
        mid_x = int(x + w / 2)
        red = (255, 0, 0)

        if self.left is None:
            # Image leaf: paste the thumbnail and draw the stem above it.
            self.imgData.thumbnail((w, h))  # shrinks the image in place
            img.paste(self.imgData, (x, y))
            draw.line((mid_x, y - vLineLenght, mid_x, y), fill=red)
        else:
            # Merged node: horizontal bar joining the children, then the stem.
            draw.line((int(x), y, int(x + w), y), fill=red)
            draw.line((mid_x, y, mid_x, y - h), fill=red)


def loadImg(path):
    """Create an image node for every ``.jpg`` in ``path``.

    ``hierarchicalResult.jpg`` (any letter case) is skipped because it is the
    file this script generates.

    Args:
        path: directory containing the images.

    Returns:
        The path of the result file inside ``path``, or None when the
        directory could not be listed.
    """
    try:
        files = os.listdir(path)
    except OSError:
        print('未正確讀取目錄：' + path + ',圖片目錄，請根據自己存的地方改寫,并保證沒有hierarchicalResult.jpg,該文件為最后生成文件')
        return None

    for name in files:
        stem, ext = os.path.splitext(name)
        if ext.lower() == '.jpg' and stem.lower() != 'hierarchicalresult':
            node(os.path.join(path, name))
    return os.path.join(path, 'hierarchicalResult.jpg')


def getMinDistance():
    """Return the closest pair ``(a, b)`` of nodes that have not been merged yet.

    Raises:
        ValueError: if fewer than two unmerged nodes exist.
    """
    candidates = [
        pair for pair in distance
        if pair[0].parent is None and pair[1].parent is None
    ]
    # min() keeps the first pair on ties, like a strict '<' scan would.
    return min(candidates, key=lambda pair: distance[pair])


def createTree():
    """Merge the closest unmerged nodes until one root remains, and return it.

    Raises:
        IndexError: if ``nodeList`` is empty.
    """
    # Keep merging while at least two nodes have no parent.
    while sum(1 for item in nodeList if item.parent is None) > 1:
        minDist = getMinDistance()
        # The later-created node ([1]) becomes the left child because of how
        # the tree is drawn.
        node((minDist[1], minDist[0]))
    return nodeList[-1]  # the last inserted node is the root


def run(result_path=None):
    """Cluster the loaded images, draw the tree and save it.

    Args:
        result_path: where to save the picture. When None, the module-level
            ``resultFile`` set by the script entry point is used.
    """
    if result_path is None:
        result_path = resultFile

    root = createTree()

    def post_order(item):
        # Left subtree, right subtree, then the node itself.
        ordered = []
        if item.left is not None:
            ordered += post_order(item.left)
        if item.right is not None:
            ordered += post_order(item.right)
        ordered.append(item)
        return ordered

    # Nodes without a left child are the images; they form the bottom row.
    leaves = [item for item in post_order(root) if item.left is None]

    thumbSize = 60  # thumbnails are fitted into a 60x60 cell
    thumbSpace = 20  # gap between thumbnails
    vLineLenght = 80  # vertical distance between two levels

    imgWidth = len(leaves) * (thumbSize + thumbSpace)
    imgHeight = (root.level + 1) * vLineLenght + thumbSize + thumbSpace * 2
    img = Image.new('RGB', (imgWidth, imgHeight), (255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Position and draw every image.
    for index, leaf in enumerate(leaves):
        # A lone image has no parent; treat it as hanging from level 1.
        parent_level = leaf.parent.level if leaf.parent is not None else 1
        x = index * (thumbSize + thumbSpace) + thumbSpace / 2
        y = imgHeight - thumbSize - thumbSpace / 2 - ((parent_level - 1) * vLineLenght)
        w = leaf.imgData.width
        h = leaf.imgData.height
        # Scale the longer side to thumbSize, keeping the aspect ratio.
        if w > h:
            h = h / w * thumbSize
            w = thumbSize
        else:
            w = w / h * thumbSize
            h = thumbSize
        x += (thumbSize - w) / 2  # centre horizontally inside the cell
        leaf.pos = (int(x), int(y), int(w), int(h))
        leaf.drawNode(img, draw, vLineLenght)

    # Position and draw the merged nodes level by level, so that both
    # children already have a position.
    for level in range(1, root.level + 1):
        for item in [candidate for candidate in nodeList if candidate.level == level]:
            x = item.left.pos[0] + (item.left.pos[2] / 2)
            w = item.right.pos[0] + (item.right.pos[2] / 2) - x
            y = item.left.pos[1] - (item.level - item.left.level) * vLineLenght
            upper_level = item.parent.level if item.parent is not None else item.level + 1
            h = (upper_level - item.level) * vLineLenght
            item.pos = (int(x), int(y), int(w), int(h))
            item.drawNode(img, draw, vLineLenght)

    img.save(result_path)


if __name__ == "__main__":
    # Load the images; returns the file name the result will be saved under.
    # Change the directory to wherever the images are stored.
    resultFile = loadImg(r"G:\Pythonnotes\test\HierarchicalClusterDataset")
    # loadImg returns None (not the string 'None') when the directory is unreadable.
    if resultFile is not None:
        if nodeList:
            run(resultFile)
            print("結構圖生成成功，最終結構圖存儲于：" + resultFile)
        else:
            print("目錄中沒有找到 .jpg 圖片，未生成結構圖")

總結

以上是生活随笔為你收集整理的聚类(Clustering):hierarchical clustering 层次聚类及其应用的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：智慧交通day02-车流量检测实现04：
下一篇： ubyntu 链接mysql_ubunt