聚类(Clustering):hierarchical clustering 层次聚类及其应用
生活随笔
收集整理的這篇文章主要介紹了
聚类(Clustering):hierarchical clustering 层次聚类及其应用
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
聚類(Clustering):hierarchical clustering 層次聚類及其應用
clustering實現:
import math

import numpy as np

# NOTE: numpy is imported under the ``np`` alias instead of ``from numpy
# import *`` so that numpy functions (e.g. ``sum``) do not shadow the Python
# builtins of the same name for the rest of the file.


class cluster_node:
    """One node of the binary tree (dendrogram) built by ``hcluster``.

    Attributes:
        vec: feature vector of the node (a numpy array when created by
            ``hcluster``).
        left: left child node, or ``None`` for a leaf.
        right: right child node, or ``None`` for a leaf.
        distance: distance between the two children at the time they were
            merged; ``0.0`` for a leaf.
        id: node identifier. ``hcluster`` gives leaves the row index of the
            input (>= 0) and merged nodes negative ids (-1, -2, ...).
        count: node counter; only meant for weighted averages and not read
            by any function in this script.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):
        self.vec = vec
        self.left = left
        self.right = right
        self.distance = distance
        self.id = id
        self.count = count  # only used for weighted average


def L2dist(v1, v2):
    """Return the Euclidean (straight-line) distance between two vectors."""
    # Cast to float so unsigned-integer inputs cannot wrap around on subtraction.
    diff = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    return math.sqrt(np.sum(diff ** 2))


def L1dist(v1, v2):
    """Return the sum of absolute coordinate differences between two vectors."""
    diff = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    return float(np.sum(np.abs(diff)))


def hcluster(features, distance=L2dist):
    """Build a hierarchical-clustering tree by repeatedly merging the closest pair.

    Args:
        features: sequence of feature vectors (e.g. a 2-D numpy array); every
            row starts out as its own cluster.
        distance: callable ``distance(v1, v2)`` returning a number; defaults
            to ``L2dist``.

    Returns:
        The root ``cluster_node`` of the tree. With a single input row the
        root is that row's leaf node.

    Raises:
        ValueError: if ``features`` is empty.
    """
    distances = {}  # cache of pairwise distances, keyed by (id_a, id_b)
    currentid = -1  # ids of merged clusters are negative and count downwards

    # Clusters are initially just the individual rows.
    clust = [cluster_node(np.array(features[i]), id=i) for i in range(len(features))]
    if not clust:
        raise ValueError("hcluster() needs at least one feature vector")

    # Keep merging until a single cluster (the root) is left.
    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        # Look at every unordered pair once. j starts at i + 1 so a cluster is
        # never compared with itself (that distance is 0 and would always win).
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)
                d = distances[key]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)

        first, second = lowestpair  # first < second by construction
        left_node = clust[first]
        right_node = clust[second]

        # The merged cluster sits at the plain average of its two children.
        mergevec = (
            np.asarray(left_node.vec, dtype=float)
            + np.asarray(right_node.vec, dtype=float)
        ) / 2.0

        newcluster = cluster_node(
            np.array(mergevec),
            left=left_node,
            right=right_node,
            distance=closest,
            id=currentid,
        )
        currentid -= 1

        # Delete the higher index first so the lower index is still valid.
        del clust[second]
        del clust[first]
        clust.append(newcluster)

    return clust[0]


def extract_cluster(clust, dist):
    """Return the sub-trees of ``clust`` whose merge distance is below ``dist``.

    The tree is descended from ``clust``; a node whose ``distance`` is smaller
    than ``dist`` is returned as one cluster, otherwise its children are
    examined. The result is a list of ``cluster_node`` objects.
    """
    if clust.distance < dist:
        # We have found a cluster sub-tree.
        return [clust]

    # Otherwise check the left and right branches.
    cl = []
    cr = []
    if clust.left is not None:
        cl = extract_cluster(clust.left, dist=dist)
    if clust.right is not None:
        cr = extract_cluster(clust.right, dist=dist)
    return cl + cr


def get_cluster_elements(clust):
    """Return the list of leaf ids contained in the sub-tree ``clust``."""
    if clust.id >= 0:
        # A non-negative id means this is a leaf; wrap it in a list so the
        # results of the two branches below can be concatenated.
        return [clust.id]

    cl = []
    cr = []
    if clust.left is not None:
        cl = get_cluster_elements(clust.left)
    if clust.right is not None:
        cr = get_cluster_elements(clust.right)
    return cl + cr


def printclust(clust, labels=None, n=0):
    """Print the tree as indented text, one node per line.

    Args:
        clust: root of the (sub-)tree to print.
        labels: optional sequence indexed by leaf id; when given, the label
            is printed instead of the id.
        n: current nesting depth, used for indentation.
    """
    indent = '  ' * n
    if clust.id < 0:
        # A negative id means this is a branch.
        print(indent + '-')
    elif labels is None:
        print(indent + str(clust.id))
    else:
        print(indent + str(labels[clust.id]))

    if clust.left is not None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right is not None:
        printclust(clust.right, labels=labels, n=n + 1)


def getheight(clust):
    """Return the number of leaves below ``clust`` (1 for a leaf)."""
    if clust.left is None and clust.right is None:
        return 1
    # Otherwise the height is the sum of the heights of both branches.
    return getheight(clust.left) + getheight(clust.right)


def getdepth(clust):
    """Return the largest accumulated merge distance from ``clust`` down to a leaf."""
    if clust.left is None and clust.right is None:
        # The distance of an endpoint is 0.
        return 0
    # The depth of a branch is the greater of its two sides plus its own distance.
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance


# Applying the clustering code (adapted from:
# https://blog.csdn.net/weixin_41790863/article/details/81412564 )
from PIL import ImageDraw, Image
import numpy as np
import os
import sys

nodeList = []  # every node created so far: image leaves and merged clusters
distance = {}  # pairwise distances, keyed by (earlier_node, later_node) tuples of node objects


class node:
    """A tree node: either a single image or the merge of two earlier nodes.

    Creating a node registers it in the module-level ``nodeList`` and stores
    its distance to every previously created node in ``distance``.
    """

    def __init__(self, data):
        """Create a leaf or a merged node.

        Args:
            data: either a string with the path of an image file (the node is
                then an image leaf), or a ``(leftNode, rightNode)`` pair (the
                node is then the parent of those two nodes).
        """
        self.id = len(nodeList)  # position in nodeList; only useful for debugging
        self.parent = None  # set when this node gets merged into a new node
        self.pos = None  # (x, y, w, h) used when drawing; filled in by run()
        if isinstance(data, str):
            # Image leaf.
            self.imgData = Image.open(data)
            self.left = None
            self.right = None
            self.level = 0  # all images are at level 0 of the drawing
            # Convert to RGB first so that grayscale / CMYK files also give
            # exactly three channels, then flatten to (height * width, 3).
            pixels = np.array(self.imgData.convert('RGB')).reshape(-1, 3)
            self.feature = pixels.mean(axis=0)  # mean of the R, G and B channels
        else:
            # Merged node.
            self.imgData = None
            self.left = data[0]
            self.right = data[1]
            self.left.parent = self
            self.right.parent = self
            # One level above the higher of the two children.
            self.level = max(self.left.level, self.right.level) + 1
            # The merged feature is the plain average of both children.
            self.feature = (self.left.feature + self.right.feature) / 2
        # Distance from every existing node to this one (Euclidean on features).
        for x in nodeList:
            distance[(x, self)] = np.sqrt(np.sum((x.feature - self.feature) ** 2))
        nodeList.append(self)

    def drawNode(self, img, draw, vLineLenght):
        """Draw this node onto ``img`` using ``self.pos``; do nothing if unset.

        Args:
            img: the PIL image that thumbnails are pasted into.
            draw: the ``ImageDraw`` object used for the connecting lines.
            vLineLenght: length of the vertical line drawn above an image.
        """
        if self.pos is None:
            return
        left, top, width, height = self.pos
        middle = int(left + width / 2)
        red = (255, 0, 0)
        if self.left is None:
            # Image leaf: paste the thumbnail and draw the line going up from it.
            self.imgData.thumbnail((width, height))  # shrinks the image in place
            img.paste(self.imgData, (left, top))
            draw.line((middle, top - vLineLenght, middle, top), fill=red)
        else:
            # Merged node: horizontal bar joining the children, then a line up.
            draw.line((int(left), top, int(left + width), top), fill=red)
            draw.line((middle, top, middle, top - height), fill=red)


def loadImg(path):
    """Create an image node for every ``.jpg`` in ``path``.

    A file named ``hierarchicalResult.jpg`` (any letter case) is skipped
    because that is the output file of this script.

    Args:
        path: directory containing the images.

    Returns:
        The path of the result file inside ``path``, or ``None`` when the
        directory could not be listed.
    """
    try:
        files = os.listdir(path)
    except OSError:
        print('未正確讀取目錄:' + path + ',圖片目錄,請根據自己存的地方改寫,并保證沒有hierarchicalResult.jpg,該文件為最后生成文件')
        return None
    for i in files:
        stem, extension = os.path.splitext(i)
        if extension.lower() == '.jpg' and stem.lower() != 'hierarchicalresult':
            node(os.path.join(path, i))
    return os.path.join(path, 'hierarchicalResult.jpg')


def getMinDistance():
    """Return the closest pair among nodes that have not been merged yet.

    Returns:
        The ``(node_a, node_b)`` key of ``distance`` with the smallest value
        among pairs where neither node has a parent.
    """
    candidates = [
        pair for pair in distance
        if pair[0].parent is None and pair[1].parent is None
    ]
    return min(candidates, key=distance.get)


def createTree():
    """Merge the closest unmerged pair until one root is left; return the root."""
    while sum(1 for item in nodeList if item.parent is None) > 1:
        minDist = getMinDistance()
        # The later-created node of the pair becomes the left child; the
        # drawing code in run() relies on this order.
        node((minDist[1], minDist[0]))
    return nodeList[-1]  # the most recently created node is the root


def _postorder(tree):
    """Return the nodes of ``tree`` in left, right, root order."""
    ordered = []
    if tree.left is not None:
        ordered += _postorder(tree.left)
    if tree.right is not None:
        ordered += _postorder(tree.right)
    ordered.append(tree)
    return ordered


def run():
    """Build the tree, draw it and save the picture to the global ``resultFile``."""
    root = createTree()
    # Nodes without a left child are the images; traversal order gives their
    # left-to-right order in the bottom row.
    leaves = [item for item in _postorder(root) if item.left is None]

    thumbSize = 60  # thumbnails are scaled to fit a 60x60 cell
    thumbSpace = 20  # gap between thumbnails
    vLineLenght = 80  # vertical distance between two levels
    imgWidth = len(leaves) * (thumbSize + thumbSpace)
    imgHeight = (root.level + 1) * vLineLenght + thumbSize + thumbSpace * 2
    img = Image.new('RGB', (imgWidth, imgHeight), (255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Position and draw every image.
    for index, leaf in enumerate(leaves):
        # A lone image has no parent; treat it as if its parent were at level 1.
        parentLevel = leaf.parent.level if leaf.parent is not None else 1
        x = index * (thumbSize + thumbSpace) + thumbSpace / 2
        y = imgHeight - thumbSize - thumbSpace / 2 - ((parentLevel - 1) * vLineLenght)
        w = leaf.imgData.width
        h = leaf.imgData.height
        if w > h:
            h = h / w * thumbSize
            w = thumbSize
        else:
            w = w / h * thumbSize
            h = thumbSize
        # Centre narrow thumbnails in their cell (adds 0 when w == thumbSize).
        x += (thumbSize - w) / 2
        leaf.pos = (int(x), int(y), int(w), int(h))
        leaf.drawNode(img, draw, vLineLenght)

    # Position and draw the merged nodes level by level, so both children
    # already have a position when their parent is processed.
    for level in range(1, root.level + 1):
        for item in [candidate for candidate in nodeList if candidate.level == level]:
            x = item.left.pos[0] + (item.left.pos[2] / 2)
            w = item.right.pos[0] + (item.right.pos[2] / 2) - x
            y = item.left.pos[1] - (item.level - item.left.level) * vLineLenght
            upperLevel = item.parent.level if item.parent is not None else item.level + 1
            h = (upperLevel - item.level) * vLineLenght
            item.pos = (int(x), int(y), int(w), int(h))
            item.drawNode(img, draw, vLineLenght)
    img.save(resultFile)


# Load the images and get the name of the result file; change the directory
# to wherever the images are stored.
resultFile = loadImg(r"G:\Pythonnotes\test\HierarchicalClusterDataset")
# loadImg returns None (not the string 'None') when the directory is unreadable.
if resultFile is not None:
    run()
    print("結構圖生成成功,最終結構圖存儲于:" + resultFile)

# Summary
以上是生活随笔為你收集整理的聚类(Clustering):hierarchical clustering 层次聚类及其应用的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 智慧交通day02-车流量检测实现04:
- 下一篇: ubyntu 链接mysql_ubunt