當前位置：首頁 > 编程资源 > 综合教程 >内容正文

综合教程

k-means

發布時間：2023/12/13 综合教程 27 生活家

生活随笔收集整理的這篇文章主要介紹了 k-means，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

https://www.cnblogs.com/zy230530/p/7029025.html

k-means算法中的k表示聚類為k個簇，means代表取每一個聚類中數據的均值作為該簇的中心（質心）即用每一個類的質心對該簇進行描述。k-means算法的原理比較簡單，但它有缺陷，即其可能收斂到局部最優解（局部最優不如全局最優效果好），且在大規模數據集上收斂速度相對較慢。換種說法，k-means算法是受初始值影響的局部最優的迭代算法。偽代碼實現：

創建k個初始值作為初始質心（要位于數據邊界內）
    當任意一個點的簇分配結果發生改變時（循環）：
        遍歷數據集中的每一個點：
            遍歷k個質心：
                計算質心與數據點之間的距離
            將數據點分配到距離其最近的質心的簇
        遍歷k個簇：
            計算每個簇中所有點的均值
            得到的k個均值更新為新的質心

https://www.cnblogs.com/pinard/p/6169370.html

R語言代碼
library(tidyverse)
library(corrplot)
library(gridExtra)
library(GGally)
library(knitr)
# Load the wine data and keep columns 2 through 14 only, dropping the first
# column (the wine class label), so that only the attributes remain.
# NOTE(review): later plots refer to columns by name (Alcohol, Malic_Acid,
# ...), so the file presumably has a header row -- confirm.
wines <- read.csv('/home/zwt/PycharmProjects/test/data/wine.data')[2:14]

# Quick inspection: first rows, last rows, per-column summary, structure.
head(wines)
tail(wines)
summary(wines)
str(wines)
# Histogram of every attribute, one facet per attribute.
# The 13 attribute columns are reshaped to long form (one row per
# attribute/value pair) so that ggplot can facet on the attribute name.
# pivot_longer() replaces the superseded gather(); the row order differs but
# the plotted values are the same.
wines %>%
  pivot_longer(cols = 1:13, names_to = "Attributes", values_to = "value") %>%
  ggplot(aes(x = value, fill = Attributes)) +
  # bins = 30 is the ggplot2 default; stating it avoids the
  # "stat_bin() using bins = 30" message without changing the plot.
  geom_histogram(bins = 30, colour = "black", show.legend = FALSE) +
  # Each attribute has its own scale, so free the x axis per facet.
  facet_wrap(~Attributes, scales = "free_x") +
  labs(x = "Values", y = "Frequency",
       title = "Wines Attributes - Histograms") +
  theme_bw()
# Density plot of every attribute, one facet per attribute.
# The 13 attribute columns are reshaped to long form first; pivot_longer()
# replaces the superseded gather() (row order differs, plotted values do not).
wines %>%
  pivot_longer(cols = 1:13, names_to = "Attributes", values_to = "value") %>%
  ggplot(aes(x = value, fill = Attributes)) +
  geom_density(colour = "black", alpha = 0.5, show.legend = FALSE) +
  # Each attribute has its own scale, so free the x axis per facet.
  facet_wrap(~Attributes, scales = "free_x") +
  labs(x = "Values", y = "Density",
       title = "Wines Attributes - Density plots") +
  theme_bw()
# Box plots of the attributes, ordered by their median value.
# Only columns 1-4 and 6-12 are included; columns 5 and 13 are left out
# (presumably because their value ranges are much larger -- confirm).
# pivot_longer() replaces the superseded gather() (row order differs, plotted
# values do not).
wines %>%
  pivot_longer(cols = c(1:4, 6:12),
               names_to = "Attributes", values_to = "values") %>%
  ggplot(aes(x = reorder(Attributes, values, FUN = median),
             y = values, fill = Attributes)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Wines Attributes - Boxplots") +
  theme_bw() +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank()) +
  # Restrict the value axis to [0, 50]; values outside this range are dropped
  # by ylim() before the box statistics are computed.
  ylim(0, 50) +
  # Flip so attribute names are on the vertical axis.
  coord_flip()
# Correlation matrix of all attributes, drawn as ellipses in the upper
# triangle only.
corrplot(corr = cor(wines), method = "ellipse", type = "upper", tl.cex = 0.9)

# Scatter plot of Total_Phenols against Flavanoids with a fitted
# linear-regression line (no confidence band).
wines %>%
  ggplot(aes(Total_Phenols, Flavanoids)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Wines Attributes",
          subtitle = "Relationship between Phenols and Flavanoids") +
  theme_bw()
# Standardise every column with scale() (centre and divide by the standard
# deviation), then convert the resulting matrix back into a data frame.
winesNorm <- wines %>%
  scale() %>%
  as.data.frame()

# Same pair of attributes before standardisation (raw data) ...
p1 <- ggplot(wines, aes(Alcohol, Malic_Acid)) +
  geom_point() +
  ggtitle("Original data") +
  theme_bw()

# ... and after standardisation.
p2 <- ggplot(winesNorm, aes(Alcohol, Malic_Acid)) +
  geom_point() +
  ggtitle("Normalized data") +
  theme_bw()

# Show both plots side by side.
grid.arrange(grobs = list(p1, p2), ncol = 2)
# k-means with k = 2 on the standardised data. The seed is fixed because
# kmeans() picks its initial centres at random.
set.seed(6666)
wines_k2 <- kmeans(x = winesNorm, centers = 2)

# Pairwise scatter plots of the first six (unscaled) attributes, coloured by
# the cluster each observation was assigned to.
wines %>%
  cbind(Cluster = as.factor(wines_k2$cluster)) %>%
  ggpairs(
    mapping = aes(colour = Cluster, alpha = 0.5),
    columns = 1:6,
    lower = list(continuous = "points"),
    upper = list(continuous = "blank"),
    axisLabels = "none",
    switch = "both"
  ) +
  theme_bw()
# Cluster assigned to each observation
wines_k2[["cluster"]]
# Matrix of cluster centres
wines_k2[["centers"]]
# Number of observations in each cluster
wines_k2[["size"]]
# Between-cluster sum of squares
wines_k2[["betweenss"]]
# Within-cluster sum of squares, one value per cluster
wines_k2[["withinss"]]
# Total within-cluster sum of squares
wines_k2[["tot.withinss"]]
# Total sum of squares
wines_k2[["totss"]]
# Elbow analysis: fit k-means for k = 1..10 and record, for each k, the
# between-cluster sum of squares (bss) and the total within-cluster sum of
# squares (wss).
k_values <- 1:10

# Preallocate the result vectors instead of growing them inside the loop.
bss <- numeric(length(k_values))
wss <- numeric(length(k_values))

# Run the algorithm for different values of k
set.seed(6666)

for (i in k_values) {
  # Fit once per k and read both statistics from the same model. Calling
  # kmeans() separately for each statistic would use two different random
  # initialisations, so bss[i] and wss[i] could describe different clusterings.
  fit <- kmeans(winesNorm, centers = i)
  bss[i] <- fit$betweenss
  wss[i] <- fit$tot.withinss
}

# Collect the results in a data frame for plotting with ggplot()
# (qplot() is deprecated as of ggplot2 3.4.0).
elbow <- data.frame(k = k_values, bss = bss, wss = wss)

# Between-cluster sum of squares vs Choice of k
p3 <- ggplot(elbow, aes(x = k, y = bss)) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Between-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Total within-cluster sum of squares vs Choice of k
p4 <- ggplot(elbow, aes(x = k, y = wss)) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Total within-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Subplot
grid.arrange(p3, p4, ncol = 2)
# k-means with k = 3 on the standardised data, with a fixed seed because
# kmeans() picks its initial centres at random.
set.seed(6666)
wines_k3 <- kmeans(x = winesNorm, centers = 3)

# Mean of every (unscaled) attribute within each cluster.
aggregate(x = wines, by = list(wines_k3[["cluster"]]), FUN = mean)

# Pairwise scatter plots of the first six (unscaled) attributes, coloured by
# the cluster each observation was assigned to.
wines %>%
  cbind(Cluster = as.factor(wines_k3$cluster)) %>%
  ggpairs(
    mapping = aes(colour = Cluster, alpha = 0.5),
    columns = 1:6,
    lower = list(continuous = "points"),
    upper = list(continuous = "blank"),
    axisLabels = "none",
    switch = "both"
  ) +
  theme_bw()

總結

以上是生活随笔為你收集整理的k-means的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

Means

上一篇：用SQL语句添加删除修改字段_常用SQL
下一篇： SCU 3132（博弈）