k-means
https://www.cnblogs.com/zy230530/p/7029025.html
k-means算法中的k表示聚類為k個簇,means代表取每一個聚類中數據的均值作為該簇的中心(質心)即用每一個類的質心對該簇進行描述。k-means算法的原理比較簡單,但它有缺陷,即其可能收斂到局部最優解(局部最優不如全局最優效果好),且在大規模數據集上收斂速度相對較慢。換種說法,k-means算法是受初始值影響的局部最優的迭代算法。偽代碼實現:
創建k個初始值作為初始質心(要位於數據邊界內)
while 任意一個點的簇分配結果發生改變:
    遍歷數據集中的每一個點:
        遍歷k個質心:
            計算質心與數據點之間的距離
        將數據點分配到距離其最近的質心的簇
    遍歷k個簇:
        計算每個簇中所有點的均值
    得到的k個均值更新為新的質心
https://www.cnblogs.com/pinard/p/6169370.html
R語言代碼
library(tidyverse)
library(corrplot)
library(gridExtra)
library(GGally)
library(knitr)
# Load the wine data set and keep only columns 2-14; column 1 (the class
# label) is dropped so that it does not take part in the clustering.
# NOTE(review): later plots refer to columns by name (e.g. Alcohol), so this
# file presumably carries a header row - confirm.
wines <- read.csv("/home/zwt/PycharmProjects/test/data/wine.data")
wines <- wines[, 2:14]

# Quick inspection: first rows, last rows, per-column summary and structure.
head(wines)
tail(wines)
summary(wines)
str(wines)
# Histogram of every attribute: reshape the 13 columns to long form
# (Attributes / value) and draw one facet per attribute.
wines %>%
  gather(key = "Attributes", value = "value", 1:13) %>%
  ggplot(mapping = aes(x = value, fill = Attributes)) +
  geom_histogram(colour = "black", show.legend = FALSE) +
  facet_wrap(~ Attributes, scales = "free_x") +
  labs(
    title = "Wines Attributes - Histograms",
    x = "Values",
    y = "Frequency"
  ) +
  theme_bw()
# Density plot of every attribute, again one facet per attribute with its
# own x scale.
wines %>%
  gather(key = "Attributes", value = "value", 1:13) %>%
  ggplot(mapping = aes(x = value, fill = Attributes)) +
  geom_density(colour = "black", alpha = 0.5, show.legend = FALSE) +
  facet_wrap(~ Attributes, scales = "free_x") +
  labs(
    title = "Wines Attributes - Density plots",
    x = "Values",
    y = "Density"
  ) +
  theme_bw()
# Box plots of the attributes, ordered by median and drawn horizontally.
# Only columns 1-4 and 6-12 are gathered; columns 5 and 13 are left out of
# the plot. The value axis is limited to [0, 50].
wines %>%
  gather(key = "Attributes", value = "values", c(1:4, 6:12)) %>%
  ggplot(
    mapping = aes(
      x = reorder(Attributes, values, FUN = median),
      y = values,
      fill = Attributes
    )
  ) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Wines Attributes - Boxplots") +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  ylim(0, 50) +
  coord_flip()
# Correlation matrix of all attributes, shown as ellipses in the upper
# triangle.
corrplot(cor(wines), method = "ellipse", type = "upper", tl.cex = 0.9)

# Scatter plot of Total_Phenols against Flavanoids with a linear fit
# (no confidence band).
ggplot(data = wines, mapping = aes(x = Total_Phenols, y = Flavanoids)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Wines Attributes",
    subtitle = "Relationship between Phenols and Flavanoids"
  ) +
  theme_bw()
# Standardize every column with scale() and keep the result as a data frame.
winesNorm <- as.data.frame(scale(wines))

# Same pair of attributes before standardization (excerpt of the raw data)...
p1 <- ggplot(data = wines, mapping = aes(x = Alcohol, y = Malic_Acid)) +
  geom_point() +
  labs(title = "Original data") +
  theme_bw()

# ...and after standardization.
p2 <- ggplot(data = winesNorm, mapping = aes(x = Alcohol, y = Malic_Acid)) +
  geom_point() +
  labs(title = "Normalized data") +
  theme_bw()

# Show both plots side by side.
grid.arrange(p1, p2, ncol = 2)
# k-means with k = 2 on the standardized data; the seed fixes the random
# initial centers.
set.seed(6666)
wines_k2 <- kmeans(winesNorm, centers = 2)

# Pair plot of the first six original attributes, coloured by cluster.
ggpairs(
  cbind(wines, Cluster = as.factor(wines_k2$cluster)),
  mapping = aes(colour = Cluster, alpha = 0.5),
  columns = 1:6,
  upper = list(continuous = "blank"),
  lower = list(continuous = "points"),
  axisLabels = "none",
  switch = "both"
) +
  theme_bw()

# Cluster assigned to each observation
wines_k2[["cluster"]]
# Matrix of cluster centers
wines_k2[["centers"]]
# Number of observations in each cluster
wines_k2[["size"]]
# Between-cluster sum of squares
wines_k2[["betweenss"]]
# Within-cluster sum of squares, one value per cluster
wines_k2[["withinss"]]
# Total within-cluster sum of squares
wines_k2[["tot.withinss"]]
# Total sum of squares
wines_k2[["totss"]]
# Elbow analysis: run k-means for k = 1..10 and record, for every k, the
# between-cluster sum of squares (bss) and the total within-cluster sum of
# squares (wss).
#
# Each k is fitted exactly once and both statistics are read from that single
# fit. Calling kmeans() separately for bss and wss would use two different
# random initialisations, so the two values could describe different
# clusterings.
k_range <- 1:10
bss <- numeric(length(k_range))
wss <- numeric(length(k_range))

set.seed(6666)
for (i in seq_along(k_range)) {
  fit <- kmeans(winesNorm, centers = k_range[i])
  bss[i] <- fit$betweenss
  wss[i] <- fit$tot.withinss
}

# Collect the results in one data frame for plotting. ggplot() is used
# instead of qplot(), which is deprecated in current ggplot2 releases.
elbow <- data.frame(k = k_range, bss = bss, wss = wss)

# Between-cluster sum of squares vs choice of k
p3 <- ggplot(elbow, aes(x = k, y = bss)) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Between-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Total within-cluster sum of squares vs choice of k
p4 <- ggplot(elbow, aes(x = k, y = wss)) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Total within-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Show both curves side by side.
grid.arrange(p3, p4, ncol = 2)
# k-means with k = 3 on the standardized data, seeded for reproducibility.
set.seed(6666)
wines_k3 <- kmeans(winesNorm, centers = 3)

# Mean of every original (unscaled) attribute within each cluster.
aggregate(wines, by = list(wines_k3$cluster), FUN = mean)

# Pair plot of the first six original attributes, coloured by cluster.
ggpairs(
  cbind(wines, Cluster = as.factor(wines_k3$cluster)),
  mapping = aes(colour = Cluster, alpha = 0.5),
  columns = 1:6,
  upper = list(continuous = "blank"),
  lower = list(continuous = "points"),
  axisLabels = "none",
  switch = "both"
) +
  theme_bw()
總結
- 上一篇: 用SQL语句添加删除修改字段_常用SQL
- 下一篇: SCU 3132(博弈)