我的博客

pytorch 基本用法:Kmeans

目录
  1. 两个类别
    1. sklearn 版本
    2. PyTorch 版本
  2. 多类别,测试不同 K 值
    1. 生成数据
    2. 测试不同 k 值的分数
      1. K = 2
      2. K = 3
      3. K = 4
      4. K = 5
  3. 更多的类
1
torch.__version__

'1.3.1+cpu'

两个类别

1
2
3
4
5
6
7
8
9
10
# Build a toy two-class data set: two Gaussian blobs of 100 points each.
n_data = torch.ones(100, 2)

# Blob 0: mean (2, 2), standard deviation 1, shape (100, 2).
xy0 = torch.normal(mean=2 * n_data, std=1)
# Blob 1: mean (-2, -2), standard deviation 1, shape (100, 2).
# Drawn after blob 0 so the random stream is consumed in the same order.
xy1 = torch.normal(mean=-2 * n_data, std=1)

# Ground-truth class ids: 0.0 for the first blob, 1.0 for the second.
c0 = torch.zeros(100)
c1 = torch.ones(100)

# Stack samples and labels along the sample axis.
X = torch.cat([xy0, xy1], dim=0)
c = torch.cat([c0, c1], dim=0)

# Scatter plot coloured by the true class.
plt.scatter(X[:, 0], X[:, 1], c=c, s=100, cmap='RdYlGn')
plt.show()
X.shape

数据分布

torch.Size([200, 2])

sklearn 版本

1
2
3
4
5
6
7
8
9
from sklearn.cluster import KMeans
from sklearn import metrics

# Fit a 2-cluster model and get the cluster id assigned to every sample.
kmeans = KMeans(n_clusters=2)
y_pred = kmeans.fit_predict(X)

# Samples coloured by predicted cluster.
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=20)
# Cluster centres drawn on top in red.
plt.scatter(
    kmeans.cluster_centers_[:, 0],
    kmeans.cluster_centers_[:, 1],
    c='red',
    s=80,
    alpha=.8,
)
plt.show()

# Show the cluster centres together with the Calinski-Harabasz index,
# which is used here as the clustering quality score.
kmeans.cluster_centers_, metrics.calinski_harabasz_score(X, y_pred)

预测结果

红点代表聚类中心

(array([[ 2.18905603, 1.94598238],
[-2.10763696, -1.98111874]]), 864.2674279012733)

PyTorch 版本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Adapted from https://www.jianshu.com/p/1c000d9296ae
class KMEANS:
    """Plain k-means clustering on torch tensors.

    Attributes set by ``fit_predict``:
        labels: long tensor with one cluster index per sample.
        dists: squared Euclidean distances, shape ``[n_samples, n_clusters]``.
        centers: cluster centres, shape ``[n_clusters, n_features]``.
        variation: sum of (previous dists - current dists); used as the
            convergence measure when ``max_iter`` is None.
        representative_samples: for every cluster, the index of the sample
            with the smallest distance to that cluster's centre.
        count: number of iterations executed by the last ``fit_predict``.
    """

    def __init__(self, n_clusters=20, max_iter=None, verbose=True, device=torch.device("cpu")):
        """Store the configuration.

        Args:
            n_clusters: number of clusters to form.
            max_iter: if None, iterate until ``|variation| < 1e-3``;
                otherwise run exactly this many iterations (at least one).
            verbose: print the variation and the representative sample
                indices after every iteration.
            device: torch device on which the computation is carried out.
        """
        self.n_cluster = n_clusters  # alias of n_clusters, kept for existing callers
        self.n_clusters = n_clusters
        self.labels = None
        self.dists = None  # shape: [x.shape[0], n_clusters]
        self.centers = None
        self.variation = torch.Tensor([float("Inf")]).to(device)
        self.verbose = verbose
        self.started = False
        self.representative_samples = None
        self.max_iter = max_iter
        self.count = 0
        self.device = device

    def fit_predict(self, x):
        """Cluster ``x`` and return the cluster index of every sample.

        Args:
            x: 2-D tensor of shape ``[n_samples, n_features]``.

        Returns:
            Long tensor of shape ``[n_samples]``. The labels come from the
            assignment step of the final iteration, i.e. they were computed
            before the last centre update.

        Raises:
            ValueError: if ``x`` contains no samples.
        """
        x = x.to(self.device)
        n_samples = x.shape[0]
        if n_samples == 0:
            raise ValueError("KMEANS.fit_predict requires at least one sample")

        # Reset per-run state so the instance can be fitted more than once.
        self.count = 0
        self.started = False
        self.dists = None
        self.variation = torch.Tensor([float("Inf")]).to(self.device)

        # Random initial centres. Sampling without replacement avoids
        # duplicated centres (which immediately produce empty clusters);
        # fall back to sampling with replacement only when there are fewer
        # samples than clusters. k-means++ would converge faster.
        if self.n_clusters <= n_samples:
            init_row = torch.randperm(n_samples)[: self.n_clusters]
        else:
            init_row = torch.randint(0, n_samples, (self.n_clusters,))
        self.centers = x[init_row.to(self.device)]

        while True:
            # Assignment step.
            self.nearest_center(x)
            # Update step.
            self.update_center(x)
            # Count the iteration before testing the limit so that
            # max_iter=N performs N iterations rather than N + 1.
            self.count += 1
            if self.verbose:
                print(self.variation, torch.argmin(self.dists, (0)))
            if self.max_iter is None:
                if torch.abs(self.variation) < 1e-3:
                    break
            elif self.count >= self.max_iter:
                break

        self.representative_sample()
        return self.labels

    def nearest_center(self, x):
        """Assign every sample to its nearest centre.

        Updates ``labels`` and ``dists``; from the second call of a run on,
        also updates ``variation``.
        """
        # Broadcast to [n_samples, n_clusters, n_features] and reduce over the
        # feature axis instead of looping over samples in Python.
        diff = x.unsqueeze(1) - self.centers.unsqueeze(0)
        dists = torch.sum(diff * diff, dim=2)
        self.labels = torch.argmin(dists, dim=1)
        if self.started:
            self.variation = torch.sum(self.dists - dists)
        self.dists = dists
        self.started = True

    def update_center(self, x):
        """Move every centre to the mean of the samples assigned to it."""
        new_centers = self.centers.clone()
        for i in range(self.n_clusters):
            members = x[self.labels == i]
            # The mean of an empty selection is NaN and would poison every
            # later distance computation, so an empty cluster keeps its
            # previous centre.
            if members.shape[0] > 0:
                new_centers[i] = torch.mean(members, dim=0)
        self.centers = new_centers

    def representative_sample(self):
        """Record, per cluster, the index of the sample closest to its centre."""
        self.representative_samples = torch.argmin(self.dists, (0))

结果:

1
2
3
4
5
6
7
8
9
# Cluster the two-blob data with the PyTorch KMEANS implementation.
device = torch.device('cpu')
k = KMEANS(n_clusters=2, max_iter=10,verbose=False,device=device)
y_pred = k.fit_predict(X)

# Samples coloured by predicted cluster, with the cluster centres in red.
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=20)
plt.scatter(k.centers[:, 0], k.centers[:, 1], c='red', s=80, alpha=.8)

# The stray `representative_samples = x` assignment was dropped: `x` is not
# defined at this point (NameError) and the value was never used.
# Look the representative samples up once and reuse them below.
representative_points = X[k.representative_samples]
plt.scatter(representative_points[:, 0], representative_points[:, 1], c='blue', s=80, alpha=.8)
plt.show()

# Centres, representative samples and the Calinski-Harabasz score.
k.centers, representative_points, metrics.calinski_harabasz_score(X, y_pred)

image.png

红点是聚类的中心,蓝点是该类别距离中心最近的点。

(tensor([[-2.1076, -1.9811],
[ 2.1891, 1.9460]]), tensor([[-2.1868, -2.1051],
[ 2.2467, 1.8789]]), 864.2674279012733)

多类别,测试不同 K 值

生成数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
import torch
import matplotlib.pyplot as plt


# Four Gaussian blobs (standard deviation 1) of 100 points each.
n_data = torch.ones(100, 2)

# The draws stay in their original order so the random stream is consumed
# identically.
xy0 = torch.normal(mean=2 * n_data, std=1)        # centred at (2, 2)
xy1 = torch.normal(mean=-2 * n_data, std=1)       # centred at (-2, -2)
xy2 = torch.normal(mean=torch.tensor([-6, -1]) * n_data, std=1)  # centred at (-6, -1)
xy3 = torch.normal(mean=5 * n_data, std=1)        # centred at (5, 5)

X = torch.cat([xy0, xy1, xy2, xy3], dim=0)
# Ground-truth class ids: 100 zeros, 100 ones, 100 twos, 100 threes.
c = torch.arange(4).repeat_interleave(100)

plt.scatter(X[:, 0], X[:, 1], c=c, s=20, cmap='RdYlGn')
plt.show()
X.shape

image.png

torch.Size([400, 2])

测试不同 k 值的分数

先上结果

K 值 calinski_harabasz_score
2 1244.8720900338403
3 1103.8048055745687
4 1631.0687149071816
4(局部最优) 772.2547910253841
5 1342.9573128315687

某些初始值的选择可能使算法陷入局部最优,而无法到达全局最优

K = 2

1
2
3
4
5
6
7
8
9
10
# Cluster the four-blob data with K clusters and score the result.
K = 2
device = torch.device('cpu')
k = KMEANS(n_clusters=K, max_iter=10,verbose=False,device=device)
y_pred = k.fit_predict(X)

# Samples coloured by predicted cluster, with the cluster centres in red.
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=20)
plt.scatter(k.centers[:, 0], k.centers[:, 1], c='red', s=80, alpha=.8)

# The stray `representative_samples = x` assignment was dropped: `x` is not
# defined at this point (NameError) and the value was never used.
# Look the representative samples up once and reuse them below.
representative_points = X[k.representative_samples]
plt.scatter(representative_points[:, 0], representative_points[:, 1], c='blue', s=80, alpha=.8)
plt.show()

# Centres, representative samples and the Calinski-Harabasz score.
k.centers, representative_points, metrics.calinski_harabasz_score(X, y_pred)

(tensor([[-3.9148, -1.5554],
[ 3.3981, 3.4571]]), tensor([[-3.7091, -1.2708],
[ 3.5593, 3.6728]]), 1244.8720900338403)

image.png

K = 3

K=3

(tensor([[ 4.8847, 4.8983],
[ 1.7364, 1.8475],
[-3.9680, -1.5762]]), tensor([[ 4.8437, 4.8178],
[ 1.8517, 1.6753],
[-4.2247, -1.2751]]), 1103.8048055745687)

K = 4

K=4(局部最优)

(tensor([[ 5.6401, 4.8637],
[ 4.0151, 4.9221],
[ 1.7591, 1.8439],
[-3.9527, -1.5693]]), tensor([[ 5.5032, 4.8176],
[ 4.1500, 4.9420],
[ 1.8517, 1.6753],
[-3.7091, -1.2708]]), 772.2547910253841)

K=4

(tensor([[-1.9604, -2.0495],
[ 4.8847, 4.8983],
[-5.8682, -1.0887],
[ 1.7835, 1.8935]]), tensor([[-2.0173, -2.1479],
[ 4.8437, 4.8178],
[-5.8178, -1.0374],
[ 1.8578, 2.0665]]), 1631.0687149071816)

K = 5

image.png

(tensor([[ 1.7743, 1.8489],
[-5.8682, -1.0887],
[ 3.9343, 4.8072],
[ 5.5917, 4.9437],
[-1.9604, -2.0495]]), tensor([[ 1.8517, 1.6753],
[-5.8178, -1.0374],
[ 3.8467, 4.7662],
[ 5.5032, 4.8176],
[-2.0173, -2.1479]]), 1342.9573128315687)

更多的类

1
2
3
4
5
6
7
8
# Larger experiment: N points in D dimensions, grouped into K clusters.
N, D, K = 10000, 2, 50
x = torch.randn(N, D) / 6 + .5

# Use K rather than a second hard-coded 50 so the cluster count is set in
# exactly one place.
kmeans = KMeans(n_clusters=K)
y_pred = kmeans.fit_predict(x)

plt.figure(figsize=(8,8))
# Marker size shrinks as the number of points grows.
plt.scatter(x[:, 0].cpu(), x[:, 1].cpu(), c=y_pred, s= 30000 / len(x), cmap="tab10")
# Cluster centres in black.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='black', s=50, alpha=.8)
plt.axis([0,1,0,1]) ; plt.tight_layout() ; plt.show()

image.png

参考:

https://www.jianshu.com/p/1c000d9296ae

https://www.jianshu.com/p/4902096f6d4b

https://www.kernel-operations.io/keops/_auto_tutorials/kmeans/plot_kmeans_torch.html

评论无需登录,可以匿名,欢迎评论!