pytorch 基本用法：Kmeans

2020-03-09

两个类别
1. sklearn 版本
2. pytorch 版本
多类别，测试不同 K 值
1. 生成数据
2. 测试不同 k 值的分数
更多的类

1	torch.__version__

‘1.3.1+cpu’

两个类别

# Imported here so this first snippet runs on its own; the post only shows
# its import lines further down.
import torch
import matplotlib.pyplot as plt

# Build a toy two-class data set: two Gaussian blobs of 100 points each.
n_data = torch.ones(100, 2)
xy0 = torch.normal(2 * n_data, 1)  # random samples with mean (2, 2) and std 1, shape=(100, 2)
c0 = torch.zeros(100)  # ground-truth label 0 for the first blob
xy1 = torch.normal(-2 * n_data, 1)  # random samples with mean (-2, -2) and std 1, shape=(100, 2)
c1 = torch.ones(100)  # ground-truth label 1 for the second blob
X = torch.cat((xy0, xy1), 0)
c = torch.cat((c0, c1), 0)
# Scatter plot of the samples, colored by their ground-truth label.
plt.scatter(X[:, 0], X[:, 1], c=c, s=100, cmap='RdYlGn')
plt.show()
X.shape

数据分布

torch.Size([200, 2])

sklearn 版本

from sklearn.cluster import KMeans
from sklearn import metrics

# Fit scikit-learn's k-means with two clusters and get one label per sample.
kmeans = KMeans(n_clusters=2)
y_pred = kmeans.fit_predict(X)

# Samples colored by predicted cluster, with the fitted centers drawn in red.
centers = kmeans.cluster_centers_
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=20)
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=80, alpha=.8)
plt.show()

# Show the cluster centers and score the clustering with the
# Calinski-Harabasz index.
kmeans.cluster_centers_, metrics.calinski_harabasz_score(X, y_pred)

预测结果

红点代表聚类中心

(array([[ 2.18905603, 1.94598238],
[-2.10763696, -1.98111874]]), 864.2674279012733)

pytorch 版本

# Adapted from https://www.jianshu.com/p/1c000d9296ae
class KMEANS:
    """K-means clustering implemented with torch tensors.

    Each iteration assigns every sample to its nearest center (squared
    Euclidean distance) and then moves every center to the mean of the
    samples assigned to it.

    Attributes set by ``fit_predict``:
        labels: index of the nearest center for every sample, shape
            ``[n_samples]``.
        dists: squared distance from every sample to every center, shape
            ``[n_samples, n_clusters]``.
        centers: current cluster centers, shape ``[n_clusters, n_features]``.
        variation: summed absolute change of ``dists`` between the last two
            assignment steps; used as the convergence measure.
        representative_samples: for every center, the row index of the
            sample closest to it.
        count: number of iterations executed by the last ``fit_predict``.
    """

    def __init__(self, n_clusters=20, max_iter=None, verbose=True, device=torch.device("cpu")):
        """Store the configuration; no computation happens here.

        Args:
            n_clusters: number of clusters to form.
            max_iter: upper bound on the number of iterations, or ``None`` to
                iterate until convergence.
            verbose: when true, print the convergence measure and the
                representative sample indices after every iteration.
            device: torch device on which the computation is carried out.
        """
        self.n_cluster = n_clusters  # kept as an alias of n_clusters for existing callers
        self.n_clusters = n_clusters
        self.labels = None
        self.dists = None  # shape: [x.shape[0], n_clusters]
        self.centers = None
        self.variation = torch.Tensor([float("Inf")]).to(device)
        self.verbose = verbose
        self.started = False
        self.representative_samples = None
        self.max_iter = max_iter
        self.count = 0
        self.device = device

    def fit_predict(self, x):
        """Cluster the rows of ``x`` and return the label of every row.

        Iteration stops once ``variation`` drops below ``1e-3`` or, when
        ``max_iter`` is set, after ``max_iter`` iterations, whichever comes
        first. At least one iteration is always executed.

        Args:
            x: 2-D tensor with one sample per row.

        Returns:
            Tensor of shape ``[x.shape[0]]`` holding the cluster index of
            every sample.

        Raises:
            ValueError: if ``n_clusters`` is smaller than 1 or larger than
                the number of samples.
        """
        x = x.to(self.device)
        if not x.is_floating_point():
            # Means of integer tensors are not defined in torch.
            x = x.float()

        n_samples = x.shape[0]
        if self.n_clusters < 1:
            raise ValueError("n_clusters must be at least 1")
        if n_samples < self.n_clusters:
            raise ValueError(
                "n_clusters (%d) exceeds the number of samples (%d)"
                % (self.n_clusters, n_samples)
            )

        # Reset per-run state so the same instance can be fitted repeatedly.
        self.labels = None
        self.dists = None
        self.variation = torch.Tensor([float("Inf")]).to(self.device)
        self.started = False
        self.count = 0

        # Pick the initial centers at random WITHOUT replacement; drawing the
        # same row twice would create duplicate centers and an empty cluster.
        # A k-means++ style initialization (as in sklearn) would converge faster.
        init_row = torch.randperm(n_samples)[: self.n_clusters].to(self.device)
        self.centers = x[init_row]

        while True:
            # Assignment step.
            self.nearest_center(x)
            # Update step.
            self.update_center(x)
            self.count += 1
            if self.verbose:
                print(self.variation, torch.argmin(self.dists, (0)))
            if torch.abs(self.variation) < 1e-3:
                break
            if self.max_iter is not None and self.count >= self.max_iter:
                break

        self.representative_sample()
        return self.labels

    def nearest_center(self, x):
        """Assign every sample to its nearest center.

        Updates ``labels`` and ``dists``; from the second call of a run on,
        also updates ``variation`` with the summed absolute change of the
        distance matrix (absolute, so that increases and decreases cannot
        cancel each other out).
        """
        # Broadcast to [n_samples, n_clusters, n_features] so that all
        # pairwise differences are computed in a single tensor operation.
        diff = x.unsqueeze(1) - self.centers.unsqueeze(0)
        dists = torch.sum(diff * diff, dim=2)
        self.labels = torch.argmin(dists, dim=1)
        if self.started:
            self.variation = torch.sum(torch.abs(self.dists - dists))
        self.dists = dists
        self.started = True

    def update_center(self, x):
        """Move every center to the mean of the samples assigned to it.

        A cluster that currently has no samples keeps its previous center,
        because the mean of an empty selection would be NaN.
        """
        centers = self.centers.clone()
        for i in range(self.n_clusters):
            mask = self.labels == i
            if mask.any():
                centers[i] = torch.mean(x[mask], dim=0)
        self.centers = centers

    def representative_sample(self):
        """Record, for every center, the index of the sample closest to it.

        The closest real sample is easier to interpret than the center
        itself. The indices are taken from ``dists`` as computed by the most
        recent ``nearest_center`` call.
        """
        self.representative_samples = torch.argmin(self.dists, (0))

结果：

# Cluster the data with the torch implementation defined above.
device = torch.device('cpu')
k = KMEANS(n_clusters=2, max_iter=10, verbose=False, device=device)
y_pred = k.fit_predict(X)
# Coordinates of the sample closest to each cluster center.
representative_samples = X[k.representative_samples]
# Samples colored by predicted cluster, centers in red, representatives in blue.
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=20)
plt.scatter(k.centers[:, 0], k.centers[:, 1], c='red', s=80, alpha=.8)
plt.scatter(representative_samples[:, 0], representative_samples[:, 1], c='blue', s=80, alpha=.8)
plt.show()
k.centers, X[k.representative_samples], metrics.calinski_harabasz_score(X, y_pred)

红点是聚类的中心，蓝点是该类别距离中心最近的点。

(tensor([[-2.1076, -1.9811],
[ 2.1891, 1.9460]]), tensor([[-2.1868, -2.1051],
[ 2.2467, 1.8789]]), 864.2674279012733)

多类别，测试不同 K 值

生成数据

import torch
import matplotlib.pyplot as plt


# Template of ones; scaling it gives the per-blob mean matrix, shape (100, 2).
n_data = torch.ones(100, 2)

# Four blobs with std 1, centered at (2, 2), (-2, -2), (-6, -1) and (5, 5).
# The draws stay in this order so the random stream is consumed identically.
xy0 = torch.normal(2 * n_data, 1)
xy1 = torch.normal(-2 * n_data, 1)
xy2 = torch.normal(n_data * torch.tensor([-6, -1]), 1)
xy3 = torch.normal(n_data * 5, 1)
blobs = (xy0, xy1, xy2, xy3)
X = torch.cat(blobs, 0)

# Ground-truth labels: 100 consecutive entries for each of the four blobs.
c = torch.tensor([label for label in range(4) for _ in range(100)])

# Scatter plot of the samples, colored by their ground-truth label.
plt.scatter(X[:, 0], X[:, 1], c=c, s=20, cmap='RdYlGn')
plt.show()
X.shape

torch.Size([400, 2])

测试不同 k 值的分数

先上结果

K 值	calinski_harabasz_score
2	1244.8720900338403
3	1103.8048055745687
4	1631.0687149071816
4（局部最优）	772.2547910253841
5	1342.9573128315687

某些初始值的选择可能使算法陷入局部最优，而无法到达全局最优

K = 2

# Cluster the four-blob data with K clusters using the torch implementation.
K = 2
device = torch.device('cpu')
k = KMEANS(n_clusters=K, max_iter=10, verbose=False, device=device)
y_pred = k.fit_predict(X)
# Coordinates of the sample closest to each cluster center.
representative_samples = X[k.representative_samples]
# Samples colored by predicted cluster, centers in red, representatives in blue.
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=20)
plt.scatter(k.centers[:, 0], k.centers[:, 1], c='red', s=80, alpha=.8)
plt.scatter(representative_samples[:, 0], representative_samples[:, 1], c='blue', s=80, alpha=.8)
plt.show()
k.centers, X[k.representative_samples], metrics.calinski_harabasz_score(X, y_pred)

(tensor([[-3.9148, -1.5554],
[ 3.3981, 3.4571]]), tensor([[-3.7091, -1.2708],
[ 3.5593, 3.6728]]), 1244.8720900338403)

K = 3

K=3

(tensor([[ 4.8847, 4.8983],
[ 1.7364, 1.8475],
[-3.9680, -1.5762]]), tensor([[ 4.8437, 4.8178],
[ 1.8517, 1.6753],
[-4.2247, -1.2751]]), 1103.8048055745687)

K = 4

K=4(局部最优)

(tensor([[ 5.6401, 4.8637],
[ 4.0151, 4.9221],
[ 1.7591, 1.8439],
[-3.9527, -1.5693]]), tensor([[ 5.5032, 4.8176],
[ 4.1500, 4.9420],
[ 1.8517, 1.6753],
[-3.7091, -1.2708]]), 772.2547910253841)

K=4

(tensor([[-1.9604, -2.0495],
[ 4.8847, 4.8983],
[-5.8682, -1.0887],
[ 1.7835, 1.8935]]), tensor([[-2.0173, -2.1479],
[ 4.8437, 4.8178],
[-5.8178, -1.0374],
[ 1.8578, 2.0665]]), 1631.0687149071816)

K = 5

(tensor([[ 1.7743, 1.8489],
[-5.8682, -1.0887],
[ 3.9343, 4.8072],
[ 5.5917, 4.9437],
[-1.9604, -2.0495]]), tensor([[ 1.8517, 1.6753],
[-5.8178, -1.0374],
[ 3.8467, 4.7662],
[ 5.5032, 4.8176],
[-2.0173, -2.1479]]), 1342.9573128315687)

我的博客

pytorch 基本用法：Kmeans

两个类别

sklearn 版本

pytorch 版本

多类别，测试不同 K 值

生成数据

测试不同 k 值的分数

K = 2

K = 3

K = 4

K = 5

更多的类

About

Categories

Tags

Tag Cloud

Archives

Recents