关于k-means聚类的原理可以参考这篇博客:
https://blog.csdn.net/sinat_36710456/article/details/88019323
本篇只讨论基本的代码实现。由于只是对一维数组的聚类,距离公式比较简单:distance = |a - b|,适合初学者理解最基本的原理。
所谓一维数组,就是一组数字,比如:[12, 3, 56, 89, 78, 2, 12, 45, 255, 236]
以下代码实现的是对一组数字的聚类
类别的个数可以设置,改变参数n的值即可
代码如下(建议从最下边的主函数开始看起):
import numpy as np
import matplotlib.pyplot as plt
from time import *
def dis(a, b):
    """Return the one-dimensional distance ``|a - b|`` between two numbers."""
    return abs(a - b)
def take_first(list_):
    """Return the first element of *list_* (used as a sort key for pairs)."""
    return list_[0]
def plot_(centroids, groups_list):
    """Show a bar chart of cluster sizes, one bar per centroid.

    Both arguments are modified in place: every centroid is truncated to
    ``int`` and the two lists are reordered together, ascending by centroid,
    so that each bar stays aligned with its tick label.

    Args:
        centroids: cluster centers; they become the x-axis tick labels.
        groups_list: number of members of each cluster, parallel to
            ``centroids``.
    """
    # Integer centroids are used as the tick labels of the bars.
    for i, centroid in enumerate(centroids):
        centroids[i] = int(centroid)

    # Sort the (centroid, count) pairs by centroid and write them back into
    # the caller's lists so the in-place contract of the function is kept.
    pairs = sorted(zip(centroids, groups_list), key=take_first)
    for i, (centroid, count) in enumerate(pairs):
        centroids[i] = centroid
        groups_list[i] = count

    positions = np.arange(len(centroids))
    plt.bar(positions, groups_list, align='center', color='c', tick_label=centroids)
    plt.title('Distribution ', fontsize=20)
    plt.xlabel('Area_Categories ', fontsize=18)
    plt.ylabel('Numbers', fontsize=18)
    plt.tick_params(axis='both', labelsize=14)
    plt.tight_layout()

    # Write each cluster size at the top of its bar.
    for a, b in zip(positions, groups_list):
        plt.text(a, b, b, ha='center', va='baseline', fontsize=14, fontstyle='italic')
    plt.show()
def do_kmeans(n, area_list, centroids):
    """Run one k-means iteration on one-dimensional data.

    Every value is assigned to its nearest centroid (distance ``|a - b|``)
    and each centroid is then moved to the mean of the values assigned to it.

    Args:
        n: number of clusters.
        area_list: the values to cluster.
        centroids: current cluster centers, one per cluster.

    Returns:
        A tuple ``(new_centroids, groups, loss)``: the updated centers, the
        list of values assigned to each center, and the sum of every value's
        distance to its nearest center.
    """
    # NOTE(review): the tail of this function (nearest-centroid update, mean
    # computation and return) was lost in the source and is reconstructed from
    # the call sites, which unpack three return values — confirm.
    loss = 0
    groups = [[] for _ in range(n)]
    for area in area_list:
        # Start from infinity (not a fixed large number) so values that are
        # very far from every centroid are still assigned to the nearest one.
        min_distance = float("inf")
        group_index = 0
        for centroid_index, centroid in enumerate(centroids):
            distance = dis(area, centroid)
            if distance < min_distance:
                min_distance = distance
                group_index = centroid_index
        groups[group_index].append(area)
        loss += min_distance

    new_centroids = []
    for old_centroid, group in zip(centroids, groups):
        # An empty cluster keeps its previous center instead of dividing by 0.
        new_centroids.append(sum(group) / len(group) if group else old_centroid)
    return new_centroids, groups, loss


def compute_centroids(list_path, n, loss_convergence, iterations_num):
    """Cluster the numbers stored in a text file into ``n`` groups and plot them.

    Args:
        list_path: path of a text file with one number per line.
        n: number of clusters.
        loss_convergence: stop once the loss changes by less than this
            between two consecutive iterations.
        iterations_num: upper bound on the iteration counter.

    Raises:
        ValueError: if ``n`` is smaller than 1 or the file holds fewer than
            ``n`` values.
    """
    # NOTE(review): the header of this function and the start of the reading
    # loop were lost in the source; the signature is taken from the call at
    # the bottom of the script — confirm.
    area_list = []
    with open(list_path) as f:
        for line in f:
            temp = line.strip()
            # Skip blank lines only, so single-digit values are kept.
            if temp:
                area_list.append(float(temp))

    if n < 1 or len(area_list) < n:
        raise ValueError(
            "need at least n=%d values to form %d clusters, got %d"
            % (n, n, len(area_list))
        )

    # Draw the initial centers from distinct positions of the data so the
    # same element is never picked twice.
    centroid_indices = np.random.choice(len(area_list), n, replace=False)
    centroids = [area_list[centroid_index] for centroid_index in centroid_indices]

    centroids, groups, old_loss = do_kmeans(n, area_list, centroids)
    iterations = 1
    while True:
        centroids, groups, loss = do_kmeans(n, area_list, centroids)
        iterations += 1
        print("number:", iterations - 1, "************loss = %f" % loss)
        if abs(old_loss - loss) < loss_convergence or iterations > iterations_num:
            break
        old_loss = loss
        # NOTE(review): assumed to be per-iteration progress output; the
        # nesting was lost together with the indentation — confirm.
        for centroid in centroids:
            print(centroid, '\n')

    # print result
    groups_list = []
    print("--------------------------------")
    for centroid, group in zip(centroids, groups):
        print("k-means result:")
        print(centroid, "-------numbers:", len(group))
        groups_list.append(len(group))
    # NOTE(review): closing separator assumed to be printed once after the
    # loop — confirm.
    print("--------------------------------")
    plot_(centroids, groups_list)
# Settings for the run: input file, number of clusters and stop criteria.
list_path = "G:/iSAID annotation/area.txt"
n = 5
loss_convergence = 1e-10
iterations_num = 500

# Measure the wall-clock time of the whole clustering call.
begin_time = time()

print('**************Start**************')
compute_centroids(list_path, n, loss_convergence, iterations_num)
print('*********Already Finish*********\n')

end_time = time()
run_time = end_time - begin_time
print('运行时间:', run_time)