Research Objective

The company needs to analyse some internal data, and the goal is to use the K-means clustering (KMC) algorithm to identify the main groups within a dataset.
The scikit-learn package, which provides a wide range of machine-learning tools (including the KMC algorithm), is used for the implementation.

# Install the scikit-learn package
pip install -U scikit-learn
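
With the package installed, the KMeans API itself is compact. The following is a minimal sketch on a small hypothetical toy dataset (not the internal data), just to show how cluster labels are obtained:

import numpy as np
from sklearn.cluster import KMeans

# Hypothetical toy data: two obvious groups in 2-D space.
points = np.array([[1.0, 1.1], [0.9, 1.0], [1.2, 0.8],
                   [8.0, 8.2], [7.9, 8.1], [8.3, 7.8]])

# n_clusters is the number of groups to find; n_init repeats the algorithm
# with different random initial centers and keeps the best run.
kmeans = KMeans(n_clusters=2, n_init=10)
labels = kmeans.fit_predict(points)    # cluster label assigned to each point
print(labels)                          # e.g. [0 0 0 1 1 1] (label order may vary)
print(kmeans.inertia_)                 # SSE of the final clustering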

Eighteen test configurations are built by varying three parameters (sample size, number of features, and number of clusters), and each is run through the same test.

test_case_configs = (
{'sample_size': 100, 'sample_features': 2, 'cluster_num': 6},
{'sample_size': 100, 'sample_features': 10, 'cluster_num': 6},
{'sample_size': 100, 'sample_features': 100, 'cluster_num': 6},

{'sample_size': 1000, 'sample_features': 2, 'cluster_num': 6},
{'sample_size': 1000, 'sample_features': 10, 'cluster_num': 6},
{'sample_size': 1000, 'sample_features': 100, 'cluster_num': 6},

{'sample_size': 10000, 'sample_features': 2, 'cluster_num': 6},
{'sample_size': 10000, 'sample_features': 10, 'cluster_num': 6},
{'sample_size': 10000, 'sample_features': 100, 'cluster_num': 6},

{'sample_size': 100, 'sample_features': 2, 'cluster_num': 12},
{'sample_size': 100, 'sample_features': 10, 'cluster_num': 12},
{'sample_size': 100, 'sample_features': 100, 'cluster_num': 12},

{'sample_size': 1000, 'sample_features': 2, 'cluster_num': 12},
{'sample_size': 1000, 'sample_features': 10, 'cluster_num': 12},
{'sample_size': 1000, 'sample_features': 100, 'cluster_num': 12},

{'sample_size': 10000, 'sample_features': 2, 'cluster_num': 12},
{'sample_size': 10000, 'sample_features': 10, 'cluster_num': 12},
{'sample_size': 10000, 'sample_features': 100, 'cluster_num': 12},

)
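
Each configuration is passed to scikit-learn's make_blobs to synthesize labelled sample data. As a rough sketch, one entry from the list above expands to roughly the following call (the mapping of config keys to make_blobs parameters matches the full code further down):

from sklearn.datasets import make_blobs

config = {'sample_size': 1000, 'sample_features': 2, 'cluster_num': 6}

# make_blobs returns the sample matrix and the true group tag of each sample.
data_set, data_tags = make_blobs(n_samples=config['sample_size'],
                                 n_features=config['sample_features'],
                                 centers=config['cluster_num'])
print(data_set.shape)  # (1000, 2)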

Test Results

Each test produces an Elbow plot, a scatter plot of the original samples, and a clustering plot for each value of k.
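
The Elbow plot is produced by running KMeans for a range of k values and recording the SSE (the model's inertia_) of each run; a minimal standalone sketch of that loop, using make_blobs data in place of the test cases above:

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

data_set, _ = make_blobs(n_samples=1000, n_features=2, centers=6)

# Try k = 1 .. k_max and record the SSE (inertia_) of each clustering.
sse = []
k_max = 9  # true cluster count (6) plus 3, matching the K_MAX used in the full code below
for k in range(1, k_max + 1):
    kmeans = KMeans(n_clusters=k, n_init=10)
    kmeans.fit(data_set)
    sse.append(kmeans.inertia_)  # SSE: sum of squared distances to the assigned centers

# The "elbow", where the SSE curve stops dropping sharply, suggests a reasonable k.
plt.plot(range(1, k_max + 1), sse, marker='o')
plt.xlabel('K (number of clusters)')
plt.ylabel('SSE')
plt.show()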

100 samples, 2 features, 6 clusters

100 samples, 10 features, 6 clusters

100 samples, 100 features, 6 clusters

1000 samples, 2 features, 6 clusters

1000 samples, 10 features, 6 clusters

1000 samples, 100 features, 6 clusters

10000 samples, 2 features, 6 clusters

10000 samples, 10 features, 6 clusters

10000 samples, 100 features, 6 clusters

100 samples, 2 features, 12 clusters

100 samples, 10 features, 12 clusters

100 samples, 100 features, 12 clusters

1000 samples, 2 features, 12 clusters

1000 samples, 10 features, 12 clusters

1000 samples, 100 features, 12 clusters

10000 samples, 2 features, 12 clusters

10000 samples, 10 features, 12 clusters

Code


import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs


class LTKmeans:
    """
    test_case_01 workflow:
    1. Generate sample data: generate_test_data_set()
    2. Plot the sample data: plot_data_set()
    3. Compute the SSE (sum of the squared errors): calculate_sse()
    4. Plot the SSE (Elbow plot): plot_sse()
    5. Compute and plot the clustering result for each k: plot_foreach_kmeans_cluster()
    6. Show the figure: show()
    """
    SAMPLE_SIZE = 0          # number of samples
    SAMPLE_FEATURES = 0      # number of features (dimensions)
    SAMPLE_CLUSTER_NUM = 0   # number of clusters
    K_MAX = 0                # largest k to try
    SUBPLOT_COL = 4          # subplot columns
    SUBPLOT_ROW = 0          # subplot rows
    figsize = [16, 16]

    data_set = None
    data_tags = None
    sse = []
    figure = None

    def __init__(self,
                 title="K-means",
                 sample_size=1000,
                 sample_features=20,
                 cluster_num=5,
                 ):
        self.config(sample_size=sample_size,
                    sample_features=sample_features,
                    cluster_num=cluster_num,
                    print_config=False,
                    )
        self._init_figure(title=title)

    def in_notebook(self):
        try:
            from IPython import get_ipython
            if 'IPKernelApp' not in get_ipython().config:  # pragma: no cover
                return False
        except ImportError:
            return False
        except AttributeError:
            return False
        return True

    def _init_figure(self, title):
        plt.rcParams["figure.figsize"] = self.figsize
        plt.rcParams["figure.autolayout"] = True
        if not self.in_notebook():
            plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # so non-ASCII text renders in plots
        self.figure = plt.figure()
        self.figure.canvas.manager.set_window_title(title)

    def config(self,
               sample_size=1000,
               sample_features=20,
               cluster_num=5,
               print_config=False,
               ):
        self.SAMPLE_CLUSTER_NUM = cluster_num
        self.K_MAX = self.SAMPLE_CLUSTER_NUM + 3
        self.SAMPLE_SIZE = sample_size
        self.SAMPLE_FEATURES = sample_features
        self.SUBPLOT_ROW = int(self.K_MAX / self.SUBPLOT_COL) + 2  # subplot rows

        if print_config:
            print(f"""
            Config:
            sample size: {self.SAMPLE_SIZE}
            sample features (dimensions): {self.SAMPLE_FEATURES}
            cluster count: {self.SAMPLE_CLUSTER_NUM}
            """)

    def generate_test_data_set(self):
        # make_blobs is a scikit-learn helper that generates clustered test data.
        # It produces the requested number of samples around the requested cluster centers,
        # and is commonly used to test, validate and visualise clustering algorithms.
        self.data_set, self.data_tags = make_blobs(n_samples=self.SAMPLE_SIZE,       # number of samples
                                                   n_features=self.SAMPLE_FEATURES,  # number of features (dimensions)
                                                   centers=self.SAMPLE_CLUSTER_NUM,  # number of clusters
                                                   # random_state=42  # fix the seed to make every run reproducible
                                                   )

    def plot_data_set(self):
        ax = self.figure.add_subplot(self.SUBPLOT_ROW, self.SUBPLOT_COL, 3)

        ax.set_title(f'Sample data appearance({self.SAMPLE_CLUSTER_NUM} groups)')
        ax.scatter(self.data_set.T[0],
                   self.data_set.T[1],
                   s=4,               # s: marker size of each point
                   c=self.data_tags,  # c: colour of each point, here the true group tag from make_blobs
                   cmap=plt.cm.Set1,  # cmap: colormap used to map tag values to colours
                   )  # plot the first two features, coloured by the original group tags

        ax = self.figure.add_subplot(self.SUBPLOT_ROW, self.SUBPLOT_COL, 4)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.text(0.02, 0.9,
                f'Sample data size: {self.SAMPLE_SIZE}\nSample features(dimension): {self.SAMPLE_FEATURES}\nSample group size: {self.SAMPLE_CLUSTER_NUM}',
                ha='left',
                va='top',
                transform=ax.transAxes,
                fontsize=12)

    def calculate_sse(self):
        # Try different cluster counts and record the SSE for each.
        self.sse.clear()
        for k in range(1, self.K_MAX + 1):
            kmeans = KMeans(
                n_clusters=k,
                # n_clusters: number of clusters to form; e.g. n_clusters=3 splits the data into 3 clusters.
                n_init=100
                # Open question: how large does n_init need to be before the result is accurate enough?
                # n_init: number of times K-means is run with different random initial centers.
                # Because the result depends on the initial centers, more runs improve stability and accuracy.
            )
            kmeans.fit(self.data_set)
            self.sse.append(kmeans.inertia_)

    def plot_sse(self):
        pd_sse = pd.Series(self.sse)

        ax = self.figure.add_subplot(self.SUBPLOT_ROW,
                                     self.SUBPLOT_COL,
                                     (1, 2))

        # Draw the Elbow plot
        ax.plot(range(1, self.K_MAX + 1), pd_sse.tolist(),
                color='blue',
                marker='o',
                linewidth=1,
                markersize=4)
        ax.set_title('Elbow Method')
        ax.set_xlabel('K (number of clusters)')
        # ax.set_ylabel('SSE')

    def plot_foreach_kmeans_cluster(self):
        for k in range(1, self.K_MAX + 1):
            kmeans = KMeans(n_clusters=k, n_init=10)
            kmeans.fit(self.data_set)               # cluster the data
            new_dy = kmeans.predict(self.data_set)  # predicted cluster label for each sample

            ax = self.figure.add_subplot(self.SUBPLOT_ROW,
                                         self.SUBPLOT_COL,
                                         k + 4)
            ax.axes.get_xaxis().set_visible(False)
            ax.axes.get_yaxis().set_visible(False)
            ax.set_title(f'{k} groups')
            ax.scatter(self.data_set.T[0],
                       self.data_set.T[1],
                       s=4,
                       c=new_dy,
                       cmap=plt.cm.Set1)  # plot the clustering result for this k

    def show(self):
        if self.in_notebook():
            plt.draw()
        else:
            # Outside a notebook, open an interactive window.
            plt.show()

    def run_test_case_01(self,
                         sample_size,
                         sample_features,
                         cluster_num):
        self.config(sample_size, sample_features, cluster_num)
        self.generate_test_data_set()
        self.plot_data_set()
        self.calculate_sse()
        self.plot_sse()
        self.plot_foreach_kmeans_cluster()
        self.show()


test_case_configs = (
    {'sample_size': 100, 'sample_features': 2, 'cluster_num': 6},
    {'sample_size': 100, 'sample_features': 10, 'cluster_num': 6},
    {'sample_size': 100, 'sample_features': 100, 'cluster_num': 6},

    {'sample_size': 1000, 'sample_features': 2, 'cluster_num': 6},
    {'sample_size': 1000, 'sample_features': 10, 'cluster_num': 6},
    {'sample_size': 1000, 'sample_features': 100, 'cluster_num': 6},

    {'sample_size': 10000, 'sample_features': 2, 'cluster_num': 6},
    {'sample_size': 10000, 'sample_features': 10, 'cluster_num': 6},
    {'sample_size': 10000, 'sample_features': 100, 'cluster_num': 6},

    {'sample_size': 100, 'sample_features': 2, 'cluster_num': 12},
    {'sample_size': 100, 'sample_features': 10, 'cluster_num': 12},
    {'sample_size': 100, 'sample_features': 100, 'cluster_num': 12},

    {'sample_size': 1000, 'sample_features': 2, 'cluster_num': 12},
    {'sample_size': 1000, 'sample_features': 10, 'cluster_num': 12},
    {'sample_size': 1000, 'sample_features': 100, 'cluster_num': 12},

    {'sample_size': 10000, 'sample_features': 2, 'cluster_num': 12},
    {'sample_size': 10000, 'sample_features': 10, 'cluster_num': 12},
    {'sample_size': 10000, 'sample_features': 100, 'cluster_num': 12},
)

for config in test_case_configs:
    LTKmeans().run_test_case_01(**config)

Learning Resources