{"id":2775,"date":"2024-10-26T07:32:02","date_gmt":"2024-10-25T23:32:02","guid":{"rendered":"http:\/\/viplao.com\/?p=2775"},"modified":"2024-10-27T22:34:12","modified_gmt":"2024-10-27T14:34:12","slug":"%e6%95%b0%e6%8d%ae%e5%88%86%e6%9e%90-kmeans%e8%81%9a%e7%b1%bb%e5%88%86%e6%9e%90%e5%ae%9e%e8%b7%b5%e6%a1%88%e4%be%8b","status":"publish","type":"post","link":"http:\/\/viplao.com\/index.php\/2024\/10\/26\/%e6%95%b0%e6%8d%ae%e5%88%86%e6%9e%90-kmeans%e8%81%9a%e7%b1%bb%e5%88%86%e6%9e%90%e5%ae%9e%e8%b7%b5%e6%a1%88%e4%be%8b\/","title":{"rendered":"\u6570\u5b57\u5316\u8fd0\u8425\u57fa\u7840\u6280\u80fd &#8211; KMeans\u805a\u7c7b\u5206\u6790\u5b9e\u8df5\u6848\u4f8b"},"content":{"rendered":"\n<p>K\u5747\u503c\u805a\u7c7b<br>\u65e2\u7136\u662f\u805a\u7c7b\u561b\uff0c\u90a3\u80af\u5b9a\u5c31\u7528\u6700\u7ecf\u5178\u4e5f\u6bd4\u8f83\u7b80\u5355\u7684K\u5747\u503c\u805a\u7c7b\u65b9\u6cd5\u3002<\/p>\n\n\n\n<p>K-Means\u7b97\u6cd5\u662f\u4e00\u79cd\u65e0\u76d1\u7763\u7684\u5b66\u4e60\uff0c\u4e8b\u5148\u4e0d\u77e5\u9053\u7c7b\u522b\uff0c\u81ea\u52a8\u5c06\u76f8\u4f3c\u7684\u5bf9\u8c61\u5f52\u5230\u540c\u4e00\u4e2a\u7c07\u4e2d\u3002<\/p>\n\n\n\n<p>K-Means\u4e5f\u662f\u4e00\u79cd\u805a\u7c7b\u5206\u6790\uff08cluster analysis\uff09\u7b97\u6cd5\uff0c\u4e3b\u8981\u7528\u6765\u8ba1\u7b97\u6570\u636e\u7684\u805a\u96c6\uff0c\u901a\u8fc7\u4e0d\u65ad\u5730\u53d6\u79bb\u805a\u7c7b\u4e2d\u5fc3\u70b9\u6700\u8fd1\u7684\u5747\u503c\u6765\u5b9e\u73b0\u3002<\/p>\n\n\n\n<p>\u539f\u7406\u5c31\u4e0d\u591a\u8bf4\u4e86\uff0c\u53cd\u6b63\u4e5f\u90fd\u662f\u7528sklearn\u7684\u5305\u5b9e\u73b0\u3002<\/p>\n\n\n\n<p>\u5bfc\u5165\u5e93<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import matplotlib.pyplot as plt  # \u56fe\u5f62\u5e93\nimport numpy as np\nimport pandas as pd\nfrom sklearn.metrics import silhouette_score  # \u5bfc\u5165\u8f6e\u5ed3\u7cfb\u6570\u6307\u6807\nfrom sklearn.cluster import KMeans  # KMeans\u6a21\u5757\nfrom sklearn.preprocessing import MinMaxScaler, 
OneHotEncoder  # \u6570\u636e\u9884\u5904\u7406\u5e93<\/code><\/pre>\n\n\n\n<p>\u8bfb\u53d6\u6570\u636e<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>raw_data = pd.read_table('sales.txt', delimiter='\\t')<\/code><\/pre>\n\n\n\n<p>\u6570\u636e\u9884\u5904\u7406<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># \u7f3a\u5931\u503c\u5ba1\u67e5\nna_cols = raw_data.isnull().any(axis=0)  # \u67e5\u770b\u6bcf\u4e00\u5217\u662f\u5426\u5177\u6709\u7f3a\u5931\u503c\nprint('{:*^60}'.format('NA Cols:'))\nprint(na_cols&#91;na_cols==True])  # \u67e5\u770b\u5177\u6709\u7f3a\u5931\u503c\u7684\u5217\nprint('Total number of NA lines is: {0}'.format(\n    raw_data.isnull().any(axis=1).sum()))  # \u67e5\u770b\u5177\u6709\u7f3a\u5931\u503c\u7684\u884c\u603b\u8bb0\u5f55\u6570\n\n# \u5220\u9664\u5e73\u5747\u505c\u7559\u65f6\u95f4\u5217\nraw_data2 = raw_data.drop(&#91;'\u5e73\u5747\u505c\u7559\u65f6\u95f4'], axis=1)\n\n# \u5b57\u7b26\u4e32\u5206\u7c7bonehotencode\u5904\u7406\ncols = &#91;'\u7d20\u6750\u7c7b\u578b','\u5e7f\u544a\u7c7b\u578b','\u5408\u4f5c\u65b9\u5f0f','\u5e7f\u544a\u5c3a\u5bf8','\u5e7f\u544a\u5356\u70b9']\nmodel_ohe = OneHotEncoder(sparse=False)  # \u5efa\u7acbOneHotEncoder\u5bf9\u8c61\nohe_matrix = model_ohe.fit_transform(raw_data2&#91;cols])  # \u76f4\u63a5\u8f6c\u6362\nprint(ohe_matrix&#91;:2])\n\n# \u6570\u636e\u6807\u51c6\u5316\nsacle_matrix = raw_data2.iloc&#91;:, 1:7]  # \u83b7\u5f97\u8981\u8f6c\u6362\u7684\u77e9\u9635\nmodel_scaler = MinMaxScaler()  # \u5efa\u7acbMinMaxScaler\u6a21\u578b\u5bf9\u8c61\ndata_scaled = model_scaler.fit_transform(sacle_matrix)  # MinMaxScaler\u6807\u51c6\u5316\u5904\u7406\nprint(data_scaled.round(2))\n\n# \u5408\u5e76\u6240\u6709\u7ef4\u5ea6\nX = np.hstack((data_scaled, ohe_matrix))<\/code><\/pre>\n\n\n\n<p><\/p>\n\n\n\n<p>KMeans\u5efa\u6a21<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># \u901a\u8fc7\u5e73\u5747\u8f6e\u5ed3\u7cfb\u6570\u68c0\u9a8c\u5f97\u5230\u6700\u4f73KMeans\u805a\u7c7b\u6a21\u578b\nscore_list = list()  # 
\u7528\u6765\u5b58\u50a8\u6bcf\u4e2aK\u4e0b\u6a21\u578b\u7684\u5e73\u5747\u8f6e\u5ed3\u7cfb\u6570\nsilhouette_int = -1  # \u521d\u59cb\u5316\u7684\u5e73\u5747\u8f6e\u5ed3\u7cfb\u6570\u9608\u503c\nfor n_clusters in range(2, 5):  # \u904d\u5386\u4ece2\u52304\u51e0\u4e2a\u6709\u9650\u7ec4\n    model_kmeans = KMeans(n_clusters=n_clusters)  # \u5efa\u7acb\u805a\u7c7b\u6a21\u578b\u5bf9\u8c61\n    labels_tmp = model_kmeans.fit_predict(X)  # \u8bad\u7ec3\u805a\u7c7b\u6a21\u578b\n    silhouette_tmp = silhouette_score(X, labels_tmp)  # \u5f97\u5230\u6bcf\u4e2aK\u4e0b\u7684\u5e73\u5747\u8f6e\u5ed3\u7cfb\u6570\n    if silhouette_tmp &gt; silhouette_int:  # \u5982\u679c\u5e73\u5747\u8f6e\u5ed3\u7cfb\u6570\u66f4\u9ad8\n        best_k = n_clusters  # \u5c06\u6700\u597d\u7684K\u5b58\u50a8\u4e0b\u6765\n        silhouette_int = silhouette_tmp  # \u4fdd\u5b58\u5e73\u5747\u8f6e\u5ed3\u5f97\u5206\n        best_kmeans = model_kmeans  # \u4fdd\u5b58\u6a21\u578b\u5b9e\u4f8b\u5bf9\u8c61\n        cluster_labels_k = labels_tmp  # \u4fdd\u5b58\u805a\u7c7b\u6807\u7b7e\n    score_list.append(&#91;n_clusters, silhouette_tmp])  # \u5c06\u6bcf\u6b21K\u53ca\u5176\u5f97\u5206\u8ffd\u52a0\u5230\u5217\u8868\nprint('{:*^60}'.format('K value and silhouette summary:'))\nprint(np.array(score_list))  # \u6253\u5370\u8f93\u51fa\u6240\u6709K\u4e0b\u7684\u8be6\u7ec6\u5f97\u5206\nprint('Best K is:{0} with average silhouette of {1}'.format(best_k, silhouette_int))<\/code><\/pre>\n\n\n\n<p><\/p>\n\n\n\n<p>\u805a\u7c7b\u7ed3\u679c\u5206\u6790<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># part1 \u5c06\u539f\u59cb\u6570\u636e\u4e0e\u805a\u7c7b\u6807\u7b7e\u6574\u5408\ncluster_labels = pd.DataFrame(cluster_labels_k, columns=&#91;'clusters'])  # \u83b7\u5f97\u8bad\u7ec3\u96c6\u4e0b\u7684\u6807\u7b7e\u4fe1\u606f\nmerge_data = pd.concat((raw_data2, cluster_labels), axis=1)  # \u5c06\u539f\u59cb\u5904\u7406\u8fc7\u7684\u6570\u636e\u8ddf\u805a\u7c7b\u6807\u7b7e\u6574\u5408\n\n# part2 
\u8ba1\u7b97\u6bcf\u4e2a\u805a\u7c7b\u7c7b\u522b\u4e0b\u7684\u6837\u672c\u91cf\u548c\u6837\u672c\u5360\u6bd4\nclustering_count = pd.DataFrame(merge_data&#91;'\u6e20\u9053\u4ee3\u53f7'].groupby(merge_data&#91;'clusters']).count()).T.rename({'\u6e20\u9053\u4ee3\u53f7': 'counts'})  # \u8ba1\u7b97\u6bcf\u4e2a\u805a\u7c7b\u7c7b\u522b\u7684\u6837\u672c\u91cf\nclustering_ratio = (clustering_count \/ len(merge_data)).round(2).rename({'counts': 'percentage'})  # \u8ba1\u7b97\u6bcf\u4e2a\u805a\u7c7b\u7c7b\u522b\u7684\u6837\u672c\u91cf\u5360\u6bd4\n\n# part3 \u8ba1\u7b97\u5404\u4e2a\u805a\u7c7b\u7c7b\u522b\u5185\u90e8\u6700\u663e\u8457\u7279\u5f81\u503c\ncluster_features = &#91;]  # \u7a7a\u5217\u8868\uff0c\u7528\u4e8e\u5b58\u50a8\u6700\u7ec8\u5408\u5e76\u540e\u7684\u6240\u6709\u7279\u5f81\u4fe1\u606f\nfor line in range(best_k):  # \u8bfb\u53d6\u6bcf\u4e2a\u7c7b\u7d22\u5f15\n    label_data = merge_data&#91;merge_data&#91;'clusters'] == line]  # \u83b7\u5f97\u7279\u5b9a\u7c7b\u7684\u6570\u636e\n\n    part1_data = label_data.iloc&#91;:, 1:7]  # \u83b7\u5f97\u6570\u503c\u578b\u6570\u636e\u7279\u5f81\n    part1_desc = part1_data.describe().round(3)  # \u5f97\u5230\u6570\u503c\u578b\u7279\u5f81\u7684\u63cf\u8ff0\u6027\u7edf\u8ba1\u4fe1\u606f\n    # merge_data1 = part1_desc.iloc&#91;2, :]  # \u8001\u7248\u672c\uff0c\u5f97\u5230\u6570\u503c\u578b\u7279\u5f81\u7684\u5747\u503c\n    merge_data1 = part1_desc.iloc&#91;1, :]  # \u65b0\u7248\u672c\uff0c\u5f97\u5230\u6570\u503c\u578b\u7279\u5f81\u7684\u5747\u503c\n\n    part2_data = label_data.iloc&#91;:, 7:-1]  # \u83b7\u5f97\u5b57\u7b26\u4e32\u578b\u6570\u636e\u7279\u5f81\n    part2_desc = part2_data.describe(include='all')  # \u83b7\u5f97\u5b57\u7b26\u4e32\u578b\u6570\u636e\u7279\u5f81\u7684\u63cf\u8ff0\u6027\u7edf\u8ba1\u4fe1\u606f\n    merge_data2 = part2_desc.iloc&#91;2, :]  # \u83b7\u5f97\u5b57\u7b26\u4e32\u578b\u6570\u636e\u7279\u5f81\u7684\u6700\u9891\u7e41\u503c\n\n    merge_line = pd.concat((merge_data1, merge_data2), axis=0)  # 
\u5c06\u6570\u503c\u578b\u548c\u5b57\u7b26\u4e32\u578b\u5178\u578b\u7279\u5f81\u6cbf\u884c\u5408\u5e76\n    cluster_features.append(merge_line)  # \u5c06\u6bcf\u4e2a\u7c7b\u522b\u4e0b\u7684\u6570\u636e\u7279\u5f81\u8ffd\u52a0\u5230\u5217\u8868\n# part4 \u8f93\u51fa\u5b8c\u6574\u7684\u7c7b\u522b\u7279\u5f81\u4fe1\u606f\ncluster_pd = pd.DataFrame(cluster_features).T  # \u5c06\u5217\u8868\u8f6c\u5316\u4e3a\u77e9\u9635\nprint('{:*^60}'.format('Detailed features for all clusters:'))\nall_cluster_set = pd.concat((clustering_count, clustering_ratio, cluster_pd),axis=0)  # \u5c06\u6bcf\u4e2a\u805a\u7c7b\u7c7b\u522b\u7684\u6240\u6709\u4fe1\u606f\u5408\u5e76\nprint(all_cluster_set)<\/code><\/pre>\n\n\n\n<p><\/p>\n\n\n\n<p>\u5404\u7c7b\u522b\u663e\u8457\u6570\u503c\u7279\u5f81\u5bf9\u6bd4<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># part1 \u5404\u7c7b\u522b\u6570\u636e\u9884\u5904\u7406\nnum_sets = cluster_pd.iloc&#91;:6, :].T.astype(np.float64)  # \u83b7\u53d6\u8981\u5c55\u793a\u7684\u6570\u636e\nnum_sets_max_min = model_scaler.fit_transform(num_sets)  # \u83b7\u5f97\u6807\u51c6\u5316\u540e\u7684\u6570\u636e\n# part2 \u753b\u5e03\u57fa\u672c\u8bbe\u7f6e\nfig = plt.figure(figsize=(6,6))  # \u5efa\u7acb\u753b\u5e03\nax = fig.add_subplot(111, polar=True)  # \u589e\u52a0\u5b50\u7f51\u683c\uff0c\u6ce8\u610fpolar\u53c2\u6570\nlabels = np.array(merge_data1.index)  # \u8bbe\u7f6e\u8981\u5c55\u793a\u7684\u6570\u636e\u6807\u7b7e\ncor_list = &#91;'b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']  # \u5b9a\u4e49\u4e0d\u540c\u7c7b\u522b\u7684\u989c\u8272\nangles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False)  # \u8ba1\u7b97\u5404\u4e2a\u533a\u95f4\u7684\u89d2\u5ea6\nangles = np.concatenate((angles, &#91;angles&#91;0]]))  # \u5efa\u7acb\u76f8\u540c\u9996\u5c3e\u5b57\u6bb5\u4ee5\u4fbf\u4e8e\u95ed\u5408\nlabels = np.concatenate((labels,&#91;labels&#91;0]]))   # \u65b0\u7248\u672c\u589e\u52a0\uff0c\u5bf9labels\u8fdb\u884c\u5c01\u95ed\n# part3 \u753b\u96f7\u8fbe\u56fe\nfor i in 
range(len(num_sets)):  # \u5faa\u73af\u6bcf\u4e2a\u7c7b\u522b\n    data_tmp = num_sets_max_min&#91;i, :]  # \u83b7\u5f97\u5bf9\u5e94\u7c7b\u6570\u636e\n    data = np.concatenate((data_tmp, &#91;data_tmp&#91;0]]))  # \u5efa\u7acb\u76f8\u540c\u9996\u5c3e\u5b57\u6bb5\u4ee5\u4fbf\u4e8e\u95ed\u5408\n    ax.plot(angles, data, 'o-', c=cor_list&#91;i], label=i)  # \u753b\u7ebf\n# part4 \u8bbe\u7f6e\u56fe\u50cf\u663e\u793a\u683c\u5f0f\nax.set_thetagrids(angles * 180 \/ np.pi, labels, fontproperties=\"SimHei\")  # \u8bbe\u7f6e\u6781\u5750\u6807\u8f74\nax.set_title(\"\u5404\u805a\u7c7b\u7c7b\u522b\u663e\u8457\u7279\u5f81\u5bf9\u6bd4\", fontproperties=\"SimHei\")  # \u8bbe\u7f6e\u6807\u9898\u653e\u7f6e\nax.set_rlim(-0.2, 1.2)  # \u8bbe\u7f6e\u5750\u6807\u8f74\u5c3a\u5ea6\u8303\u56f4\nplt.legend(loc=0)  # \u8bbe\u7f6e\u56fe\u4f8b\u4f4d\u7f6e\n<\/code><\/pre>\n\n\n\n<p><\/p>\n\n\n\n<p><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img decoding=\"async\" loading=\"lazy\" width=\"852\" height=\"741\" src=\"http:\/\/viplao.com\/wp-content\/uploads\/2024\/10\/image-24.png\" alt=\"\" class=\"wp-image-2776\" srcset=\"http:\/\/viplao.com\/wp-content\/uploads\/2024\/10\/image-24.png 852w, http:\/\/viplao.com\/wp-content\/uploads\/2024\/10\/image-24-300x261.png 300w, http:\/\/viplao.com\/wp-content\/uploads\/2024\/10\/image-24-768x668.png 768w, http:\/\/viplao.com\/wp-content\/uploads\/2024\/10\/image-24-345x300.png 345w\" sizes=\"(max-width: 852px) 100vw, 852px\" \/><\/figure>\n","protected":false},"excerpt":{"rendered":"<p>K\u5747\u503c\u805a\u7c7b\u65e2\u7136\u662f\u805a\u7c7b\u561b\uff0c\u90a3\u80af\u5b9a\u5c31\u7528\u6700\u7ecf\u5178\u4e5f\u6bd4\u8f83\u7b80\u5355\u7684K\u5747\u503c\u805a\u7c7b\u65b9\u6cd5\u3002 K-Means\u7b97\u6cd5\u662f\u4e00\u79cd\u65e0\u76d1\u7763&hellip; <a href=\"http:\/\/viplao.com\/index.php\/2024\/10\/26\/%e6%95%b0%e6%8d%ae%e5%88%86%e6%9e%90-kmeans%e8%81%9a%e7%b1%bb%e5%88%86%e6%9e%90%e5%ae%9e%e8%b7%b5%e6%a1%88%e4%be%8b\/\" class=\"more-link 
read-more\" rel=\"bookmark\">\u7ee7\u7eed\u9605\u8bfb <span class=\"screen-reader-text\">\u6570\u5b57\u5316\u8fd0\u8425\u57fa\u7840\u6280\u80fd &#8211; KMeans\u805a\u7c7b\u5206\u6790\u5b9e\u8df5\u6848\u4f8b<\/span><i class=\"fa fa-arrow-right\"><\/i><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[1],"tags":[28],"views":502,"_links":{"self":[{"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/posts\/2775"}],"collection":[{"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/comments?post=2775"}],"version-history":[{"count":3,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/posts\/2775\/revisions"}],"predecessor-version":[{"id":2805,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/posts\/2775\/revisions\/2805"}],"wp:attachment":[{"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/media?parent=2775"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/categories?post=2775"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/viplao.com\/index.php\/wp-json\/wp\/v2\/tags?post=2775"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}