import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset and echo it so the run log shows what is being clustered.
data = pd.read_csv('cpu.csv')
print(data)

# Feature matrix = every column but the last; the last column is kept apart
# (the disabled experiments below treat it as the class label).
df_X, df_y = data.iloc[:, :-1], data.iloc[:, -1]

# ---------------------------------------------------------------------------
# Disabled experiment: min-max scaling + Gaussian Naive Bayes on one sample X.
# Kept for reference only; none of the code below is executed.
# ---------------------------------------------------------------------------
#
# X = [100,200,6000,150,160,125]
#
# def scaler(data,min=0,max=10):
#     return ((data - data.min()) / (data.max() - data.min())) * (max - min) + min
#
# df_scaler = scaler(df_X,1,10)
# print(df_X)
#
# X_scaled = (X - df_X.min()) / (df_X.max() - df_X.min()) * (10 - 0) + 0
# print(X_scaled)
#
# encode = LabelEncoder()
# df_y = encode.fit_transform(df_y)
# print(df_y)
#
# def xac_suat_lt(x, tb, ps):
#     return (1 / np.sqrt(2 * np.pi * (ps + 1e-9))) * np.exp(-(x - tb) ** 2 / (2 * (ps + 1e-9)))
#
# def naive(df,df_y, X):
#     classes = df_y
#     list_sx = {}
#
#     for c in classes:
#         cur_clas = df
#         tb = cur_clas.mean()
#         ps = cur_clas.var()
#         xs = len(cur_clas) / len(df)
#         for i in range(len(X)):
#             xs *= xac_suat_lt(X[i], tb[i], ps[i])
#         list_sx[c] = xs
#
#     predicted_class = max(list_sx, key=list_sx.get)
#     return predicted_class, list_sx
#
# class_du_bao, xac_suat_lop = naive(df_scaler,df_y, X_scaled)
# print("Xác suất từng lớp:", xac_suat_lop)
# print(f"X thuộc: Loại {class_du_bao + 1}")

# ---------------------------------------------------------------------------
# Disabled experiment: hand-written k-means ("gop") over df_X.
# Kept for reference only; none of the code below is executed.
# ---------------------------------------------------------------------------
# def euclidean(p1, p2):
#     return np.sqrt(sum((a - b) ** 2 for a, b in zip(p1, p2)))
#
# def gop(X, k = 4):
#     np.random.seed(26)
#     centroids = df_X.sample(k).values.tolist()
#     prev_cluster = None
#
#     for _ in range(100):
#         clusters = [[] for _ in range(k)]
#         current_cluster = []
#
#         for idx, row in df_X.iterrows():
#             distances = [euclidean(row, c) for c in centroids]
#             clusters[np.argmin(distances)].append(row.values)
#             current_cluster.append(np.argmin(distances))
#
#         new_centroids = []
#         for i in clusters:
#             if len(i) == 0:
#                 new_centroids.append(df_X.sample(1).values[0])
#             else:
#                 new_centroids.append(np.mean(i, axis=0))
#
#         if prev_cluster is not None:
#             if prev_cluster == current_cluster:
#                 break
#
#         centroids = new_centroids
#         prev_cluster = current_cluster
#     return centroids, clusters
#
#
# centroids, clusters =
# gop(X_scaled,4)
#
# for i, cluster in enumerate(clusters):
#     print(f"Cụm {i}: {len(cluster)} mẫu")
#     print(f"Tâm cụm {i}: {centroids[i]}")
#
# distances = [euclidean(X_scaled, c) for c in centroids]
# cluster_x = np.argmin(distances)
# print(f"\nMẫu X thuộc cụm: {cluster_x}")
#
# NOTE: the commented lines above are disabled k-means driver code, kept for
# reference only.


def cluster_distance(c1, c2, linkage="ward"):
    """Return the linkage distance between two clusters of points.

    Args:
        c1: 2-D array-like, one row per point of the first cluster.
        c2: 2-D array-like, one row per point of the second cluster.
        linkage: One of
            "single"   - smallest pairwise Euclidean distance,
            "complete" - largest pairwise Euclidean distance,
            "average"  - mean of all pairwise Euclidean distances,
            "ward"     - Ward's minimum-variance distance,
            "centroid" - Euclidean distance between the cluster means.

    Returns:
        The distance as a float.

    Raises:
        ValueError: If ``linkage`` is not one of the names listed above.
    """
    if linkage in ("single", "complete", "average"):
        # Imported lazily so the centroid-based linkages work without scipy.
        from scipy.spatial.distance import cdist

        pairwise = cdist(c1, c2)
        if linkage == "single":
            return np.min(pairwise)
        if linkage == "complete":
            return np.max(pairwise)
        return np.mean(pairwise)

    if linkage in ("ward", "centroid"):
        first = np.asarray(c1)
        second = np.asarray(c2)
        centroid_gap = np.linalg.norm(first.mean(axis=0) - second.mean(axis=0))
        if linkage == "centroid":
            return centroid_gap
        # Ward's criterion weights the centroid gap by the cluster sizes, so
        # merging two large clusters costs more than merging two small ones.
        # The factor 2 makes the distance between two singletons equal their
        # Euclidean distance.
        n1, n2 = len(first), len(second)
        return np.sqrt(2.0 * n1 * n2 / (n1 + n2)) * centroid_gap

    raise ValueError("Unknown linkage!")


def agglomerative_clustering_df(df, n_clusters=4, linkage="ward"):
    """Cluster the rows of ``df`` bottom-up and return one label per row.

    Every row starts as its own cluster; the two closest clusters (according
    to ``cluster_distance``) are merged repeatedly until only ``n_clusters``
    remain.

    Args:
        df: A pandas DataFrame of numeric features, or any 2-D array-like.
        n_clusters: Number of clusters to stop at (must be >= 1). If it is
            not smaller than the number of rows, every row keeps its own
            cluster.
        linkage: Linkage name forwarded to ``cluster_distance``.

    Returns:
        An integer numpy array of length ``len(df)``; entry ``k`` is the
        cluster id of row ``k``.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 1, or if ``linkage`` is
            unknown (raised by ``cluster_distance``).
    """
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")

    X = df.to_numpy() if hasattr(df, "to_numpy") else np.asarray(df)

    # Clusters hold row *indices*, not row values: recovering labels by
    # value-matching would send every duplicate row to its first occurrence.
    clusters = [[row_idx] for row_idx in range(len(X))]

    while len(clusters) > n_clusters:
        min_dist = float("inf")
        # Fallback pair if no finite distance is found (e.g. NaN data); the
        # loop condition guarantees at least two clusters exist here.
        merge_idx = (0, 1)
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                dist = cluster_distance(X[clusters[i]], X[clusters[j]], linkage)
                if dist < min_dist:
                    min_dist = dist
                    merge_idx = (i, j)

        i, j = merge_idx
        merged = clusters[i] + clusters[j]
        # Delete the higher index first so the lower one stays valid.
        del clusters[j]
        del clusters[i]
        clusters.append(merged)

    labels = np.zeros(len(X), dtype=int)
    for cluster_id, members in enumerate(clusters):
        labels[members] = cluster_id
    return labels


import collections

if __name__ == "__main__":
    # Cluster the feature frame loaded at the top of the script and report
    # how many samples landed in each cluster.
    labels = agglomerative_clustering_df(df_X, n_clusters=4)
    count_clusters = collections.Counter(labels)
    print("Số lượng mẫu mỗi cụm:")
    for cluster_id, count in count_clusters.items():
        print(f"Cụm {cluster_id}: {count} mẫu")