Source code for permetrics.utils.cluster_util

# !/usr/bin/env python
# Created by "Matt Q." at 23:05, 27/10/2022 --------%
#       Github: https://github.com/N3uralN3twork    %
#                                                   %
# Improved by: "Thieu" at 17:10, 25/07/2023 --------%
#       Email: nguyenthieu2102@gmail.com            %
#       Github: https://github.com/thieu1995        %
# --------------------------------------------------%

import numpy as np
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.spatial import distance_matrix
from scipy.stats import entropy as calculate_entropy


def compute_clusters(labels):
    """
    Group sample positions by label and count the members of each group.

    Args:
        labels (iterable): One hashable label per sample.

    Returns:
        tuple: ``(dict_clusters, dict_cluster_sizes)`` where ``dict_clusters``
        maps each label to the list of sample indices carrying it (in order of
        appearance) and ``dict_cluster_sizes`` maps each label to the length
        of that list.
    """
    members_by_label = {}
    for position, label in enumerate(labels):
        members_by_label.setdefault(label, []).append(position)
    sizes_by_label = {
        label: len(positions) for label, positions in members_by_label.items()
    }
    return members_by_label, sizes_by_label
def compute_barycenters(X, labels):
    """
    Compute the barycenter (mean point) of every cluster and of the whole dataset.

    Args:
        X (np.ndarray): The features of the dataset, one row per sample
        labels (np.ndarray): The predicted label of each sample

    Returns:
        barycenters (np.ndarray): One row per cluster, ordered like ``np.unique(labels)``
        overall_barycenter (np.ndarray): The mean of all rows of ``X``
    """
    _, n_features = X.shape
    cluster_ids = np.unique(labels)
    barycenters = np.empty((cluster_ids.size, n_features), dtype=np.float64)
    for row, cluster_id in enumerate(cluster_ids):
        # Boolean mask selecting the samples that belong to this cluster.
        members = X[labels == cluster_id]
        barycenters[row] = members.mean(axis=0)
    overall_barycenter = np.mean(X, axis=0)
    return barycenters, overall_barycenter
def compute_WG(X):
    """
    Compute the scatter matrix of ``X`` (referred to as WG, "Eq.11", by the original comment).

    Args:
        X (np.ndarray): The samples, one row per observation

    Returns:
        np.ndarray: ``C.T @ C`` where ``C`` is ``X`` with the column means subtracted
    """
    # Center every column on its mean before forming the cross-product.
    column_means = np.mean(X, axis=0)
    deviations = X - column_means
    return np.dot(deviations.T, deviations)
def compute_TSS(X):
    """
    Compute the total scattering TSS (total sum of squares) of ``X``.

    Args:
        X (np.ndarray): The samples, one row per observation

    Returns:
        The trace of the scatter matrix returned by ``compute_WG(X)``
    """
    return np.trace(compute_WG(X))
def compute_WGSS(X, labels):
    """
    Calculate the pooled within-cluster sum of squares WGSS.

    Args:
        X (np.ndarray): The samples, one row per observation
        labels (iterable): The cluster label of each sample

    Returns:
        The sum, over all clusters, of the trace of the cluster's scatter matrix
    """
    members_by_label, _ = compute_clusters(labels)
    per_cluster_scatter = [
        np.trace(compute_WG(X[member_indices]))
        for member_indices in members_by_label.values()
    ]
    return np.sum(per_cluster_scatter)
def compute_BGSS(X, labels):
    """
    Compute the between-group dispersion BGSS.

    BGSS is the sum, over all clusters, of the cluster size multiplied by the
    squared Euclidean distance between the cluster barycenter and the
    barycenter of the whole dataset.

    Args:
        X (np.ndarray): The samples, one row per observation
        labels (np.ndarray): The cluster label of each sample; any label
            values are accepted (they do not have to be 0..K-1)

    Returns:
        The between-group dispersion
    """
    barycenters, overall_barycenter = compute_barycenters(X, labels)
    # compute_barycenters lays its rows out in np.unique(labels) order, so the
    # counts returned here line up with the barycenter rows position by
    # position. Indexing the matrix with the label value itself is only valid
    # when the labels happen to be exactly 0..K-1.
    _, cluster_sizes = np.unique(labels, return_counts=True)
    squared_distances = np.sum((barycenters - overall_barycenter) ** 2, axis=1)
    return np.sum(cluster_sizes * squared_distances)
def get_min_dist(X, centers):
    """
    Get the distance from every sample in ``X`` to its nearest center.

    Args:
        X (np.ndarray): The samples, one row per observation
        centers (np.ndarray): The centers, one row per center

    Returns:
        np.ndarray: For each sample, the smallest Euclidean distance to any center
    """
    # Rows index samples, columns index centers.
    pairwise = cdist(X, centers, metric='euclidean')
    return pairwise.min(axis=1)
def get_centroids(X, labels):
    """
    Calculates the centroids from the data given, for each class.

    Args:
        X (pd.DataFrame, np.ndarray): The original data that was clustered
        labels (list, np.ndarray): The predicted cluster assignment values;
            any label values are accepted (they do not have to be 0..K-1)

    Returns:
        centroids (np.ndarray): The centroids given the input data and labels,
        one row per distinct label, ordered like ``np.unique(labels)``
    """
    # A plain list compared with a Python int yields a single bool rather than
    # a per-sample mask, so convert up front.
    labels = np.asarray(labels)
    _, n_features = X.shape
    unique_labels = np.unique(labels)
    centroids = np.empty((len(unique_labels), n_features), dtype=np.float64)
    # Iterate over the labels that actually occur instead of range(K): with
    # labels such as {1, 2, 3} or {2, 5}, range(K) averages empty selections
    # (NaN rows) and skips real clusters.
    for row, label in enumerate(unique_labels):
        centroids[row] = X[labels == label].mean(axis=0)
    return centroids
def compute_contingency_matrix(y_true, y_pred):
    """
    Build the contingency table between two labelings of the same samples.

    Args:
        y_true (list, np.ndarray): The reference label of each sample
        y_pred (list, np.ndarray): The predicted label of each sample

    Returns:
        np.ndarray: Integer matrix of shape (n_true_labels, n_pred_labels).
        Entry ``[i, j]`` counts the samples whose true label is the i-th value
        of ``np.unique(y_true)`` and whose predicted label is the j-th value of
        ``np.unique(y_pred)``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different lengths
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )
    # return_inverse gives, for every sample, the row/column of its label, so
    # no per-sample search through the unique values is needed.
    unique_true, true_idx = np.unique(y_true, return_inverse=True)
    unique_pred, pred_idx = np.unique(y_pred, return_inverse=True)
    contingency = np.zeros((len(unique_true), len(unique_pred)), dtype=np.int64)
    # np.add.at accumulates repeated (row, column) pairs; plain fancy-index
    # "+= 1" would count each distinct pair only once.
    np.add.at(contingency, (true_idx.ravel(), pred_idx.ravel()), 1)
    return contingency
def compute_entropy(labels):
    """
    Compute the base-2 Shannon entropy of the label distribution.

    Args:
        labels (list, np.ndarray): The label of each sample

    Returns:
        ``-sum(p * log2(p))`` where ``p`` is the relative frequency of each distinct label
    """
    label_counts = np.unique(labels, return_counts=True)[1]
    frequencies = label_counts / len(labels)
    weighted_log = frequencies * np.log2(frequencies)
    return -np.sum(weighted_log)
def compute_conditional_entropy(y_true, y_pred):
    """
    Compute the conditional entropy of the true labels given the predicted clusters.

    For every predicted cluster, the entropy of the true labels of its members
    is computed with ``compute_entropy`` and weighted by the fraction of
    samples that fall in that cluster.

    Args:
        y_true (list, np.ndarray): The reference label of each sample
        y_pred (list, np.ndarray): The predicted cluster of each sample

    Returns:
        The weighted sum of the per-cluster entropies
    """
    # Boolean-mask indexing below requires arrays; plain lists do not support it.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    entropy_sum = 0
    # The loop must run over the *predicted* clusters: the mask selects on
    # y_pred, so iterating over the values of y_true yields empty selections
    # whenever the two labelings use different label values.
    for cluster in np.unique(y_pred):
        true_labels_in_cluster = y_true[y_pred == cluster]
        cluster_entropy = compute_entropy(true_labels_in_cluster)
        entropy_sum += len(true_labels_in_cluster) / n_samples * cluster_entropy
    return entropy_sum
def compute_homogeneity(y_true, y_pred):
    """
    Compute the homogeneity of a clustering with respect to the true labels.

    Args:
        y_true (list, np.ndarray): The reference label of each sample
        y_pred (list, np.ndarray): The predicted cluster of each sample

    Returns:
        ``1`` when the entropy of ``y_true`` is zero, otherwise one minus the
        ratio of the conditional entropy (from ``compute_conditional_entropy``)
        to the entropy of ``y_true`` (from ``compute_entropy``)
    """
    entropy_true = compute_entropy(y_true)
    entropy_true_given_pred = compute_conditional_entropy(y_true, y_pred)
    # A zero-entropy reference labeling would make the ratio undefined.
    return 1 if entropy_true == 0 else 1 - entropy_true_given_pred / entropy_true
def compute_confusion_matrix(y_true, y_pred, normalize=False):
    """
    Computes the confusion matrix for a clustering problem given the true labels and the predicted labels.
    http://cran.nexr.com/web/packages/clusterCrit/vignettes/clusterCrit.pdf

    Every unordered pair of samples is classified by whether the two samples
    share a true label and whether they share a predicted label.

    Args:
        y_true (list, np.ndarray): The reference label of each sample
        y_pred (list, np.ndarray): The predicted label of each sample
        normalize (bool): If True, divide the counts by the total number of pairs

    Returns:
        np.ndarray: ``[yy, yn, ny, nn]`` where the first letter says whether the
        pair shares a true label and the second whether it shares a predicted
        label. With ``normalize=True`` the values are fractions; when there are
        fewer than two samples (no pairs) the fractions are all zero.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different lengths
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n = len(y_true)
    if len(y_pred) != n:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n} and {len(y_pred)}"
        )

    def count_pairs(group_sizes):
        # Number of unordered pairs inside groups of the given sizes.
        return int(np.sum(group_sizes * (group_sizes - 1) // 2))

    # Encode labels as 0..K-1 so group sizes can be obtained with bincount
    # instead of comparing all n*(n-1)/2 pairs in Python.
    _, true_idx = np.unique(y_true, return_inverse=True)
    unique_pred, pred_idx = np.unique(y_pred, return_inverse=True)
    true_idx = true_idx.ravel()
    pred_idx = pred_idx.ravel()
    # One integer per (true label, predicted label) combination.
    joint_idx = true_idx * len(unique_pred) + pred_idx

    same_both = count_pairs(np.bincount(joint_idx))
    same_true = count_pairs(np.bincount(true_idx))
    same_pred = count_pairs(np.bincount(pred_idx))
    total_pairs = n * (n - 1) // 2

    yy = same_both
    yn = same_true - same_both
    ny = same_pred - same_both
    nn = total_pairs - same_true - same_pred + same_both

    res = np.array([yy, yn, ny, nn])
    if normalize:
        if total_pairs == 0:
            # No pairs exist; avoid 0/0 producing NaN.
            return res.astype(np.float64)
        return res / total_pairs
    return res