# Source code for permetrics.utils.cluster_util

#!/usr/bin/env python
# Created by "Matt Q." at 23:05, 27/10/2022 --------%
#       Github: https://github.com/N3uralN3twork    %
#                                                   %
# Improved by: "Thieu" at 17:10, 25/07/2023 --------%
#       Email: nguyenthieu2102@gmail.com            %
#       Github: https://github.com/thieu1995        %
# --------------------------------------------------%

import numpy as np
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.spatial import distance_matrix
from scipy.stats import entropy as calculate_entropy
from scipy.sparse import coo_matrix
from collections import Counter


def compute_clusters(labels):
    """
    Group the sample indices by cluster label and count the members of each cluster.

    Args:
        labels (np.ndarray): The cluster label of every sample

    Returns:
        dict_clusters (dict): Maps each unique label to the array of sample indices that carry it
        dict_cluster_sizes (dict): Maps each unique label to the number of samples that carry it
    """
    found_labels, found_counts = np.unique(labels, return_counts=True)
    members_by_label = {}
    size_by_label = {}
    # np.unique returns the labels sorted, so both dicts are filled in sorted-label order.
    for label, count in zip(found_labels, found_counts):
        members_by_label[label] = np.where(labels == label)[0]
        size_by_label[label] = count
    return members_by_label, size_by_label


def compute_barycenters(X, labels):
    """
    Compute the mean point of every cluster together with the mean point of the whole dataset.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        labels (np.ndarray): The predicted labels

    Returns:
        barycenters (np.ndarray): One row per cluster, ordered like ``np.unique(labels)``
        overall_barycenter (np.ndarray): The barycenter of all observations
    """
    _, n_features = X.shape
    cluster_ids = np.unique(labels)
    barycenters = np.empty((cluster_ids.size, n_features), dtype=np.float64)
    # Each `row` is a view into `barycenters`, so writing through it fills the matrix in place.
    for row, cluster_id in zip(barycenters, cluster_ids):
        row[:] = X[labels == cluster_id].mean(axis=0)
    overall_barycenter = np.mean(X, axis=0)
    return barycenters, overall_barycenter


def compute_WG(X):
    """
    Compute the scatter matrix WG (Eq.11) of the given observations.

    Args:
        X (np.ndarray): The observations, one row per sample

    Returns:
        np.ndarray: The product of the transposed column-centered matrix with the column-centered matrix
    """
    # Subtract the per-column mean so every column is centered on zero
    deviations = X - np.mean(X, axis=0)
    return np.dot(deviations.T, deviations)


def compute_TSS(X):
    """
    Compute the total scattering TSS (total sum of squares), i.e. the trace of the scatter matrix of X.

    Args:
        X (np.ndarray): The observations, one row per sample

    Returns:
        The trace of ``compute_WG(X)``
    """
    return np.trace(compute_WG(X))


def compute_WGSS(X, labels):
    """
    Calculate the pooled within-cluster sum of squares WGSS or The within-cluster variance

    The clusters are taken from the distinct values found in ``labels``, so the labels do not
    have to be the consecutive integers 0..K-1.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        labels (np.ndarray): The cluster label of every sample

    Returns:
        float: Sum over all clusters of the squared distances between the members and their cluster mean
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    within_var = 0.0
    for label in np.unique(labels):
        members = X[labels == label]
        within_var += np.sum(np.sum((members - members.mean(axis=0)) ** 2, axis=1))
    return within_var


def compute_BGSS(X, labels):
    """
    The between-group dispersion BGSS or between-cluster variance

    The clusters are taken from the distinct values found in ``labels``, so the labels do not
    have to be the consecutive integers 0..K-1.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        labels (np.ndarray): The cluster label of every sample

    Returns:
        float: Sum over all clusters of (cluster size) * (squared distance between the cluster mean and the overall mean)
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    overall_mean = X.mean(axis=0)
    between_var = 0.0
    for label in np.unique(labels):
        members = X[labels == label]
        cluster_size = members.shape[0]
        between_var += cluster_size * np.sum((members.mean(axis=0) - overall_mean) ** 2)
    return between_var


def compute_contingency_matrix(y_true, y_pred):
    """
    Build the contingency table between two labelings.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample

    Returns:
        np.ndarray: Integer matrix whose entry (i, j) counts the samples that carry the i-th unique
        value of ``y_true`` and the j-th unique value of ``y_pred`` (both in ``np.unique`` order)
    """
    unique_true, true_inverse = np.unique(y_true, return_inverse=True)
    unique_pred, pred_inverse = np.unique(y_pred, return_inverse=True)
    true_inverse = np.ravel(true_inverse)
    pred_inverse = np.ravel(pred_inverse)
    # The weights are built from the sample count with an explicit integer dtype, so they stay
    # numeric even when the labels themselves are strings or objects.
    weights = np.ones(true_inverse.shape[0], dtype=np.int64)
    contingency = coo_matrix((weights, (true_inverse, pred_inverse)),
                             shape=(len(unique_true), len(unique_pred)), dtype=np.int64).toarray()
    return contingency


def compute_confusion_matrix(y_true, y_pred, normalize=False):
    """
    Count, over all unordered pairs of samples, how the two labelings agree on "same group / different group".
    http://cran.nexr.com/web/packages/clusterCrit/vignettes/clusterCrit.pdf

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample
        normalize (bool): Divide the four counts by their sum when True

    Returns:
        np.ndarray: ``[yy, yn, ny, nn]`` where the first letter tells whether the pair shares its
        ``y_true`` label and the second letter whether it shares its ``y_pred`` label
    """
    n_samples = len(y_true)
    both_same = only_true_same = only_pred_same = both_differ = 0.
    for idx in range(n_samples - 1):
        # Compare sample idx with every later sample so each pair is visited exactly once
        same_true = y_true[idx + 1:] == y_true[idx]
        same_pred = y_pred[idx + 1:] == y_pred[idx]
        differ_true = ~same_true
        differ_pred = ~same_pred
        both_same += np.sum(same_true & same_pred)
        only_true_same += np.sum(same_true & differ_pred)
        only_pred_same += np.sum(differ_true & same_pred)
        both_differ += np.sum(differ_true & differ_pred)
    counts = np.array([both_same, only_true_same, only_pred_same, both_differ], dtype=np.int64)
    if not normalize:
        return counts
    return counts / np.sum(counts)


def calculate_sum_squared_error_index(X=None, y_pred=None):
    """
    Sum of the squared Euclidean distances between every sample and the barycenter of its cluster.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample (any label values)

    Returns:
        The sum of squared errors
    """
    centers, _ = compute_barycenters(X, y_pred)
    # compute_barycenters orders its rows like np.unique(y_pred); the inverse index therefore maps
    # every sample to the row of its own barycenter even when labels are not 0..K-1.
    _, center_rows = np.unique(y_pred, return_inverse=True)
    own_centers = centers[np.ravel(center_rows)]
    squared_distances = np.sum((X - own_centers) ** 2, axis=1)
    return np.sum(squared_distances)


def calculate_mean_squared_error_index(X=None, y_pred=None):
    """
    Mean of the squared Euclidean distances between every sample and the barycenter of its cluster.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample (any label values)

    Returns:
        The mean squared error
    """
    centers, _ = compute_barycenters(X, y_pred)
    # compute_barycenters orders its rows like np.unique(y_pred); the inverse index therefore maps
    # every sample to the row of its own barycenter even when labels are not 0..K-1.
    _, center_rows = np.unique(y_pred, return_inverse=True)
    own_centers = centers[np.ravel(center_rows)]
    squared_distances = np.sum((X - own_centers) ** 2, axis=1)
    return np.mean(squared_distances)


def calculate_ball_hall_index(X=None, y_pred=None):
    """
    Within-group sum of squared errors divided by the number of clusters.

    The clusters are taken from the distinct values found in ``y_pred``, so the labels do not
    have to be the consecutive integers 0..K-1.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        float: Pooled within-cluster sum of squares divided by the number of clusters
    """
    X = np.asarray(X)
    y_pred = np.asarray(y_pred)
    cluster_labels = np.unique(y_pred)
    wgss = 0.0
    ## For each cluster, find the centroid and then the within-group SSE
    # NOTE(review): the clusterCrit definition averages each cluster's *mean* dispersion (SSE / cluster size);
    # the un-normalised sum is kept here to preserve the existing values - confirm which is intended.
    for label in cluster_labels:
        cluster_k = X[y_pred == label]
        centroid = np.mean(cluster_k, axis=0)
        wgss += np.sum((cluster_k - centroid) ** 2)
    return wgss / len(cluster_labels)


def calculate_calinski_harabasz_index(X=None, y_pred=None, force_finite=True, finite_value=0.0):
    """
    Compute the Calinski-Harabasz index: the ratio of between-cluster to within-cluster dispersion,
    scaled by ``(n_samples - n_clusters) / (n_clusters - 1)``.

    Args:
        X: The X matrix features
        y_pred: The predicted results (any label values, not necessarily 0..K-1)
        force_finite: Make result as finite number
        finite_value: The value that used to replace the infinite value or NaN value.

    Returns:
        The Calinski Harabasz Index

    Raises:
        ValueError: If ``force_finite`` is False and the index is undefined (only 1 cluster, or zero
            within-cluster dispersion).
    """
    n_samples, _ = X.shape
    # cluster_rows[i] is the position of sample i's label inside the sorted unique labels
    cluster_labels, cluster_rows, cluster_sizes = np.unique(y_pred, return_inverse=True, return_counts=True)
    cluster_rows = np.ravel(cluster_rows)
    n_clusters = len(cluster_labels)
    if n_clusters == 1:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Calinski-Harabasz index is undefined when y_pred has only 1 cluster.")
    overall_mean = np.mean(X, axis=0)
    # Calculate between-cluster variance and cluster sizes
    cluster_means = np.array([np.mean(X[cluster_rows == i], axis=0) for i in range(n_clusters)])
    between_var = np.sum(cluster_sizes * np.sum((cluster_means - overall_mean) ** 2, axis=1))
    # Calculate within-cluster variance
    within_var = np.sum((X - cluster_means[cluster_rows]) ** 2)
    if within_var == 0:
        # The ratio below would be infinite (or 0/0), which is what force_finite is documented to replace
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Calinski-Harabasz index is undefined when the within-cluster dispersion is 0.")
    # Calculate the CH Index
    res = (between_var / within_var) * ((n_samples - n_clusters) / (n_clusters - 1))
    return res


def calculate_xie_beni_index(X=None, y_pred=None, force_finite=True, finite_value=1e10):
    """
    Compute the Xie-Beni index: mean squared distance of the samples to their nearest barycenter,
    divided by the smallest squared distance between two barycenters.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The Xie-Beni index

    Raises:
        ValueError: If there is only 1 cluster and ``force_finite`` is False
    """
    if len(np.unique(y_pred)) == 1:
        if not force_finite:
            raise ValueError("The Xie-Beni index is undefined when y_pred has only 1 cluster.")
        return finite_value
    barycenters, _ = compute_barycenters(X, y_pred)
    # Distance from every sample to whichever barycenter is closest to it
    nearest_center_dist = np.min(cdist(X, barycenters, metric='euclidean'), axis=1)
    compactness = np.sum(nearest_center_dist ** 2)
    # Smallest squared distance between any two barycenters
    separation = np.min(pdist(barycenters, metric='sqeuclidean'))
    return (compactness / X.shape[0]) / separation


def calculate_banfeld_raftery_index(X=None, y_pred=None, force_finite=True, finite_value=1e10):
    """
    Compute the Banfeld-Raftery index: sum over the clusters of
    ``size * log(trace(scatter matrix) / size)``.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample
        force_finite (bool): Return ``finite_value`` instead of raising when a cluster has a single sample
        finite_value (float): The value returned in that case

    Returns:
        The Banfeld-Raftery index

    Raises:
        ValueError: If a cluster has only 1 sample and ``force_finite`` is False
    """
    members_by_label, size_by_label = compute_clusters(y_pred)
    total = 0.0
    for label, member_idx in members_by_label.items():
        n_k = size_by_label[label]
        dispersion = np.trace(compute_WG(X[member_idx])) / n_k
        if n_k == 1:
            if not force_finite:
                raise ValueError("The Banfeld-Raftery index is undefined when at least 1 cluster has only 1 sample.")
            return finite_value
        if n_k > 1:
            total += n_k * np.log(dispersion)
    return total


def calculate_davies_bouldin_index(X=None, y_pred=None, force_finite=True, finite_value=1e10):
    """
    Compute the Davies-Bouldin index: the average, over the clusters, of the worst
    ``(delta_k + delta_j) / distance(center_k, center_j)`` ratio against any other cluster, where
    ``delta_k`` is the mean distance of the members of cluster k to its barycenter.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample (any label values)
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The Davies-Bouldin index

    Raises:
        ValueError: If there is only 1 cluster and ``force_finite`` is False
    """
    clusters_dict, _ = compute_clusters(y_pred)
    centers, _ = compute_barycenters(X, y_pred)
    n_clusters = len(clusters_dict)
    if n_clusters == 1:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Davies-Bouldin index is undefined when y_pred has only 1 cluster.")
    # Calculate delta for each cluster.
    # Both helpers enumerate the clusters in np.unique(y_pred) order, so the k-th dict entry belongs to
    # centers[k]; iterating by position avoids looking the dict up with 0..K-1 as if they were labels.
    delta = [np.mean(np.linalg.norm(X[indices] - centers[k], axis=1))
             for k, indices in enumerate(clusters_dict.values())]
    # Calculate the Davies-Bouldin index
    cc = 0.0
    for kdx in range(n_clusters):
        ratios = [(delta[kdx] + delta[jdx]) / np.linalg.norm(centers[kdx] - centers[jdx])
                  for jdx in range(n_clusters) if jdx != kdx]
        cc += np.max(ratios)
    return cc / n_clusters


def calculate_det_ratio_index(X=None, y_pred=None, force_finite=True, finite_value=-1e10):
    """
    Compute the Det-Ratio index: ``det(T) / det(WG)`` where T is the scatter matrix of all the
    samples and WG is the sum of the scatter matrices of the individual clusters.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample
        force_finite (bool): Return ``finite_value`` instead of raising when det(WG) is 0
        finite_value (float): The value returned in that case

    Returns:
        The Det-Ratio index

    Raises:
        ValueError: If det(WG) is 0 and ``force_finite`` is False
    """
    # Only the member indices are needed; the barycenters computed previously were never used.
    clusters_dict, _ = compute_clusters(y_pred)
    T = compute_WG(X)
    scatter_matrices = np.zeros((X.shape[1], X.shape[1]))  # shape of (n_features, n_features)
    for indices in clusters_dict.values():
        # Accumulate the within-group scatter matrix of the current cluster
        scatter_matrices += compute_WG(X[indices])
    t1 = np.linalg.det(scatter_matrices)
    if t1 == 0:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Det-Ratio index is undefined when determinant of matrix is 0.")
    return np.linalg.det(T) / t1


def calculate_dunn_index(X=None, y_pred=None, use_modified=True, force_finite=True, finite_value=0.):
    """
    Compute the Dunn index: smallest between-cluster distance divided by the largest cluster diameter.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample (any label values)
        use_modified (bool): When True the between-cluster distance is measured from the points of one
            cluster to the barycenter of the other; when False it is measured point to point
        force_finite (bool): Return ``finite_value`` instead of raising when the index is undefined
        finite_value (float): The value returned in that case

    Returns:
        The Dunn index

    Raises:
        ValueError: If ``force_finite`` is False and there is only 1 cluster or every cluster has zero diameter
    """
    centers, _ = compute_barycenters(X, y_pred)
    n_clusters = len(centers)
    if n_clusters == 1:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Dunn index is undefined when y_pred has only 1 cluster.")
    # compute_barycenters orders its rows like np.unique(y_pred), so members[k] pairs with centers[k]
    # whatever the actual label values are.
    members = [X[y_pred == label] for label in np.unique(y_pred)]
    # Calculate dmin
    dmin = np.inf
    if use_modified:
        for k0 in range(n_clusters - 1):
            for k1 in range(k0 + 1, n_clusters):
                dkk = np.min(cdist(members[k1], centers[k0].reshape(1, -1), metric='euclidean'))
                dmin = min(dmin, dkk)
    else:
        for k0 in range(n_clusters - 1):
            for k1 in range(k0 + 1, n_clusters):
                dkk = cdist(members[k0], members[k1], metric='euclidean')
                dmin = min(dmin, np.min(dkk))
    # Calculate dmax
    dmax = 0.0
    for points in members:
        # pdist of a single point is empty and np.max would raise on it; a singleton has zero diameter
        if points.shape[0] > 1:
            dmax = max(dmax, np.max(pdist(points, metric="euclidean")))
    if dmax == 0:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Dunn index is undefined when every cluster has zero diameter.")
    return dmin / dmax


def calculate_ksq_detw_index(X=None, y_pred=None, use_normalized=True):
    """
    Compute the Ksq-DetW index: ``K**2 * det(WG)`` where K is the number of clusters and WG is the
    sum of the scatter matrices of the individual clusters.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample (any label values)
        use_normalized (bool): Min-max scale the entries of WG before taking the determinant

    Returns:
        The Ksq-DetW index
    """
    # Iterate over the labels actually present instead of assuming they are 0..K-1
    cluster_labels = np.unique(y_pred)
    scatter_matrices = np.zeros((X.shape[1], X.shape[1]))  # shape of (n_features, n_features)
    for label in cluster_labels:
        scatter_matrices += compute_WG(X[y_pred == label])
    if use_normalized:
        lowest = np.min(scatter_matrices)
        scatter_matrices = (scatter_matrices - lowest) / (np.max(scatter_matrices) - lowest)
    res = len(cluster_labels) ** 2 * np.linalg.det(scatter_matrices)
    return res


def calculate_log_det_ratio_index(X=None, y_pred=None, force_finite=True, finite_value=-1e10):
    """
    Compute the Log Det Ratio index: ``n_samples * log(det(T) / det(WG))`` where T is the scatter
    matrix of all the samples and WG is the sum of the scatter matrices of the individual clusters.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample
        force_finite (bool): Return ``finite_value`` instead of raising when the index is undefined
        finite_value (float): The value returned in that case

    Returns:
        The Log Det Ratio index

    Raises:
        ValueError: If ``force_finite`` is False and det(WG) is 0 or det(T)/det(WG) <= 0
    """
    # Only the member indices are needed; the barycenters computed previously were never used.
    clusters_dict, _ = compute_clusters(y_pred)
    T = compute_WG(X)
    WG = np.zeros((X.shape[1], X.shape[1]))  # shape of (n_features, n_features)
    for indices in clusters_dict.values():
        WG += compute_WG(X[indices])
    t2 = np.linalg.det(WG)
    if t2 == 0:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Log Det Ratio Index is undefined when determinant of matrix WG is 0.")
    t1 = np.linalg.det(T) / t2
    if t1 <= 0:
        # The logarithm below is only defined for a strictly positive ratio
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Log Det Ratio Index is undefined when det(T)/det(WG) <= 0.")
    return X.shape[0] * np.log(t1)


def calculate_silhouette_index_ver1(X=None, y_pred=None):
    """
    Mean silhouette coefficient computed sample by sample from the full distance matrix.

    For every sample, the cohesion is the mean distance to all samples of its own cluster (itself
    included) and the separation is the smallest mean distance to the samples of another cluster
    (0 when there is no other cluster).

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The mean of ``(separation - cohesion) / max(cohesion, separation)`` over the samples
    """
    pairwise = distance_matrix(X, X)
    n_points = X.shape[0]
    cluster_ids = np.unique(y_pred)
    scores = np.zeros(n_points)
    for idx in range(n_points):
        own_label = y_pred[idx]
        cohesion = np.mean(pairwise[idx, y_pred == own_label])
        rival_means = [np.mean(pairwise[idx, y_pred == label]) for label in cluster_ids if label != own_label]
        if len(rival_means) > 0:
            separation = np.min(rival_means)
        else:
            separation = 0
        scores[idx] = (separation - cohesion) / max(cohesion, separation)
    return np.mean(scores)


def calculate_silhouette_index_ver2(X=None, y_pred=None, multi_output=False, force_finite=True, finite_value=-1.):
    """
    Silhouette coefficient computed sample by sample from the full distance matrix.

    For every sample, ``a_i`` is the mean distance to all samples of its own cluster (itself
    included) and ``b_i`` is the smallest mean distance to the samples of another cluster.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample
        multi_output (bool): Return the per-sample coefficients instead of their mean
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The mean silhouette coefficient, or the array of per-sample coefficients when ``multi_output`` is True

    Raises:
        ValueError: If there is only 1 cluster and ``force_finite`` is False
    """
    unique_clusters, cluster_rows = np.unique(y_pred, return_inverse=True)
    cluster_rows = np.ravel(cluster_rows)
    n_clusters = len(unique_clusters)
    if n_clusters == 1:
        # Honour force_finite/finite_value like the other silhouette variants do
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Silhouette Score is undefined when y_pred has only 1 cluster.")
    num_points = len(X)
    pairwise_distances = cdist(X, X)
    # One boolean membership mask per cluster, built once instead of once per sample
    masks = [cluster_rows == k for k in range(n_clusters)]
    silhouette_scores = np.zeros(num_points)
    for i in range(num_points):
        own = cluster_rows[i]
        # Calculate the average distance within the same cluster (a_i)
        a_i = np.mean(pairwise_distances[i, masks[own]])
        # Calculate the average distance to the nearest neighboring cluster (b_i)
        b_i = np.min([np.mean(pairwise_distances[i, masks[k]]) for k in range(n_clusters) if k != own])
        # Calculate the silhouette score for the point
        silhouette_scores[i] = (b_i - a_i) / max(a_i, b_i)
    if multi_output:
        return silhouette_scores
    return np.mean(silhouette_scores)


def calculate_silhouette_index_ver3(X=None, y_pred=None, multi_output=False, force_finite=True, finite_value=-1.):
    """
    Silhouette coefficient computed cluster by cluster from the full distance matrix.

    For every sample, the cohesion is the mean distance to all samples of its own cluster (itself
    included) and the separation is the smallest mean distance to the samples of another cluster.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample
        multi_output (bool): Return the per-sample coefficients instead of their mean
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The mean silhouette coefficient, or the array of per-sample coefficients when ``multi_output`` is True

    Raises:
        ValueError: If there is only 1 cluster and ``force_finite`` is False
    """
    cluster_ids = np.unique(y_pred)
    if len(cluster_ids) == 1:
        if not force_finite:
            raise ValueError("The Silhouette Index is undefined when y_pred has only 1 cluster.")
        return finite_value
    n_points = len(X)
    # Precompute pairwise distances and one membership mask per cluster
    dist = cdist(X, X)
    masks = [y_pred == cluster_id for cluster_id in cluster_ids]
    cohesion = np.zeros(n_points)
    separation = np.zeros(n_points)
    for own, own_mask in enumerate(masks):
        own_idx = np.where(own_mask)[0]
        rows = dist[own_idx]  # distances from the members of this cluster to every sample
        cohesion[own_idx] = np.sum(rows[:, own_idx], axis=1) / np.sum(own_mask)
        rival_means = [np.sum(rows[:, other_mask], axis=1) / np.sum(other_mask)
                       for other, other_mask in enumerate(masks) if other != own]
        separation[own_idx] = np.min(rival_means, axis=0)
    results = (separation - cohesion) / np.maximum(cohesion, separation)
    return results if multi_output else np.mean(results)


def calculate_silhouette_index(X=None, y_pred=None, multi_output=False, force_finite=True, finite_value=-1.):
    """
    Calculates the silhouette score for a given clustering.

    For every sample, the cohesion is the mean distance to all samples of its own cluster (itself
    included) and the separation is the distance to the nearest barycenter of another cluster.

    Args:
        X: A numpy array of shape (n_samples, n_features) representing the data points.
        y_pred: A numpy array of shape (n_samples,) containing the cluster labels for each data point.
            The labels may take any values; they do not have to be 0..K-1.
        multi_output: Return the per-sample coefficients instead of their mean.
        force_finite: Return ``finite_value`` instead of raising when there is only 1 cluster.
        finite_value: The value returned in that case.

    Returns:
        The mean silhouette coefficient, or the array of per-sample coefficients when ``multi_output`` is True.

    Raises:
        ValueError: If there is only 1 cluster and ``force_finite`` is False.
    """
    # cluster_rows[i] is the position of sample i's label inside the sorted unique labels
    unique_clusters, cluster_rows = np.unique(y_pred, return_inverse=True)
    cluster_rows = np.ravel(cluster_rows)
    if len(unique_clusters) == 1:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Silhouette Index is undefined when y_pred has only 1 cluster.")

    n_samples = X.shape[0]
    n_clusters = len(unique_clusters)

    # Calculate pairwise distances between data points (cdist avoids an (n, n, n_features) temporary)
    distances = cdist(X, X)

    # Calculate centers and the distance from every sample to every center
    centers = np.array([np.mean(X[cluster_rows == k], axis=0) for k in range(n_clusters)])
    center_distances = cdist(X, centers)

    res = np.zeros(n_samples)
    for i in range(n_samples):
        own = cluster_rows[i]
        # Calculate average distance to the points within the same cluster for each sample
        aa = np.mean(distances[i, cluster_rows == own])
        # Find the nearest center of a cluster that this i sample doesn't belong to
        bb = np.min(np.delete(center_distances[i], own))
        # Calculate silhouette coefficient for each sample
        res[i] = (bb - aa) / max(aa, bb)

    # Return the average silhouette score
    if multi_output:
        return res
    return np.mean(res)


def calculate_duda_hart_index(X=None, y_pred=None, force_finite=True, finite_value=1e10):
    """
    Compute the Duda-Hart index as implemented here: the sum over the clusters of the mean
    within-cluster pairwise distance, divided by the sum over the clusters of the mean distance
    between the members of the cluster and all the other samples.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The Duda-Hart index

    Raises:
        ValueError: If there is only 1 cluster and ``force_finite`` is False
    """
    cluster_labels = np.unique(y_pred)
    if len(cluster_labels) == 1:
        if not force_finite:
            raise ValueError("The Duda-Hart index is undefined when y_pred has only 1 cluster.")
        return finite_value
    dist = cdist(X, X)
    within_total = 0
    across_total = 0
    for label in cluster_labels:
        inside = np.where(y_pred == label)[0]
        outside = np.where(y_pred != label)[0]
        # Mean over the (members x members) block, then over the (members x non-members) block
        within_total += np.mean(dist[np.ix_(inside, inside)])
        across_total += np.mean(dist[np.ix_(inside, outside)])
    return within_total / across_total


def calculate_beale_index(X=None, y_pred=None, force_finite=True, finite_value=1e10):
    """
    Compute the Beale index as implemented here: the within-cluster mean square divided by the
    between-cluster mean square.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample (any label values)
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The Beale index

    Raises:
        ValueError: If there is only 1 cluster and ``force_finite`` is False
    """
    cluster_labels = np.unique(y_pred)
    n_clusters = len(cluster_labels)
    if n_clusters == 1:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Beale index is undefined when y_pred has only 1 cluster.")
    n_samples = X.shape[0]
    # The second return value is the mean of all samples, so it is computed once instead of per cluster
    centers, overall_center = compute_barycenters(X, y_pred)
    sse_within = 0
    sse_between = 0
    # compute_barycenters orders its rows like np.unique(y_pred), so centers[k] belongs to cluster_labels[k]
    for k, label in enumerate(cluster_labels):
        sse_within += np.sum((X[y_pred == label] - centers[k]) ** 2)
        sse_between += np.sum((centers[k] - overall_center) ** 2)
    df_within = n_samples - n_clusters
    df_between = n_clusters - 1
    ms_within = sse_within / df_within
    ms_between = sse_between / df_between
    result = ms_within / ms_between
    return result


def calculate_r_squared_index(X=None, y_pred=None):
    """
    Compute the R-squared index as implemented here: ``(total_var - var_within) / total_var`` where
    ``total_var`` is the sum of the per-feature variances of X and ``var_within`` is the sum, over
    the clusters, of the per-feature variances of the members.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample (any label values)

    Returns:
        The R-squared index
    """
    total_var = np.var(X, axis=0).sum()
    var_within = 0
    # Iterate over the labels actually present instead of assuming they are 0..K-1
    for label in np.unique(y_pred):
        var_within += np.var(X[y_pred == label], axis=0).sum()
    result = (total_var - var_within) / total_var
    return result


def calculate_density_based_clustering_validation_index(X=None, y_pred=None, force_finite=True, finite_value=1.):
    """
    Compute the density-based clustering validation index as implemented here: the mean of
    ``d / max(m, d)`` where ``d`` is the distance from a sample to a cluster centroid and ``m`` is
    the distance from that sample to its nearest sample belonging to another cluster.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample (any label values)
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The index value

    Raises:
        ValueError: If there is only 1 cluster and ``force_finite`` is False
    """
    cluster_labels = np.unique(y_pred)
    n_clusters = len(cluster_labels)
    if n_clusters == 1:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Density-based Clustering Validation Index is undefined when y_pred has only 1 cluster.")
    n_samples = X.shape[0]
    # One centroid per label actually present (labels need not be 0..K-1)
    centroids = np.array([np.mean(X[y_pred == label], axis=0) for label in cluster_labels])
    intra_cluster_distances = cdist(X, centroids, 'euclidean')
    min_inter_cluster_distances = np.zeros(n_samples)
    for i in range(n_samples):
        # With at least 2 clusters every sample has at least one sample in another cluster,
        # so this mask is never empty.
        other_clusters = y_pred != y_pred[i]
        min_inter_cluster_distances[i] = np.min(cdist(X[i, :].reshape(1, -1), X[other_clusters, :], 'euclidean'))
    result = np.mean(intra_cluster_distances / np.maximum(min_inter_cluster_distances.reshape(-1, 1), intra_cluster_distances), axis=0).mean()
    return result


def calculate_hartigan_index(X=None, y_pred=None, force_finite=True, finite_value=1e10):
    """
    Compute the Hartigan index as implemented here: the sum, over the clusters, of the squared
    distances of the members to their own centroid divided by the squared distances of the same
    members to the closest other centroid.

    Args:
        X (np.ndarray): The features of datasets, one row per sample
        y_pred (np.ndarray): The predicted cluster label of every sample (any label values)
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The Hartigan index

    Raises:
        ValueError: If there is only 1 cluster and ``force_finite`` is False
    """
    centroids, _ = compute_barycenters(X, y_pred)
    cluster_labels = np.unique(y_pred)
    num_clusters = len(cluster_labels)
    if num_clusters == 1:
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Hartigan Index is undefined when y_pred has only 1 cluster.")
    hi = 0.0
    # compute_barycenters orders its rows like np.unique(y_pred), so centroids[idx] belongs to cluster_labels[idx]
    for idx, label in enumerate(cluster_labels):
        cluster_data = X[y_pred == label]
        cluster_centroid = centroids[idx]

        distances_within_cluster = cdist(cluster_data, [cluster_centroid], metric='euclidean') ** 2
        sum_distances_within_cluster = np.sum(distances_within_cluster)

        # Pick the centroid of another cluster that lies closest to this cluster's centroid
        other_centroids = np.delete(centroids, idx, axis=0)
        closest_other_centroid_index = np.argmin(np.linalg.norm(cluster_centroid - other_centroids, axis=1))
        closest_other_centroid = other_centroids[closest_other_centroid_index]

        distances_to_closest_other_cluster = cdist(cluster_data, [closest_other_centroid], metric='euclidean') ** 2
        sum_distances_to_closest_other_cluster = np.sum(distances_to_closest_other_cluster)

        hi += sum_distances_within_cluster / sum_distances_to_closest_other_cluster
    return hi


def calculate_mutual_info_score(y_true=None, y_pred=None):
    """
    Compute the mutual information (natural logarithm) between two labelings from their contingency table.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample

    Returns:
        The mutual information
    """
    # Joint probability of (true label, predicted label) and its two marginals
    joint = compute_contingency_matrix(y_true, y_pred) / y_true.shape[0]
    p_true = np.sum(joint, axis=1)
    p_pred = np.sum(joint, axis=0)
    mi = 0.0
    # Only the non-empty cells contribute; np.nonzero walks them in row-major order
    for idx, jdx in zip(*np.nonzero(joint > 0.0)):
        mi += joint[idx, jdx] * np.log(joint[idx, jdx] / (p_true[idx] * p_pred[jdx]))
    return mi


def calculate_normalized_mutual_info_score(y_true=None, y_pred=None, force_finite=True, finite_value=0.0):
    """
    Compute the mutual information normalized by the arithmetic mean of the entropies of the two labelings.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample
        force_finite (bool): Return ``finite_value`` instead of raising when the score is undefined
        finite_value (float): The value returned in that case

    Returns:
        The normalized mutual information

    Raises:
        ValueError: If ``force_finite`` is False and the mutual information is 0 or one of the
            labelings has a single cluster
    """
    mi = calculate_mutual_info_score(y_true, y_pred)
    n_samples = y_true.shape[0]
    # Count only the labels that occur: np.bincount would add zero-count bins for missing integers
    # (turning 0 * log(0) into NaN) and rejects negative or non-integer labels.
    _, counts_true = np.unique(y_true, return_counts=True)
    _, counts_pred = np.unique(y_pred, return_counts=True)
    if len(counts_true) == 1 or len(counts_pred) == 1 or mi == 0:
        # If either of the clusterings has only one cluster, MI is not defined
        if force_finite:
            return finite_value
        else:
            raise ValueError("The Normalized Mutual Info Score is undefined when MIS = 0 or y_true, y_pred has only 1 cluster.")
    # Calculate entropy of true and predicted clusterings
    p_true = counts_true / n_samples
    p_pred = counts_pred / n_samples
    entropy_true = -np.sum(p_true * np.log(p_true))
    entropy_pred = -np.sum(p_pred * np.log(p_pred))
    # Both labelings have at least 2 clusters here, so both entropies are strictly positive
    denominator = (entropy_true + entropy_pred) / 2.0
    nmi = mi / denominator
    return nmi


def calculate_rand_score(y_true=None, y_pred=None):
    """
    Compute the Rand score: the share of ordered sample pairs on which the two labelings agree
    (same group in both, or different groups in both), derived from the contingency table.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample

    Returns:
        The Rand score; 1.0 when every pair agrees or when there is no pair at all
    """
    total = np.int64(y_true.shape[0])
    table = compute_contingency_matrix(y_true, y_pred)
    row_totals = np.ravel(table.sum(axis=1))
    col_totals = np.ravel(table.sum(axis=0))
    squares = (table ** 2).sum()
    # 2x2 pair table: index 1 = "same group", index 0 = "different group" (rows: y_true, columns: y_pred)
    pair_counts = np.empty((2, 2), dtype=np.int64)
    pair_counts[1, 1] = squares - total
    pair_counts[0, 1] = table.dot(col_totals).sum() - squares
    pair_counts[1, 0] = table.transpose().dot(row_totals).sum() - squares
    pair_counts[0, 0] = total ** 2 - pair_counts[0, 1] - pair_counts[1, 0] - squares
    agreements = pair_counts.diagonal().sum()
    all_pairs = pair_counts.sum()
    # Limit cases (data not split at all, or every sample alone in its cluster) count as perfect matches
    if agreements == all_pairs or all_pairs == 0:
        return 1.0
    return agreements / all_pairs


def calculate_adjusted_rand_score(y_true=None, y_pred=None):
    """
    Compute the adjusted Rand score from the pair counts derived from the contingency table.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample

    Returns:
        The adjusted Rand score; 1.0 when the two labelings never disagree on a pair
    """
    total = np.int64(y_true.shape[0])
    table = compute_contingency_matrix(y_true, y_pred)
    row_totals = np.ravel(table.sum(axis=1))
    col_totals = np.ravel(table.sum(axis=0))
    squares = (table ** 2).sum()
    # 2x2 pair table: index 1 = "same group", index 0 = "different group" (rows: y_true, columns: y_pred)
    pair_counts = np.empty((2, 2), dtype=np.int64)
    pair_counts[1, 1] = squares - total
    pair_counts[0, 1] = table.dot(col_totals).sum() - squares
    pair_counts[1, 0] = table.transpose().dot(row_totals).sum() - squares
    pair_counts[0, 0] = total ** 2 - pair_counts[0, 1] - pair_counts[1, 0] - squares
    # Switch to Python integers so the products below cannot overflow or underflow
    tn, fp, fn, tp = (int(value) for value in pair_counts.ravel())
    # Special cases: empty data or full agreement
    if fn == 0 and fp == 0:
        return 1.0
    numerator = 2.0 * (tp * tn - fn * fp)
    return numerator / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))


def calculate_fowlkes_mallows_score(y_true=None, y_pred=None, force_finite=True, finite_value=0.0):
    """
    Compute the Fowlkes-Mallows score ``sqrt(tk / pk) * sqrt(tk / qk)`` from the contingency table.

    Args:
        y_true (np.ndarray): The reference label of every sample (1-D)
        y_pred (np.ndarray): The predicted label of every sample
        force_finite (bool): Return ``finite_value`` instead of raising when pk or qk is 0
        finite_value (float): The value returned in that case

    Returns:
        The Fowlkes-Mallows score

    Raises:
        ValueError: If pk or qk is 0 and ``force_finite`` is False
    """
    (n_samples,) = y_true.shape
    table = compute_contingency_matrix(y_true, y_pred).astype(np.int64, copy=False)
    cells = table.ravel()
    # Sums of squared cell counts / column totals / row totals, each minus the sample count
    tk = np.dot(cells, cells) - n_samples
    pk = np.sum(np.asarray(table.sum(axis=0)).ravel() ** 2) - n_samples
    qk = np.sum(np.asarray(table.sum(axis=1)).ravel() ** 2) - n_samples
    if pk == 0.0 or qk == 0.0:
        if not force_finite:
            raise ValueError("The Fowlkes Mallows Score is undefined when pk = 0 or qk = 0.")
        return finite_value
    return np.sqrt(tk / pk) * np.sqrt(tk / qk)


def compute_entropy(labels):
    """
    Compute the base-2 entropy of the empirical distribution of the labels.

    Args:
        labels (np.ndarray): The label of every sample

    Returns:
        The entropy in bits
    """
    counts = np.unique(labels, return_counts=True)[1]
    frequencies = counts / len(labels)
    weighted_logs = frequencies * np.log2(frequencies)
    return -np.sum(weighted_logs)


def compute_conditional_entropy(y_true, y_pred):
    """
    Compute the conditional entropy (base 2) of the true labels given the predicted clusters.

    Every predicted cluster contributes the entropy of the true labels found among its members,
    weighted by the share of the samples that fall in that cluster.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The conditional entropy H(y_true | y_pred) in bits
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    entropy_sum = 0
    # The conditioning variable is the prediction, so the loop must run over the clusters found in
    # y_pred (iterating the values of y_true only works when both labelings share the same values).
    for cluster in np.unique(y_pred):
        cluster_labels_true = y_true[y_pred == cluster]
        # Base-2 entropy of the true labels inside this cluster (same formula as compute_entropy)
        _, counts = np.unique(cluster_labels_true, return_counts=True)
        probabilities = counts / len(cluster_labels_true)
        cluster_entropy = -np.sum(probabilities * np.log2(probabilities))
        entropy_sum += len(cluster_labels_true) / n_samples * cluster_entropy
    return entropy_sum


def calculate_homogeneity_score(y_true=None, y_pred=None):
    """
    Compute the homogeneity score: ``1 - H(y_true | y_pred) / H(y_true)``.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The homogeneity score; 1.0 when the entropy of ``y_true`` is 0
    """
    entropy_true = compute_entropy(y_true)
    entropy_true_given_pred = compute_conditional_entropy(y_true, y_pred)
    if entropy_true == 0:
        return 1.0
    return 1. - entropy_true_given_pred / entropy_true


def calculate_completeness_score(y_true=None, y_pred=None):
    """
    Compute the completeness score, i.e. the homogeneity score with the two labelings swapped.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The completeness score
    """
    return calculate_homogeneity_score(y_true=y_pred, y_pred=y_true)


def calculate_v_measure_score(y_true=None, y_pred=None):
    """
    Compute the V-measure: the harmonic mean of the homogeneity and completeness scores.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The V-measure; 0 when homogeneity + completeness is 0
    """
    homogeneity = calculate_homogeneity_score(y_true, y_pred)
    completeness = calculate_completeness_score(y_true, y_pred)
    combined = homogeneity + completeness
    if combined == 0:
        return 0
    return 2 * (homogeneity * completeness) / combined


def calculate_precision_score(y_true=None, y_pred=None):
    """
    Pair-counting precision: ``yy / (yy + ny)`` computed on the normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The precision score
    """
    yy, _, ny, _ = compute_confusion_matrix(y_true, y_pred, normalize=True)
    same_in_pred = yy + ny
    return yy / same_in_pred


def calculate_recall_score(y_true=None, y_pred=None):
    """
    Pair-counting recall: ``yy / (yy + yn)`` computed on the normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The recall score
    """
    yy, yn, _, _ = compute_confusion_matrix(y_true, y_pred, normalize=True)
    same_in_true = yy + yn
    return yy / same_in_true


def calculate_f_measure_score(y_true=None, y_pred=None):
    """
    Pair-counting F-measure: the harmonic mean of the pair-counting precision and recall.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The F-measure score
    """
    yy, yn, ny, _ = compute_confusion_matrix(y_true, y_pred, normalize=True)
    precision = yy / (yy + ny)
    recall = yy / (yy + yn)
    return 2 * precision * recall / (precision + recall)


def calculate_czekanowski_dice_score(y_true=None, y_pred=None):
    """
    Czekanowski-Dice score: ``2 * yy / (2 * yy + yn + ny)`` computed on the normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The Czekanowski-Dice score
    """
    yy, yn, ny, _ = compute_confusion_matrix(y_true, y_pred, normalize=True)
    doubled_agreement = 2 * yy
    return doubled_agreement / (doubled_agreement + yn + ny)


def calculate_hubert_gamma_score(y_true=None, y_pred=None, force_finite=True, finite_value=-1.0):
    """
    Hubert Gamma score computed on the normalized pair counts:
    ``(NT*yy - (yy+yn)*(yy+ny)) / sqrt((yy+yn)*(yy+ny)*(nn+yn)*(nn+ny))`` with ``NT = yy+yn+ny+nn``.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The Hubert Gamma score

    Raises:
        ValueError: If ``y_pred`` has only 1 cluster and ``force_finite`` is False
    """
    if len(np.unique(y_pred)) == 1:
        if not force_finite:
            raise ValueError("The Hubert Gamma score is undefined when y_pred has only 1 cluster.")
        return finite_value
    yy, yn, ny, nn = compute_confusion_matrix(y_true, y_pred, normalize=True)
    NT = yy + yn + ny + nn
    numerator = NT*yy - (yy+yn)*(yy+ny)
    denominator = np.sqrt((yy+yn)*(yy+ny)*(nn+yn)*(nn+ny))
    return numerator / denominator


def calculate_jaccard_score(y_true=None, y_pred=None):
    """
    Jaccard score: ``yy / (yy + yn + ny)`` computed on the normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The Jaccard score
    """
    yy, yn, ny, _ = compute_confusion_matrix(y_true, y_pred, normalize=True)
    same_in_either = yy + yn + ny
    return yy / same_in_either


def calculate_kulczynski_score(y_true=None, y_pred=None):
    """
    Kulczynski score: the arithmetic mean of ``yy / (yy + ny)`` and ``yy / (yy + yn)``, computed
    on the normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The Kulczynski score
    """
    yy, yn, ny, _ = compute_confusion_matrix(y_true, y_pred, normalize=True)
    precision = yy / (yy + ny)
    recall = yy / (yy + yn)
    return 0.5 * (precision + recall)


def calculate_mc_nemar_score(y_true=None, y_pred=None):
    """
    McNemar score: ``(nn - ny) / sqrt(nn + ny)`` computed on the normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The McNemar score
    """
    _, _, ny, nn = compute_confusion_matrix(y_true, y_pred, normalize=True)
    difference = nn - ny
    return difference / np.sqrt(nn + ny)


def calculate_phi_score(y_true=None, y_pred=None, force_finite=True, finite_value=-1e10):
    """
    Phi score computed on the normalized pair counts:
    ``(yy*nn - yn*ny) / ((yy+yn)*(yy+ny)*(yn+nn)*(ny+nn))``.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample
        force_finite (bool): Return ``finite_value`` instead of raising when there is only 1 cluster
        finite_value (float): The value returned in that case

    Returns:
        The Phi score

    Raises:
        ValueError: If ``y_pred`` has only 1 cluster and ``force_finite`` is False
    """
    if len(np.unique(y_pred)) == 1:
        if not force_finite:
            raise ValueError("The Phi score is undefined when y_pred has only 1 cluster.")
        return finite_value
    yy, yn, ny, nn = compute_confusion_matrix(y_true, y_pred, normalize=True)
    cross_difference = yy * nn - yn * ny
    marginal_product = (yy + yn) * (yy + ny) * (yn + nn) * (ny + nn)
    return cross_difference / marginal_product


def calculate_rogers_tanimoto_score(y_true=None, y_pred=None):
    """
    Rogers-Tanimoto score: ``(yy + nn) / (yy + nn + 2 * (yn + ny))`` computed on the normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The Rogers-Tanimoto score
    """
    yy, yn, ny, nn = compute_confusion_matrix(y_true, y_pred, normalize=True)
    agreements = yy + nn
    return agreements / (agreements + 2 * (yn + ny))


def calculate_russel_rao_score(y_true=None, y_pred=None):
    """
    Russel-Rao score: ``yy / (yy + yn + ny + nn)`` computed on the normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The Russel-Rao score
    """
    yy, yn, ny, nn = compute_confusion_matrix(y_true, y_pred, normalize=True)
    all_pairs = yy + yn + ny + nn
    return yy / all_pairs


def calculate_sokal_sneath1_score(y_true=None, y_pred=None):
    """
    First Sokal-Sneath score: ``yy / (yy + 2 * (yn + ny))`` computed on the normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The Sokal-Sneath 1 score
    """
    yy, yn, ny, _ = compute_confusion_matrix(y_true, y_pred, normalize=True)
    disagreements = yn + ny
    return yy / (yy + 2 * disagreements)


def calculate_sokal_sneath2_score(y_true=None, y_pred=None):
    """
    Second Sokal-Sneath score: ``(yy + nn) / (yy + nn + 0.5 * (yn + ny))`` computed on the
    normalized pair counts.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted cluster label of every sample

    Returns:
        The Sokal-Sneath 2 score
    """
    yy, yn, ny, nn = compute_confusion_matrix(y_true, y_pred, normalize=True)
    agreements = yy + nn
    return agreements / (agreements + 0.5 * (yn + ny))


def calculate_purity_score(y_true=None, y_pred=None):
    """
    Compute the purity score as implemented here: for every true class, the size of the largest
    group of samples sharing one predicted label, summed over the classes and divided by the
    number of samples.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample (numeric; rounded to integers)

    Returns:
        The purity score
    """
    # Find the number of data points
    N = len(y_true)
    # Initialize the purity score
    purity = 0
    # Iterate over each unique class label
    for c in np.unique(y_true):
        # Predicted labels of the data points that carry the current class label
        class_predictions = np.round(y_pred[y_true == c]).astype(int)
        # Count the occurrences of each predicted label. np.unique is used instead of np.bincount
        # because bincount raises on negative labels (e.g. -1 used for noise points).
        _, class_counts = np.unique(class_predictions, return_counts=True)
        # Add the size of the majority class to the purity score
        purity += np.max(class_counts)
    # Normalize the purity score by dividing by the total number of data points
    return purity / N


def calculate_entropy_score(y_true=None, y_pred=None):
    """
    Compute the entropy score as implemented here: for every true class, the base-2 entropy of the
    predicted labels of its members, weighted by the share of the samples belonging to that class.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample (numeric; rounded to integers)

    Returns:
        The entropy score
    """
    # Find the number of data points
    N = len(y_true)
    result = 0
    # Iterate over each unique class label
    for c in np.unique(y_true):
        # Predicted labels of the data points that carry the current class label
        class_predictions = np.round(y_pred[y_true == c]).astype(int)
        # Count the occurrences of each predicted label. np.unique is used instead of np.bincount
        # because bincount raises on negative labels (e.g. -1 used for noise points); the
        # zero-count bins it would add contribute nothing to the entropy.
        _, class_counts = np.unique(class_predictions, return_counts=True)
        # Normalize the class counts by dividing by the total number of data points in the class
        class_distribution = class_counts / len(class_predictions)
        # Compute the entropy of the class
        cluster_entropy = calculate_entropy(class_distribution, base=2)
        # Weight the entropy by the relative size of the class
        result += (len(class_predictions) / N) * cluster_entropy
    return result


def compute_nd_splus_sminus_t(y_true=None, y_pred=None):
    """
    Count the concordant and discordant sample pairs between two labelings.

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample

    Returns:
        nd: The number of unordered sample pairs
        s_plus: The number of pairs on which both labelings agree (same group in both, or different in both)
        s_minus: ``nd - s_plus``
        t: The number of pairs placed in the same group by both labelings
    """
    n_samples = len(y_true)
    nd = n_samples * (n_samples - 1) / 2
    s_plus = 0.  # Number of concordant comparisons
    t = 0.  # Number of comparisons of two pairs of objects with same cluster labels
    for idx in range(n_samples - 1):
        # Compare sample idx with every later sample so each pair is visited exactly once
        same_true = y_true[idx] == y_true[idx + 1:]
        same_pred = y_pred[idx] == y_pred[idx + 1:]
        together_in_both = np.sum(same_true & same_pred)
        apart_in_both = np.sum((y_true[idx] != y_true[idx + 1:]) & (y_pred[idx] != y_pred[idx + 1:]))
        t += together_in_both
        s_plus += together_in_both
        s_plus += apart_in_both
    s_minus = nd - s_plus       # Number of discordant comparisons
    return nd, s_plus, s_minus, t


def calculate_tau_score(y_true=None, y_pred=None):
    """
    Tau score: ``(s_plus - s_minus) / sqrt((nd - t) * nd)`` from the concordant/discordant pair counts.
    Cluster Validation for Mixed-Type Data: Paper

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample

    Returns:
        The Tau score
    """
    nd, s_plus, s_minus, t = compute_nd_splus_sminus_t(y_true, y_pred)
    concordance_gap = s_plus - s_minus
    scale = np.sqrt((nd - t) * nd)
    return concordance_gap / scale


def calculate_gamma_score(y_true=None, y_pred=None):
    """
    Gamma score: ``(s_plus - s_minus) / (s_plus + s_minus)`` from the concordant/discordant pair counts.
    Cluster Validation for Mixed-Type Data: Paper

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample

    Returns:
        The Gamma score
    """
    _, s_plus, s_minus, _ = compute_nd_splus_sminus_t(y_true, y_pred)
    concordance_gap = s_plus - s_minus
    return concordance_gap / (s_plus + s_minus)


def calculate_gplus_score(y_true=None, y_pred=None):
    """
    G-plus score: the share of discordant pairs, ``s_minus / nd``.
    Cluster Validation for Mixed-Type Data: Paper

    Args:
        y_true (np.ndarray): The reference label of every sample
        y_pred (np.ndarray): The predicted label of every sample

    Returns:
        The G-plus score
    """
    pair_stats = compute_nd_splus_sminus_t(y_true, y_pred)
    nd, s_minus = pair_stats[0], pair_stats[2]
    return s_minus / nd