Source code for neurokit2.stats.cluster_findnumber

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

from .cluster import cluster
from .cluster_quality import cluster_quality


def cluster_findnumber(data, method="kmeans", n_max=10, show=False, **kwargs):
    """**Optimal Number of Clusters**

    Find the optimal number of clusters based on different indices of quality of fit.

    Parameters
    ----------
    data : np.ndarray or DataFrame
        The data to cluster. It is passed unchanged to :func:`.nk.cluster` and
        :func:`.nk.cluster_quality` (the example below uses the iris DataFrame).
    method : str
        The clustering algorithm to be passed into :func:`.nk.cluster`.
    n_max : int
        Exclusive upper bound on the number of clusters: the clustering algorithm is run in
        :func:`.nk.cluster` for every number of clusters from 1 to ``n_max - 1``, with quality
        metrics produced for each cluster number. Must be at least 2.
    show : bool
        Plot indices normalized on the same scale.
    **kwargs
        Other arguments to be passed into :func:`.nk.cluster` and :func:`.nk.cluster_quality`.

    Returns
    -------
    DataFrame
        The different quality scores for each number of clusters (one row per number of
        clusters, in increasing order):

        * Score_Silhouette
        * Score_Calinski
        * Score_Bouldin
        * Score_VarianceExplained
        * Score_GAP
        * Score_GAPmod
        * Score_GAP_diff
        * Score_GAPmod_diff

        The two ``*_diff`` columns of the last row are NaN, because they depend on the scores
        obtained for the next number of clusters.

    Raises
    ------
    ValueError
        If ``n_max`` is smaller than 2, as no clustering would be run.

    See Also
    --------
    cluster, cluster_quality

    Examples
    --------
    .. ipython:: python

      import neurokit2 as nk

      # Load the iris dataset
      data = nk.data("iris").drop("Species", axis=1)

      # How many clusters
      @savefig p_cluster_findnumber1.png scale=100%
      results = nk.cluster_findnumber(data, method="kmeans", show=True)
      @suppress
      plt.close()

    """
    # Without this guard, an empty range would only surface later as an opaque
    # "No objects to concatenate" ValueError raised by pd.concat().
    if n_max < 2:
        raise ValueError(
            "NeuroKit error: cluster_findnumber(): 'n_max' must be at least 2 "
            "(clustering is run for 1 to n_max - 1 clusters), got " + repr(n_max) + "."
        )

    results = []
    for i in range(1, n_max):
        # Cluster
        clustering, clusters, info = cluster(data, method=method, n_clusters=i, **kwargs)

        # Compute indices of clustering quality
        _, quality = cluster_quality(data, clustering, clusters, info, **kwargs)
        results.append(quality)
    results = pd.concat(results, axis=0).reset_index(drop=True)

    # Gap Diff: shift(-1) aligns each row with the values of the *next* number of clusters,
    # so each row holds GAP(k) - GAP(k + 1) + sk(k + 1).
    # NOTE(review): presumably the usual gap-statistic selection criterion - confirm.
    results["Score_GAP_diff"] = (
        results["Score_GAP"] - results["Score_GAP"].shift(-1) + results["Score_GAP_sk"].shift(-1)
    )
    results["Score_GAPmod_diff"] = (
        results["Score_GAPmod"]
        - results["Score_GAPmod"].shift(-1)
        + results["Score_GAPmod_sk"].shift(-1)
    )
    # The sk columns are only needed to compute the diffs above
    results = results.drop(["Score_GAP_sk", "Score_GAPmod_sk"], axis=1)

    if show is True:
        # Min-max scale every column so that all indices share the same y-axis
        normalized = (results - results.min()) / (results.max() - results.min())
        # Use the actual number of clusters (not a scaled value) as x-axis
        normalized["n_Clusters"] = np.rint(np.arange(1, n_max))
        normalized.columns = normalized.columns.str.replace("Score", "Normalized")
        normalized.plot(x="n_Clusters")
    return results