Source code for neurokit2.complexity.entropy_kl

import numpy as np
import pandas as pd
import scipy.spatial
import scipy.special

from .utils_complexity_embedding import complexity_embedding


def entropy_kl(signal, delay=1, dimension=2, norm="euclidean", **kwargs):
    """**Kozachenko-Leonenko (K-L) Differential entropy (KLEn)**

    The Kozachenko-Leonenko (K-L) estimate of the differential entropy is also referred to as the
    *nearest neighbor estimate* of entropy.

    Parameters
    ----------
    signal : Union[list, np.array, pd.Series]
        The signal (i.e., a time series) in the form of a vector of values.
    delay : int
        Time delay (often denoted *Tau* :math:`\\tau`, sometimes referred to as *lag*) in samples.
        See :func:`complexity_delay` to estimate the optimal value for this parameter.
    dimension : int
        Embedding Dimension (*m*, sometimes referred to as *d* or *order*). See
        :func:`complexity_dimension` to estimate the optimal value for this parameter.
    norm : str
        The distance norm used to compute the nearest-neighbour distances. Can be
        ``"euclidean"`` (default) or ``"max"``.
    **kwargs : optional
        Other arguments (not used for now).

    Returns
    -------
    klen : float
        The KL-entropy of the signal.
    info : dict
        A dictionary containing additional information regarding the parameters used
        to compute the differential entropy.

    See Also
    --------
    entropy_differential

    Examples
    --------
    .. ipython:: python

      import neurokit2 as nk

      # Simulate a Signal with Laplace Noise
      signal = nk.signal_simulate(duration=2, frequency=5, noise=0.1)

      # Compute Kozachenko-Leonenko (K-L) Entropy
      klen, info = nk.entropy_kl(signal, delay=1, dimension=3)
      klen

    References
    ----------
    * Gautama, T., Mandic, D. P., & Van Hulle, M. M. (2003, April). A differential entropy based
      method for determining the optimal embedding parameters of a signal. In 2003 IEEE
      International Conference on Acoustics, Speech, and Signal Processing, 2003. Proceedings.
      (ICASSP'03). (Vol. 6, pp. VI-29). IEEE.
    * Beirlant, J., Dudewicz, E. J., Györfi, L., & Van der Meulen, E. C. (1997). Nonparametric
      entropy estimation: An overview. International Journal of Mathematical and Statistical
      Sciences, 6(1), 17-39.
    * Kozachenko, L., & Leonenko, N. (1987). Sample estimate of the entropy of a random vector.
      Problemy Peredachi Informatsii, 23(2), 9-16.

    """
    # Sanity checks
    if isinstance(signal, (np.ndarray, pd.DataFrame)) and signal.ndim > 1:
        raise ValueError(
            "Multidimensional inputs (e.g., matrices or multichannel data) are not supported yet."
        )
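
    # Overview comment (added for clarity): the estimator computed below takes the general
    # k-nearest-neighbour form
    #     H ≈ (d / n) * sum_j log(2 * eps_j) - psi(k) + psi(n) + log(c_d)
    # where eps_j is the distance from the j-th embedded vector to its k-th nearest neighbour,
    # psi is the digamma function, and c_d is the unit-ball volume constant of the chosen norm.
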
    info = {"Dimension": dimension, "Delay": delay}

    # Time delay embedding
    embedded = complexity_embedding(signal, delay=delay, dimension=dimension)
    n, d = embedded.shape

    # Get distance to nearest neighbor for each delay vector
    # -------------------------------------------------------
    # Using cKDTree is a much faster version than:
    #     # Euclidean distance between vectors
    #     dist = sklearn.metrics.DistanceMetric.get_metric(norm)
    #     dist = dist.pairwise(embedded)
    #     # Enforce non-zero
    #     dist[np.isclose(dist, 0)] = np.nan
    #     # pj is the Euclidean distance of the j-th delay vector to its nearest neighbor
    #     nearest = np.nanmin(dist, axis=1)
    if norm == "max":  # max norm
        p = np.inf
        log_c_d = 0  # (log of the) volume of the d-dimensional unit ball
    elif norm == "euclidean":  # euclidean norm
        p = 2
        log_c_d = (d / 2.0) * np.log(np.pi) - np.log(scipy.special.gamma(d / 2.0 + 1))
    else:
        raise ValueError("'norm' not recognized.")

    kdtree = scipy.spatial.cKDTree(embedded)

    # Query all points -- k + 1 because the query point itself is also in the initial set
    k = 1  # We want the first nearest neighbour (index 0 would be the point itself, at distance 0)
    nearest, _ = kdtree.query(embedded, k + 1, eps=0, p=p)
    nearest = nearest[:, -1]

    # Enforce non-zero distances
    nearest = nearest[nearest > 0]

    # Compute entropy H
    # -------------------------------------------------------
    # (In Gautama (2003), it is not divided by n, but it is in Beirlant (1997))
    # (the *2 is because 2 * radius = diameter)
    klen = np.sum(np.log(n * 2 * nearest) + np.log(2) + np.euler_gamma) / n

    # The above is what I understand from Gautama (2003)'s equation, but empirically the
    # following seems more accurate. If someone could clarify / confirm whether it is the
    # correct way (or not), that would be great (I don't fully understand the code below).
    # It was used in https://github.com/paulbrodersen/entropy_estimators/continuous.py
    sum_dist = np.sum(np.log(2 * nearest))
    klen = sum_dist * (d / n) - scipy.special.digamma(k) + scipy.special.digamma(n) + log_c_d

    return klen, info
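

# ----------------------------------------------------------------------------------------------
# Informal sanity check (a minimal sketch, not part of the module; it assumes NeuroKit2 and NumPy
# are installed). For i.i.d. standard Gaussian noise, the 2-dimensional delay embedding is
# approximately bivariate standard normal, whose differential entropy is log(2 * pi * e) ≈ 2.84
# nats, so the estimate should land in that neighbourhood for a sufficiently long signal:
#
#     import numpy as np
#     import neurokit2 as nk
#
#     rng = np.random.default_rng(42)
#     noise = rng.normal(size=10_000)
#     klen, info = nk.entropy_kl(noise, delay=1, dimension=2)
#     print(klen)  # expected to be roughly 2.8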