Source code for neurokit2.complexity.optim_complexity_tolerance

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import scipy.spatial
import sklearn.neighbors

from ..stats import density
from .utils_complexity_embedding import complexity_embedding
from .utils_entropy import _entropy_apen


def complexity_tolerance(
    signal, method="maxApEn", r_range=None, delay=None, dimension=None, show=False
):
    """**Automated selection of tolerance (r)**

    Estimate and select the optimal tolerance (*r*) parameter used by entropy and other
    complexity algorithms.

    Many complexity algorithms are built on the notion of self-similarity and recurrence, and
    how often a system revisits its past states. Considering two states as identical is
    straightforward for discrete systems (e.g., a sequence of ``"A"``, ``"B"`` and ``"C"``
    states), but for continuous signals, we cannot simply look for when two values are exactly
    the same. Instead, we have to pick a threshold by which to consider two points as similar.

    The tolerance *r* is essentially this threshold value (the numerical difference between two
    similar points that we "tolerate"). This parameter has a critical impact and is a major
    source of inconsistencies in the literature. Different methods have been described to
    estimate the most appropriate tolerance value:

    * **maxApEn**: Different values of tolerance will be tested and the one where the
      approximate entropy (ApEn) is maximized will be selected and returned (Chen, 2008).
    * **recurrence**: The tolerance that yields a recurrence rate (see ``RQA``) close to 1% will
      be returned. Note that this method is currently not suited for very long signals, as it is
      based on a recurrence matrix whose size is on the order of n^2. Help is needed to address
      this limitation.
    * **neighbours**: The tolerance that yields a number of nearest neighbours (NN) close to 2%
      will be returned.

    As these methods are computationally expensive, other fast heuristics are available:

    * **sd**: r = 0.2 * standard deviation (SD) of the signal will be returned. This is the most
      commonly used value in the literature, though its appropriateness is questionable.
    * **makowski**: Adjusted value based on the SD, the embedding dimension and the signal's
      length. See our `study <https://github.com/DominiqueMakowski/ComplexityTolerance>`_.
    * **nolds**: Adjusted value based on the SD and the dimension. The rationale is that the
      Chebyshev distance (used in various metrics) rises logarithmically with increasing
      dimension. ``0.5627 * np.log(dimension) + 1.3334`` is the logarithmic trend line for the
      Chebyshev distance of vectors sampled from a univariate normal distribution. A constant
      of ``0.1164`` is used so that ``tolerance = 0.2 * SD`` for ``dimension = 2`` (originally
      in https://github.com/CSchoel/nolds).
    * **singh2016**: Makes a histogram of the Chebyshev distance matrix and returns the upper
      bound of the modal bin.
    * **chon2009**: Acknowledging that computing multiple ApEns is computationally expensive,
      Chon (2009) suggested an approximation based on a heuristic algorithm that takes into
      account the length of the signal, its short-term and long-term variability, and the
      embedding dimension *m*. Initially defined only for *m* in [2-7], we expanded this to
      work with any value of *m* (though the accuracy is not guaranteed beyond *m* = 4).

    Parameters
    ----------
    signal : Union[list, np.array, pd.Series]
        The signal (i.e., a time series) in the form of a vector of values.
    method : str
        Can be ``"maxApEn"`` (default), ``"sd"``, ``"recurrence"``, ``"neighbours"``,
        ``"nolds"``, ``"chon2009"``, ``"singh2016"``, or ``"makowski"``.
    r_range : Union[list, int]
        The range of tolerance values (or the number of values) to test. Only used if ``method``
        is ``"maxApEn"`` or ``"recurrence"``. If ``None`` (default), the default range will be
        used: ``np.linspace(0.02, 0.8, 40) * np.std(signal, ddof=1)`` for ``"maxApEn"``, and
        ``np.linspace(0, np.max(d), 50 + 1)[1:]`` for ``"recurrence"``. You can set a lower
        number for faster results.
    delay : int
        Only used by the methods that require a time-delay embedding (e.g., ``"maxApEn"``).
        See :func:`entropy_approximate`.
    dimension : int
        Only used by the methods that require an embedding dimension (e.g., ``"maxApEn"``).
        See :func:`entropy_approximate`.
    show : bool
        If ``True``, will plot the optimization procedure (e.g., the ApEn values for each
        tested value of r).

    See Also
    --------
    complexity, complexity_delay, complexity_dimension, complexity_embedding

    Returns
    -------
    float
        The optimal tolerance value.
    dict
        A dictionary containing additional information.

    Examples
    --------
    * **Example 1**: The method based on the SD of the signal is fast. The plot shows the
      distribution of the values of the signal, and the width of the arrow represents the
      chosen ``r`` parameter.

    .. ipython:: python

      import neurokit2 as nk

      # Simulate signal
      signal = nk.signal_simulate(duration=2, frequency=[5, 7, 9, 12, 15])

      # Fast method (based on the standard deviation)
      @savefig p_complexity_tolerance1.png scale=100%
      r, info = nk.complexity_tolerance(signal, method="sd", show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    The dimension can be taken into account:

    .. ipython:: python

      # nolds method
      @savefig p_complexity_tolerance2.png scale=100%
      r, info = nk.complexity_tolerance(signal, method="nolds", dimension=3, show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    * **Example 2**: The method based on the recurrence rate will display the rates according
      to different values of tolerance. The horizontal line indicates the target rate (the
      tolerance yielding a recurrence rate closest to 1% is selected).

    .. ipython:: python

      @savefig p_complexity_tolerance3.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=1, dimension=10,
                                        method="recurrence", show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    An alternative, better suited for long signals, is to use nearest neighbours.

    .. ipython:: python

      @savefig p_complexity_tolerance4.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=1, dimension=10,
                                        method="neighbours", show=True)
      @suppress
      plt.close()

    Another option is to use the density of distances.

    .. ipython:: python

      @savefig p_complexity_tolerance5.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=1, dimension=3,
                                        method="bin", show=True)
      @suppress
      plt.close()

    * **Example 3**: The default method selects the tolerance at which *ApEn* is maximized.

    .. ipython:: python

      # Slow method
      @savefig p_complexity_tolerance6.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=8, dimension=6,
                                        method="maxApEn", show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    * **Example 4**: The tolerance values that are tested can be modified to get a more precise
      estimate.

    .. ipython:: python

      # Narrower range
      @savefig p_complexity_tolerance7.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=8, dimension=6, method="maxApEn",
                                        r_range=np.linspace(0.002, 0.8, 30), show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    References
    ----------
    * Chon, K. H., Scully, C. G., & Lu, S. (2009). Approximate entropy for all signals. IEEE
      Engineering in Medicine and Biology Magazine, 28(6), 18-23.
    * Lu, S., Chen, X., Kanters, J. K., Solomon, I. C., & Chon, K. H. (2008). Automatic
      selection of the threshold value r for approximate entropy. IEEE Transactions on
      Biomedical Engineering, 55(8), 1966-1972.
    * Chen, X., Solomon, I. C., & Chon, K. H. (2008). Parameter selection criteria in
      approximate entropy and sample entropy with application to neural respiratory signals.
      American Journal of Physiology: Regulatory, Integrative and Comparative Physiology.
    * Singh, A., Saini, B. S., & Singh, D. (2016). An alternative approach to approximate
      entropy threshold value (r) selection: application to heart rate variability and systolic
      blood pressure variability under postural challenge. Medical & Biological Engineering &
      Computing, 54(5), 723-732.

    """
    if not isinstance(method, str):
        return method, {"Method": "None"}

    # Method
    method = method.lower()
    if method in ["traditional", "sd", "std", "default"]:
        r = 0.2 * np.std(signal, ddof=1)
        info = {"Method": "20% SD"}

    elif method in ["adjusted_sd", "nolds"] and (
        isinstance(dimension, (int, float)) or dimension is None
    ):
        if dimension is None:
            raise ValueError("'dimension' cannot be empty for the 'nolds' method.")
        r = (
            0.11604738531196232
            * np.std(signal, ddof=1)
            * (0.5627 * np.log(dimension) + 1.3334)
        )
        info = {"Method": "Adjusted 20% SD"}

    elif method in ["chon", "chon2009"] and (
        isinstance(dimension, (int, float)) or dimension is None
    ):
        if dimension is None:
            raise ValueError("'dimension' cannot be empty for the 'chon2009' method.")
        sd1 = np.std(np.diff(signal), ddof=1)  # Short-term variability
        sd2 = np.std(signal, ddof=1)  # Long-term variability of the signal

        # Here are the formulas from Chon (2009):
        # For m=2: r = (-0.036 + 0.26 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=3: r = (-0.08 + 0.46 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=4: r = (-0.12 + 0.62 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=5: r = (-0.16 + 0.78 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=6: r = (-0.19 + 0.91 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=7: r = (-0.2 + 1 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        if 2 <= dimension <= 7:
            x = [-0.036, -0.08, -0.12, -0.16, -0.19, -0.2][dimension - 2]
            y = [0.26, 0.46, 0.62, 0.78, 0.91, 1][dimension - 2]
        else:
            # Extrapolate the two coefficients, x and y, beyond the tabulated dimensions:
            # np.polyfit(np.log([2, 3, 4, 5, 6, 7]), [-0.036, -0.08, -0.12, -0.16, -0.19, -0.2], 1)
            # np.polyfit([2, 3, 4, 5, 6, 7], [0.26, 0.46, 0.62, 0.78, 0.91, 1], 1)
            x = -0.034 * dimension + 0.022
            y = 0.14885714 * dimension - 0.00180952
        r = (x + y * np.sqrt(sd1 / sd2)) / (len(signal) / 1000) ** (1 / 4)
        info = {"Method": "Chon (2009)"}

    elif method in ["neurokit", "makowski"] and (
        isinstance(dimension, (int, float)) or dimension is None
    ):
        # Method described in https://github.com/DominiqueMakowski/ComplexityTolerance
        if dimension is None:
            raise ValueError("'dimension' cannot be empty for the 'makowski' method.")
        n = len(signal)
        r = np.std(signal, ddof=1) * (
            0.2811 * (dimension - 1)
            + 0.0049 * np.log(n)
            - 0.02 * ((dimension - 1) * np.log(n))
        )
        info = {"Method": "Makowski"}

    elif method in ["maxapen", "optimize"]:
        r, info = _optimize_tolerance_maxapen(
            signal, r_range=r_range, delay=delay, dimension=dimension
        )
        info.update({"Method": "Max ApEn"})

    elif method in ["recurrence", "rqa"]:
        r, info = _optimize_tolerance_recurrence(
            signal, r_range=r_range, delay=delay, dimension=dimension
        )
        info.update({"Method": "1% Recurrence Rate"})

    elif method in ["neighbours", "neighbors", "nn"]:
        r, info = _optimize_tolerance_neighbours(
            signal, r_range=r_range, delay=delay, dimension=dimension
        )
        info.update({"Method": "2% Neighbours"})

    elif method in ["bin", "bins", "singh", "singh2016"]:
        r, info = _optimize_tolerance_bin(signal, delay=delay, dimension=dimension)
        info.update({"Method": "bin"})

    else:
        raise ValueError("NeuroKit error: complexity_tolerance(): 'method' not recognized.")

    if show is True:
        _optimize_tolerance_plot(r, info, method=method, signal=signal)

    return r, info

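
# -----------------------------------------------------------------------------
# Illustration: a minimal, self-contained sketch of the fast heuristics above
# ("sd", "nolds", and "chon2009" for m=2), computed by hand.
# `_demo_fast_heuristics` is a hypothetical helper (not part of NeuroKit2's
# API); it only mirrors the formulas implemented in `complexity_tolerance`.
# -----------------------------------------------------------------------------
def _demo_fast_heuristics(signal, dimension=2):
    signal = np.asarray(signal)
    sd = np.std(signal, ddof=1)

    # "sd": the classic 20% of the standard deviation
    r_sd = 0.2 * sd

    # "nolds": SD scaled by the logarithmic growth of the Chebyshev distance
    r_nolds = 0.11604738531196232 * sd * (0.5627 * np.log(dimension) + 1.3334)

    # "chon2009" (m=2): combines short-term (sd1) and long-term (sd) variability,
    # normalized by the signal length
    sd1 = np.std(np.diff(signal), ddof=1)
    r_chon = (-0.036 + 0.26 * np.sqrt(sd1 / sd)) / (len(signal) / 1000) ** (1 / 4)

    return {"sd": r_sd, "nolds": r_nolds, "chon2009": r_chon}
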
# =============================================================================
# Internals
# =============================================================================
def _optimize_tolerance_recurrence(signal, r_range=None, delay=None, dimension=None):
    # Optimize missing parameters
    if delay is None or dimension is None:
        raise ValueError("If method='recurrence', both delay and dimension must be specified.")

    # Compute distance matrix
    emb = complexity_embedding(signal, delay=delay, dimension=dimension)
    d = scipy.spatial.distance.cdist(emb, emb, metric="euclidean")

    if r_range is None:
        r_range = 50
    if isinstance(r_range, int):
        r_range = np.linspace(0, np.max(d), r_range + 1)[1:]

    recurrence_rate = np.zeros_like(r_range)
    # Indices of the lower triangular (without the diagonal)
    idx = np.tril_indices(len(d), k=-1)
    n = len(d[idx])
    for i, r in enumerate(r_range):
        recurrence_rate[i] = (d[idx] <= r).sum() / n

    # Closest to 0.01 (1%)
    optimal = r_range[np.abs(recurrence_rate - 0.01).argmin()]
    return optimal, {"Values": r_range, "Scores": recurrence_rate}


def _optimize_tolerance_maxapen(signal, r_range=None, delay=None, dimension=None):
    # Optimize missing parameters
    if delay is None or dimension is None:
        raise ValueError("If method='maxApEn', both delay and dimension must be specified.")

    if r_range is None:
        r_range = 40
    if isinstance(r_range, int):
        r_range = np.linspace(0.02, 0.8, r_range) * np.std(signal, ddof=1)

    apens = np.zeros_like(r_range)
    info = {"kdtree1": None, "kdtree2": None}
    for i, r in enumerate(r_range):
        apens[i], info = _entropy_apen(
            signal,
            delay=delay,
            dimension=dimension,
            tolerance=r,
            kdtree1=info["kdtree1"],
            kdtree2=info["kdtree2"],
        )
    # apens = [_entropy_apen(signal, delay=delay, dimension=dimension, tolerance=r) for r in r_range]

    return r_range[np.argmax(apens)], {"Values": r_range, "Scores": np.array(apens)}


def _optimize_tolerance_neighbours(signal, r_range=None, delay=None, dimension=None):
    if delay is None:
        delay = 1
    if dimension is None:
        dimension = 1

    if r_range is None:
        r_range = 50
    if isinstance(r_range, int):
        r_range = np.linspace(0.02, 0.8, r_range) * np.std(signal, ddof=1)

    embedded = complexity_embedding(signal, delay=delay, dimension=dimension)
    kdtree = sklearn.neighbors.KDTree(embedded, metric="chebyshev")
    counts = np.array(
        [
            np.mean(
                kdtree.query_radius(embedded, r, count_only=True).astype(np.float64)
                / embedded.shape[0]
            )
            for r in r_range
        ]
    )

    # Closest to 0.02 (2%)
    optimal = r_range[np.abs(counts - 0.02).argmin()]
    return optimal, {"Values": r_range, "Scores": counts}


def _optimize_tolerance_bin(signal, delay=None, dimension=None):
    # Optimize missing parameters
    if delay is None or dimension is None:
        raise ValueError("If method='bin', both delay and dimension must be specified.")

    # Compute distance matrix
    emb = complexity_embedding(signal, delay=delay, dimension=dimension)
    d = scipy.spatial.distance.cdist(emb, emb, metric="chebyshev")

    # Histogram of the lower triangular (without the diagonal)
    y, x = np.histogram(d[np.tril_indices(len(d), k=-1)], bins=200, density=True)

    # Most common distance. Divide by two because r corresponds to the radius of the circle
    # (NOTE: this is NOT in the paper and thus, opinion is required!)
    optimal = x[np.argmax(y) + 1] / 2
    return optimal, {"Values": x[1:] / 2, "Scores": y}

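
# -----------------------------------------------------------------------------
# Illustration: what the recurrence-rate criterion above measures, unrolled
# for a single tolerance value. `_demo_recurrence_rate` is a hypothetical
# helper (not part of NeuroKit2's API); `_optimize_tolerance_recurrence`
# simply scans many r values and keeps the one whose rate is closest to 1%.
# -----------------------------------------------------------------------------
def _demo_recurrence_rate(signal, r, delay=1, dimension=3):
    # Time-delay embedding: each row is a reconstructed state vector
    emb = complexity_embedding(signal, delay=delay, dimension=dimension)
    d = scipy.spatial.distance.cdist(emb, emb, metric="euclidean")
    # Proportion of distinct state pairs closer than r (lower triangle, no diagonal)
    pairs = d[np.tril_indices(len(d), k=-1)]
    return (pairs <= r).mean()
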
# =============================================================================
# Plotting
# =============================================================================
def _optimize_tolerance_plot(r, info, ax=None, method="maxApEn", signal=None):
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = None

    if method in [
        "traditional",
        "sd",
        "std",
        "default",
        "none",
        "adjusted_sd",
        "nolds",
        "chon",
        "chon2009",
    ]:
        x, y = density(signal)
        arrow_y = np.mean([np.max(y), np.min(y)])
        x_range = np.max(x) - np.min(x)
        ax.plot(x, y, color="#80059c", label="Optimal r: " + str(np.round(r, 3)))
        # Two arrows starting from the center of the distribution, each spanning r / 2,
        # so that their total width represents the tolerance r
        ax.arrow(
            np.mean(x),
            arrow_y,
            r / 2,
            0,
            head_width=0.01 * x_range,
            head_length=0.01 * x_range,
            linewidth=4,
            color="g",
            length_includes_head=True,
        )
        ax.arrow(
            np.mean(x),
            arrow_y,
            -r / 2,
            0,
            head_width=0.01 * x_range,
            head_length=0.01 * x_range,
            linewidth=4,
            color="g",
            length_includes_head=True,
        )
        ax.set_title("Optimization of Tolerance Threshold (r)")
        ax.set_xlabel("Signal values")
        ax.set_ylabel("Distribution")
        ax.legend(loc="upper right")
        return fig

    if method in ["bin", "bins", "singh", "singh2016"]:
        ax.set_title("Optimization of Tolerance Threshold (r)")
        ax.set_xlabel("Chebyshev Distance")
        ax.set_ylabel("Density")
        ax.plot(info["Values"], info["Scores"], color="#4CAF50")
        ax.axvline(x=r, color="#E91E63", label="Optimal r: " + str(np.round(r, 3)))
        ax.legend(loc="upper right")
        return fig

    r_range = info["Values"]
    y_values = info["Scores"]

    # Custom legend depending on the method
    if method in ["maxapen", "optimize"]:
        ylabel = "Approximate Entropy $ApEn$"
        legend = "$ApEn$"
    else:
        # Convert to percentage (copy instead of *= to avoid mutating info["Scores"])
        y_values = y_values * 100
        ax.axhline(y=0.5, color="grey")
        ax.yaxis.set_major_formatter(matplotlib.ticker.PercentFormatter())
        if method in ["neighbours", "neighbors", "nn"]:
            ylabel = "Nearest Neighbours"
            legend = "$NN$"
        else:
            ylabel = "Recurrence Rate $RR$"
            legend = "$RR$"

    ax.set_title("Optimization of Tolerance Threshold (r)")
    ax.set_xlabel("Tolerance threshold $r$")
    ax.set_ylabel(ylabel)
    ax.plot(r_range, y_values, "o-", label=legend, color="#80059c")
    ax.axvline(x=r, color="#E91E63", label="Optimal r: " + str(np.round(r, 3)))
    ax.legend(loc="upper right")
    return fig
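
# -----------------------------------------------------------------------------
# Illustration: a minimal usage sketch, assuming the package is installed and
# the module is run as a script (e.g., via
# `python -m neurokit2.complexity.optim_complexity_tolerance`). It compares
# the tolerance returned by a few methods on white noise.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    demo_signal = rng.standard_normal(500)
    for demo_method in ["sd", "nolds", "neighbours"]:
        demo_r, _ = complexity_tolerance(
            demo_signal, method=demo_method, delay=1, dimension=2
        )
        print(f"{demo_method}: r = {demo_r:.3f}")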