# Source code for neurokit2.complexity.optim_complexity_tolerance

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import scipy.spatial
import sklearn.neighbors

from ..stats import density
from .utils_complexity_embedding import complexity_embedding
from .utils_entropy import _entropy_apen

def complexity_tolerance(
    signal, method="maxApEn", r_range=None, delay=None, dimension=None, show=False
):
    """**Automated selection of tolerance (r)**

    Estimate and select the optimal tolerance (*r*) parameter used by other entropy and other
    complexity algorithms.

    Many complexity algorithms are built on the notion of self-similarity and recurrence, and how
    often a system revisits its past states. Considering two states as identical is straightforward
    for discrete systems (e.g., a sequence of ``"A"``, ``"B"`` and ``"C"`` states), but for
    continuous signals, we cannot simply look for when the two numbers are exactly the same.
    Instead, we have to pick a threshold by which to consider two points as similar.

    The tolerance *r* is essentially this threshold value (the numerical difference between two
    similar points that we "tolerate"). This parameter has a critical impact and is a major
    source of inconsistencies in the literature.

    Different methods have been described to estimate the most appropriate tolerance value:

    * **maxApEn**: Different values of tolerance will be tested and the one where the approximate
      entropy (ApEn) is maximized will be selected and returned (Chen, 2008).
    * **recurrence**: The tolerance that yields a recurrence rate (see ``RQA``) close to 1% will
      be returned. Note that this method is currently not suited for very long signals, as it is
      based on a recurrence matrix, which size is close to n^2. Help is needed to address this
      limitation.
    * **neighbours**: The tolerance that yields a number of nearest neighbours (NN) close to 2%
      will be returned.

    As these methods are computationally expensive, other fast heuristics are available:

    * **sd**: r = 0.2 * standard deviation (SD) of the signal will be returned. This is the most
      commonly used value in the literature, though its appropriateness is questionable.
    * **makowski**: Adjusted value based on the SD, the embedding dimension and the signal's
      length. See our `study <https://github.com/DominiqueMakowski/ComplexityTolerance>`_.
    * **nolds**: Adjusted value based on the SD and the dimension. The rationale is that
      the chebyshev distance (used in various metrics) rises logarithmically with increasing
      dimension. ``0.5627 * np.log(dimension) + 1.3334`` is the logarithmic trend line for the
      chebyshev distance of vectors sampled from a univariate normal distribution. A constant of
      ``0.1164`` is used so that ``tolerance = 0.2 * SDs`` for ``dimension = 2`` (originally in
      https://github.com/CSchoel/nolds).
    * **singh2016**: Makes a histogram of the Chebyshev distance matrix and returns the upper
      bound of the modal bin.
    * **chon2009**: Acknowledging that computing multiple ApEns is computationally expensive,
      Chon (2009) suggested an approximation based on a heuristic algorithm that takes into
      account the length of the signal, its short-term and long-term variability, and the
      embedding dimension *m*. Initially defined only for *m* in [2-7], we expanded this to work
      with any value of *m* (though the accuracy is not guaranteed beyond *m* = 4).

    Parameters
    ----------
    signal : Union[list, np.array, pd.Series]
        The signal (i.e., a time series) in the form of a vector of values.
    method : str
        Can be ``"maxApEn"`` (default), ``"sd"``, ``"recurrence"``, ``"neighbours"``,
        ``"nolds"``, ``"chon2009"``, or ``"neurokit"``. If not a string, it is returned
        unchanged (allowing a pre-computed tolerance to pass through).
    r_range : Union[list, int]
        The range of tolerance values (or the number of values) to test. Only used if ``method``
        is ``"maxApEn"`` or ``"recurrence"``. If ``None`` (default), the default range will be
        used; ``np.linspace(0.02, 0.8, r_range) * np.std(signal, ddof=1)`` for ``"maxApEn"``,
        and ``np.linspace(0, np.max(d), 30 + 1)[1:]`` for ``"recurrence"``. You can set a lower
        number for faster results.
    delay : int
        Only used if ``method="maxApEn"``. See :func:`entropy_approximate()`.
    dimension : int
        Only used if ``method="maxApEn"``. See :func:`entropy_approximate()`.
    show : bool
        If ``True`` and method is ``"maxApEn"``, will plot the ApEn values for each value of r.

    See Also
    --------
    complexity, complexity_delay, complexity_dimension, complexity_embedding

    Returns
    ----------
    float
        The optimal tolerance value.
    dict
        A dictionary containing additional information (e.g., the tested values and scores).

    Examples
    ----------
    * **Example 1**: The method based on the SD of the signal is fast. The plot shows the
      distribution of the values making the signal, and the width of the arrow represents the
      chosen ``r`` parameter.

    .. ipython:: python

      import neurokit2 as nk

      # Simulate signal
      signal = nk.signal_simulate(duration=2, frequency=[5, 7, 9, 12, 15])

      # Fast method (based on the standard deviation)
      @savefig p_complexity_tolerance1.png scale=100%
      r, info = nk.complexity_tolerance(signal, method = "sd", show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    The dimension can be taken into account:

    .. ipython:: python

      # nolds method
      @savefig p_complexity_tolerance2.png scale=100%
      r, info = nk.complexity_tolerance(signal, method = "nolds", dimension=3, show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    * **Example 2**: The method based on the recurrence rate will display the rates according to
      different values of tolerance.

    .. ipython:: python

      @savefig p_complexity_tolerance3.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=1, dimension=10,
                                        method = 'recurrence', show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    An alternative, better suited for long signals is to use nearest neighbours.

    .. ipython:: python

      @savefig p_complexity_tolerance4.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=1, dimension=10,
                                        method = 'neighbours', show=True)
      @suppress
      plt.close()

    Another option is to use the density of distances.

    .. ipython:: python

      @savefig p_complexity_tolerance5.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=1, dimension=3,
                                        method = 'bin', show=True)
      @suppress
      plt.close()

    * **Example 3**: The default method selects the tolerance at which *ApEn* is maximized.

    .. ipython:: python

      # Slow method
      @savefig p_complexity_tolerance6.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=8, dimension=6,
                                        method = 'maxApEn', show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    * **Example 4**: The tolerance values that are tested can be modified to get a more precise
      estimate.

    .. ipython:: python

      # Narrower range
      @savefig p_complexity_tolerance7.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=8, dimension=6, method = 'maxApEn',
                                        r_range=np.linspace(0.002, 0.8, 30), show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    References
    -----------
    * Chon, K. H., Scully, C. G., & Lu, S. (2009). Approximate entropy for all signals. IEEE
      engineering in medicine and biology magazine, 28(6), 18-23.
    * Lu, S., Chen, X., Kanters, J. K., Solomon, I. C., & Chon, K. H. (2008). Automatic selection
      of the threshold value r for approximate entropy. IEEE Transactions on Biomedical
      Engineering, 55(8), 1966-1972.
    * Chen, X., Solomon, I. C., & Chon, K. H. (2008). Parameter selection criteria in approximate
      entropy and sample entropy with application to neural respiratory signals. Am. J. Physiol.
      Regul. Integr. Comp. Physiol.
    * Singh, A., Saini, B. S., & Singh, D. (2016). An alternative approach to approximate entropy
      threshold value (r) selection: application to heart rate variability and systolic blood
      pressure variability under postural challenge. Medical & biological engineering &
      computing, 54(5), 723-732.

    """
    # A non-string method is treated as a pre-computed tolerance and passed through.
    if not isinstance(method, str):
        return method, {"Method": "None"}

    # Method
    method = method.lower()
    if method in ["traditional", "sd", "std", "default"]:
        # Classic heuristic: 20% of the sample standard deviation.
        r = 0.2 * np.std(signal, ddof=1)
        info = {"Method": "20% SD"}

    elif method in ["adjusted_sd", "nolds"] and (
        isinstance(dimension, (int, float)) or dimension is None
    ):
        if dimension is None:
            raise ValueError("'dimension' cannot be empty for the 'nolds' method.")
        # SD scaled by the logarithmic growth of the chebyshev distance with dimension.
        # The 0.11604... constant calibrates the result to 0.2 * SD at dimension = 2.
        r = (
            0.11604738531196232
            * np.std(signal, ddof=1)
            * (0.5627 * np.log(dimension) + 1.3334)
        )
        info = {"Method": "Adjusted 20% SD"}

    elif method in ["chon", "chon2009"] and (
        isinstance(dimension, (int, float)) or dimension is None
    ):
        if dimension is None:
            raise ValueError("'dimension' cannot be empty for the 'chon2009' method.")
        sd1 = np.std(np.diff(signal), ddof=1)  # short-term variability
        sd2 = np.std(signal, ddof=1)  # long-term variability of the signal

        # Here are the formulas from Chon (2009):
        # For m=2: r = (-0.036 + 0.26 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=3: r = (-0.08 + 0.46 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=4: r = (-0.12 + 0.62 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=5: r = (-0.16 + 0.78 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=6: r = (-0.19 + 0.91 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # For m=7: r = (-0.2 + 1 * sqrt(sd1/sd2)) / (len(signal) / 1000)**(1/4)
        # FIX: the condition used to be `dimension <= 2 and dimension <= 7`, which sent
        # m = 3..7 to the extrapolation branch and mis-indexed the table for m < 2.
        if 2 <= dimension <= 7:
            x = [-0.036, -0.08, -0.12, -0.16, -0.19, -0.2][dimension - 2]
            y = [0.26, 0.46, 0.62, 0.78, 0.91, 1][dimension - 2]
        else:
            # We need to extrapolate the 2 first numbers, x and y
            # np.polyfit(np.log([2,3,4, 5, 6, 7]), [-0.036, -0.08, -0.12, -0.16, -0.19, -0.2], 1)
            # np.polyfit([2,3,4, 5, 6, 7], [0.26, 0.46, 0.62, 0.78, 0.91, 1], 1)
            x = -0.034 * dimension + 0.022
            y = 0.14885714 * dimension - 0.00180952

        # FIX: exponent parenthesized — `** 1 / 4` divided by 4 instead of taking the
        # fourth root of (N / 1000) as per Chon (2009).
        r = (x + y * np.sqrt(sd1 / sd2)) / (len(signal) / 1000) ** (1 / 4)
        info = {"Method": "Chon (2009)"}

    elif method in ["neurokit", "makowski"] and (
        isinstance(dimension, (int, float)) or dimension is None
    ):
        # Method described in
        # https://github.com/DominiqueMakowski/ComplexityTolerance
        if dimension is None:
            raise ValueError("'dimension' cannot be empty for the 'makowski' method.")
        n = len(signal)
        r = np.std(signal, ddof=1) * (
            0.2811 * (dimension - 1)
            + 0.0049 * np.log(n)
            - 0.02 * ((dimension - 1) * np.log(n))
        )

        info = {"Method": "Makowski"}

    elif method in ["maxapen", "optimize"]:
        r, info = _optimize_tolerance_maxapen(
            signal, r_range=r_range, delay=delay, dimension=dimension
        )
        info.update({"Method": "Max ApEn"})

    elif method in ["recurrence", "rqa"]:
        r, info = _optimize_tolerance_recurrence(
            signal, r_range=r_range, delay=delay, dimension=dimension
        )
        info.update({"Method": "1% Recurrence Rate"})

    elif method in ["neighbours", "neighbors", "nn"]:
        r, info = _optimize_tolerance_neighbours(
            signal, r_range=r_range, delay=delay, dimension=dimension
        )
        info.update({"Method": "2% Neighbours"})

    elif method in ["bin", "bins", "singh", "singh2016"]:
        r, info = _optimize_tolerance_bin(signal, delay=delay, dimension=dimension)
        info.update({"Method": "bin"})

    else:
        raise ValueError(
            "NeuroKit error: complexity_tolerance(): 'method' not recognized."
        )

    if show is True:
        _optimize_tolerance_plot(r, info, method=method, signal=signal)
    return r, info

# =============================================================================
# Internals
# =============================================================================

def _optimize_tolerance_recurrence(signal, r_range=None, delay=None, dimension=None):
    """Find the tolerance yielding a recurrence rate closest to 1%.

    Builds the full (n x n) euclidean distance matrix of the time-delay
    embedded signal, sweeps candidate tolerances, and returns the one whose
    recurrence rate is nearest to 0.01.
    """
    # Both embedding parameters are required to build the recurrence matrix.
    if delay is None or dimension is None:
        raise ValueError(
            "If method='recurrence', both delay and dimension must be specified."
        )

    # Pairwise euclidean distances between embedded state vectors.
    embedded = complexity_embedding(signal, delay=delay, dimension=dimension)
    dist = scipy.spatial.distance.cdist(embedded, embedded, metric="euclidean")

    # Default: 50 candidate tolerances spread over (0, max distance].
    if r_range is None:
        r_range = 50
    if isinstance(r_range, int):
        r_range = np.linspace(0, np.max(dist), r_range + 1)[1:]

    # Use only the strict lower triangle: each pair once, diagonal excluded.
    lower = dist[np.tril_indices(len(dist), k=-1)]
    rates = np.zeros_like(r_range)
    for i, tol in enumerate(r_range):
        rates[i] = (lower <= tol).sum() / lower.size

    # Closest to 0.01 (1%)
    best = r_range[np.abs(rates - 0.01).argmin()]
    return best, {"Values": r_range, "Scores": rates}

def _optimize_tolerance_maxapen(signal, r_range=None, delay=None, dimension=None):
    """Find the tolerance at which the approximate entropy (ApEn) is maximal."""
    # Both embedding parameters are required to compute ApEn.
    if delay is None or dimension is None:
        raise ValueError(
            "If method='maxApEn', both delay and dimension must be specified."
        )

    # Default candidates: 2% to 80% of the signal's standard deviation.
    if r_range is None:
        r_range = 40
    if isinstance(r_range, int):
        r_range = np.linspace(0.02, 0.8, r_range) * np.std(signal, ddof=1)

    # Compute ApEn for each candidate tolerance, recycling the KD-trees
    # returned by _entropy_apen so they are not rebuilt on every iteration.
    scores = np.zeros_like(r_range)
    cache = {"kdtree1": None, "kdtree2": None}
    for i, tol in enumerate(r_range):
        scores[i], cache = _entropy_apen(
            signal,
            delay=delay,
            dimension=dimension,
            tolerance=tol,
            kdtree1=cache["kdtree1"],
            kdtree2=cache["kdtree2"],
        )

    # The tolerance maximizing ApEn is returned along with the full sweep.
    return r_range[np.argmax(scores)], {"Values": r_range, "Scores": np.array(scores)}

def _optimize_tolerance_neighbours(signal, r_range=None, delay=None, dimension=None):
    """Find the tolerance yielding a proportion of nearest neighbours closest to 2%.

    For each candidate tolerance, computes the average proportion of embedded
    points lying within that radius (Chebyshev distance) of each point, and
    returns the tolerance whose proportion is nearest to 0.02.
    """
    # Unlike the other search methods, missing embedding parameters default to 1.
    if delay is None:
        delay = 1
    if dimension is None:
        dimension = 1
    if r_range is None:
        r_range = 50
    if isinstance(r_range, int):
        r_range = np.linspace(0.02, 0.8, r_range) * np.std(signal, ddof=1)

    embedded = complexity_embedding(signal, delay=delay, dimension=dimension)

    kdtree = sklearn.neighbors.KDTree(embedded, metric="chebyshev")
    # FIX: the radius query was missing (the expression started with "/"),
    # making this function a syntax error. `query_radius(..., count_only=True)`
    # returns, per point, the number of points within radius r; dividing by the
    # number of points and averaging gives the mean neighbour proportion.
    counts = np.array(
        [
            np.mean(
                kdtree.query_radius(embedded, r, count_only=True)
                / embedded.shape[0]
            )
            for r in r_range
        ]
    )
    # Closest to 0.02 (2%)
    optimal = r_range[np.abs(counts - 0.02).argmin()]
    return optimal, {"Values": r_range, "Scores": counts}

def _optimize_tolerance_bin(signal, delay=None, dimension=None):
    """Tolerance from the modal bin of the Chebyshev distance histogram (Singh, 2016)."""
    # Both embedding parameters are required to build the distance matrix.
    if delay is None or dimension is None:
        raise ValueError("If method='bin', both delay and dimension must be specified.")

    # Pairwise chebyshev distances between embedded state vectors.
    embedded = complexity_embedding(signal, delay=delay, dimension=dimension)
    dist = scipy.spatial.distance.cdist(embedded, embedded, metric="chebyshev")

    # Histogram over the strict lower triangle (each pair once, no diagonal).
    heights, edges = np.histogram(
        dist[np.tril_indices(len(dist), k=-1)], bins=200, density=True
    )

    # Upper edge of the most populated bin, halved because r is treated as the
    # radius of the circle (NOTE: this halving is NOT in the paper and thus,
    # opinion is required!).
    optimal = edges[np.argmax(heights) + 1] / 2

    return optimal, {"Values": edges[1:] / 2, "Scores": heights}

# =============================================================================
# Plotting
# =============================================================================
def _optimize_tolerance_plot(r, info, ax=None, method="maxApEn", signal=None):
    """Visualize the tolerance-selection procedure.

    Parameters
    ----------
    r : float
        The selected tolerance value.
    info : dict
        Dictionary returned by the optimization routine; must contain
        ``"Values"`` and ``"Scores"`` for the search-based methods.
    ax : matplotlib.axes.Axes
        Axes to draw on. A new figure is created when ``None``.
    method : str
        The (lowercased) method name, which determines the plot type.
    signal : Union[list, np.array, pd.Series]
        The raw signal (only used by the SD-based heuristics).

    Returns
    -------
    fig : matplotlib.figure.Figure or None
        The created figure, or ``None`` when drawing onto a provided ``ax``.
    """
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = None

    # SD-based heuristics: show the distribution of signal values, with the
    # chosen r represented by the span of the green arrows.
    if method in [
        "sd",
        "std",
        "default",
        "none",
        "nolds",
        "chon",
        "chon2009",
    ]:
        x, y = density(signal)
        arrow_y = np.mean([np.max(y), np.min(y)])
        ax.plot(x, y, color="#80059c", label="Optimal r: " + str(np.round(r, 3)))
        ax.arrow(
            np.mean(x),
            arrow_y,
            np.mean(x) + r / 2,
            0,
            linewidth=4,
            color="g",
        )
        ax.arrow(
            np.mean(x),
            arrow_y,
            np.mean(x) - r / 2,
            0,
            linewidth=4,
            color="g",
        )
        ax.set_title("Optimization of Tolerance Threshold (r)")
        ax.set_xlabel("Signal values")
        ax.set_ylabel("Distribution")
        ax.legend(loc="upper right")
        return fig

    # Histogram-based method (Singh, 2016): density of chebyshev distances.
    if method in ["bin", "bins", "singh", "singh2016"]:
        ax.set_title("Optimization of Tolerance Threshold (r)")
        ax.set_xlabel("Chebyshev Distance")
        ax.set_ylabel("Density")
        ax.plot(info["Values"], info["Scores"], color="#4CAF50")
        ax.axvline(x=r, color="#E91E63", label="Optimal r: " + str(np.round(r, 3)))
        ax.legend(loc="upper right")

        return fig

    # Search-based methods: plot the score (ApEn, RR or NN) against r.
    r_range = info["Values"]
    y_values = info["Scores"]

    # Custom legend depending on method.
    # FIX: labels used "\$...\$" (literal backslash-dollar), which breaks
    # matplotlib's mathtext rendering; restored to plain "$...$".
    if method in ["maxapen", "optimize"]:
        ylabel = "Approximate Entropy $ApEn$"
        legend = "$ApEn$"
    else:
        y_values *= 100  # Convert to percentage (mutates info["Scores"] in place)
        ax.axhline(y=0.5, color="grey")
        ax.yaxis.set_major_formatter(matplotlib.ticker.PercentFormatter())
        if method in ["neighbours", "neighbors", "nn"]:
            ylabel = "Nearest Neighbours"
            legend = "$NN$"
        else:
            ylabel = "Recurrence Rate $RR$"
            legend = "$RR$"

    ax.set_title("Optimization of Tolerance Threshold (r)")
    ax.set_xlabel("Tolerance threshold $r$")
    ax.set_ylabel(ylabel)
    ax.plot(r_range, y_values, "o-", label=legend, color="#80059c")
    ax.axvline(x=r, color="#E91E63", label="Optimal r: " + str(np.round(r, 3)))
    ax.legend(loc="upper right")

    return fig