Source code for neurokit2.complexity.optim_complexity_dimension

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import scipy.spatial

from .fractal_correlation import fractal_correlation
from .utils_complexity_embedding import complexity_embedding



[docs]
def complexity_dimension(signal, delay=1, dimension_max=20, method="afnn", show=False, **kwargs):
    """**Automated selection of the optimal Embedding Dimension (m)**

    The Embedding Dimension (*m*, sometimes referred to as *d* or *order*) is the second
    critical parameter (the first being the :func:`delay <complexity_delay>` :math:`\\tau`)
    involved in the construction of the time-delay embedding of a signal. It corresponds to the
    number of delayed states (versions of the signals lagged by :math:`\\tau`) that we include in
    the embedding.

    Though one can commonly find values of 2 or 3 used in practice, several authors suggested
    different numerical methods to guide the choice of *m*:

    * **Correlation Dimension** (CD): One of the earliest method to estimate the optimal *m*
      was to calculate the :func:`correlation dimension <fractal_correlation>` for embeddings of
      various sizes and look for a saturation (i.e., a plateau) in its value as the embedding
      dimension increases. One of the limitation is that a saturation will also occur when there is
      not enough data to adequately fill the high-dimensional space (note that, in general, having
      such large embeddings that it significantly shortens the length of the signal is not
      recommended).
    * **FNN** (False Nearest Neighbour): The method, introduced by Kennel et al. (1992), is based
      on the assumption that two points that are near to each other in the sufficient embedding
      dimension should remain close as the dimension increases. The algorithm checks the neighbours
      in increasing embedding dimensions until it finds only a negligible number of false
      neighbours when going from dimension :math:`m` to :math:`m+1`. This corresponds to the lowest
      embedding dimension, which is presumed to give an unfolded space-state reconstruction. This
      method can fail in noisy signals due to the futile attempt of unfolding the noise (and in
      purely random signals, the amount of false neighbors does not substantially drops as *m*
      increases). The **figure** below show how projections to higher-dimensional spaces can be
      used to detect false nearest neighbours. For instance, the red and the yellow points are
      neighbours in the 1D space, but not in the 2D space.

    .. figure:: ../img/douglas2022b.png
       :alt: Illustration of FNN (Douglas et al., 2022).

    * **AFN** (Average False Neighbors): This modification by Cao (1997) of the FNN method
      addresses one of its main drawback, the need for a heuristic choice for the tolerance
      thresholds *r*. It uses the maximal Euclidian distance to represent nearest neighbors, and
      averages all ratios of the distance in :math:`m+1` to :math:`m` dimension and defines *E1* and
      *E2* as parameters. The optimal dimension corresponds to when *E1* stops changing (reaches a
      plateau). E1 reaches a plateau at a dimension *d0* if the signal comes from an attractor.
      Then *d0*+1 is the optimal minimum embedding dimension. *E2* is a useful quantity to
      distinguish deterministic signals from stochastic signals. A constant *E2* close to 1 for any
      embedding dimension *d* suggests random data, since the future values are independent of the
      past values.

    Parameters
    ----------
    signal : Union[list, np.array, pd.Series]
        The signal (i.e., a time series) in the form of a vector of values.
    delay : int
        Time delay (often denoted Tau :math:`\\tau`, sometimes referred to as Lag) in samples.
        See :func:`complexity_delay()` to choose the optimal value for this parameter.
    dimension_max : int
        The maximum embedding dimension to test.
    method : str
        Can be ``"afn"`` (Average False Neighbor), ``"fnn"`` (False Nearest Neighbour), or ``"cd"``
        (Correlation Dimension).
    show : bool
        Visualize the result.
    **kwargs
        Other arguments, such as ``R=10.0`` or ``A=2.0`` (relative and absolute tolerance, only for
        ``'fnn'`` method).

    Returns
    -------
    dimension : int
        Optimal embedding dimension.
    parameters : dict
        A dictionary containing additional information regarding the parameters used
        to compute the optimal dimension.

    See Also
    ------------
    complexity, complexity_dimension, complexity_delay, complexity_tolerance

    Examples
    ---------
    .. ipython:: python

      import neurokit2 as nk

      signal = nk.signal_simulate(duration=2, frequency=[5, 7, 8], noise=0.01)

      # Correlation Dimension
      @savefig p_complexity_dimension1.png scale=100%
      optimal_dimension, info = nk.complexity_dimension(signal,
                                                        delay=20,
                                                        dimension_max=10,
                                                        method='cd',
                                                        show=True)
      @suppress
      plt.close()

    .. ipython:: python

      # FNN
      @savefig p_complexity_dimension2.png scale=100%
      optimal_dimension, info = nk.complexity_dimension(signal,
                                                        delay=20,
                                                        dimension_max=20,
                                                        method='fnn',
                                                        show=True)
      @suppress
      plt.close()

    .. ipython:: python

      # AFNN
      @savefig p_complexity_dimension3.png scale=100%
      optimal_dimension, info = nk.complexity_dimension(signal,
                                                        delay=20,
                                                        dimension_max=20,
                                                        method='afnn',
                                                        show=True)
      @suppress
      plt.close()


    References
    -----------
    * Kennel, M. B., Brown, R., & Abarbanel, H. D. (1992). Determining embedding dimension for
      phase-space reconstruction using a geometrical construction. Physical review A, 45(6), 3403.
    * Cao, L. (1997). Practical method for determining the minimum embedding dimension of a scalar
      time series. Physica D: Nonlinear Phenomena, 110(1-2), 43-50.
    * Rhodes, C., & Morari, M. (1997). The false nearest neighbors algorithm: An overview.
      Computers & Chemical Engineering, 21, S1149-S1154.
    * Krakovská, A., Mezeiová, K., & Budáčová, H. (2015). Use of false nearest neighbours for
      selecting variables and embedding parameters for state space reconstruction. Journal of
      Complex Systems, 2015.
    * Gautama, T., Mandic, D. P., & Van Hulle, M. M. (2003, April). A differential entropy based
      method for determining the optimal embedding parameters of a signal. In 2003 IEEE
      International Conference on Acoustics, Speech, and Signal Processing, 2003. Proceedings.
      (ICASSP'03). (Vol. 6, pp. VI-29). IEEE.

    """
    # Initialize vectors
    if isinstance(dimension_max, int):
        dimension_seq = np.arange(1, dimension_max + 1)
    else:
        dimension_seq = np.array(dimension_max)

    # Method
    method = method.lower()
    if method in ["afnn", "afn"]:
        # Append value (as it gets cropped afterwards anyway)
        dimension_seq = np.append(dimension_seq, [dimension_seq[-1] + 1])
        E, Es = _embedding_dimension_afn(signal, dimension_seq=dimension_seq, delay=delay, **kwargs)
        E1 = E[1:] / E[:-1]
        E2 = Es[1:] / Es[:-1]

        # To find where E1 saturates, set a threshold of difference
        # threshold = 0.1 * (np.max(E1) - np.min(E1))
        min_dimension = [i for i, x in enumerate(E1 >= 0.85 * np.nanmax(E1)) if x][0] + 1

        # To standardize the length of dimension_seq with E1 and E2
        dimension_seq = dimension_seq[:-1]

        # Store information
        info = {"Method": method, "Values": dimension_seq, "E1": E1, "E2": E2}

        if show is True:
            _embedding_dimension_plot(
                method=method,
                dimension_seq=dimension_seq,
                min_dimension=min_dimension,
                E1=E1,
                E2=E2,
            )

    elif method in ["fnn"]:
        f1, f2, f3 = _embedding_dimension_ffn(signal, dimension_seq=dimension_seq, delay=delay, **kwargs)

        min_dimension = [i for i, x in enumerate(f3 <= 1.85 * np.min(f3[np.nonzero(f3)])) if x][0]

        # Store information
        info = {"Method": method, "Values": dimension_seq, "f1": f1, "f2": f2, "f3": f3}

        if show is True:
            _embedding_dimension_plot(
                method=method,
                dimension_seq=dimension_seq,
                min_dimension=min_dimension,
                f1=f1,
                f2=f2,
                f3=f3,
            )

    elif method in ["correlation", "cd"]:
        CDs = _embedding_dimension_correlation(signal, dimension_seq, delay=delay, **kwargs)

        # Find elbow (TODO: replace by better method of elbow localization)
        min_dimension = dimension_seq[np.where(CDs >= 0.66 * np.max(CDs))[0][0]]

        # Store information
        info = {"Method": method, "Values": dimension_seq, "CD": CDs}

        if show is True:
            _embedding_dimension_plot(
                method=method,
                dimension_seq=dimension_seq,
                min_dimension=min_dimension,
                CD=CDs,
            )

    else:
        raise ValueError("NeuroKit error: complexity_dimension(): 'method' not recognized.")

    return min_dimension, info



# =============================================================================
# Methods
# =============================================================================
def _embedding_dimension_correlation(signal, dimension_seq, delay=1, **kwargs):
    """Return the Correlation Dimension (CD) for a all d in dimension_seq."""
    CDs = np.zeros(len(dimension_seq))
    for i, d in enumerate(dimension_seq):
        CDs[i] = fractal_correlation(signal, dimension=d, delay=delay, **kwargs)[0]

    return CDs


def _embedding_dimension_afn(signal, dimension_seq, delay=1, **kwargs):
    """AFN."""
    values = np.asarray(
        [_embedding_dimension_afn_d(signal, dimension, delay, **kwargs) for dimension in dimension_seq]
    ).T
    E, Es = values[0, :], values[1, :]

    return E, Es


def _embedding_dimension_afn_d(signal, dimension, delay=1, metric="chebyshev", window=10, maxnum=None, **kwargs):
    """Returns E(d) and E^*(d) for the AFN method for a single d.

    E(d) and E^*(d) will be used to calculate E1(d) and E2(d).
    E1(d) = E(d + 1)/E(d).
    E2(d) = E*(d + 1)/E*(d).

    """
    d, dist, index, y2 = _embedding_dimension_d(signal, dimension, delay, metric, window, maxnum)

    # Compute the ratio of near-neighbor distances in d + 1 over d dimension
    # Its average is E(d)
    if any(d == 0) or any(dist == 0):
        E = np.nan
        Es = np.nan
    else:
        E = np.mean(d / dist)
        # Calculate E^*(d)
        Es = np.mean(np.abs(y2[:, -1] - y2[index, -1]))

    return E, Es


def _embedding_dimension_ffn(signal, dimension_seq, delay=1, R=10.0, A=2.0, **kwargs):
    """Compute the fraction of false nearest neighbors.

    The false nearest neighbors (FNN) method described by Kennel et al.
    (1992) to calculate the minimum embedding dimension required to embed a scalar time series.

    Returns 3 vectors:

    - f1 : Fraction of neighbors classified as false by Test I.
    - f2 : Fraction of neighbors classified as false by Test II.
    - f3 : Fraction of neighbors classified as false by either Test I or Test II.

    """
    values = np.asarray(
        [_embedding_dimension_ffn_d(signal, dimension, delay, R=R, A=A, **kwargs) for dimension in dimension_seq]
    ).T
    f1, f2, f3 = values[0, :], values[1, :], values[2, :]

    return f1, f2, f3


def _embedding_dimension_ffn_d(signal, dimension, delay=1, R=10.0, A=2.0, metric="euclidean", window=10, maxnum=None):
    """Return fraction of false nearest neighbors for a single d."""
    d, dist, index, y2 = _embedding_dimension_d(signal, dimension, delay, metric, window, maxnum)

    # Find all potential false neighbors using Kennel et al.'s tests.
    dist[dist == 0] = np.nan  # assign nan to avoid divide by zero error in next line
    f1 = np.abs(y2[:, -1] - y2[index, -1]) / dist > R
    f2 = d / np.std(signal) > A
    f3 = f1 | f2

    return np.mean(f1), np.mean(f2), np.mean(f3)


# =============================================================================
# Internals
# =============================================================================
def _embedding_dimension_d(signal, dimension, delay=1, metric="chebyshev", window=10, maxnum=None):
    # We need to reduce the number of points in dimension d by tau
    # so that after reconstruction, there'll be equal number of points
    # at both dimension d as well as dimension d + 1.
    y1 = complexity_embedding(signal[:-delay], delay=delay, dimension=dimension)
    y2 = complexity_embedding(signal, delay=delay, dimension=dimension + 1)

    # Find near neighbors in dimension d.
    index, dist = _embedding_dimension_neighbors(y1, metric=metric, window=window, maxnum=maxnum)

    # Compute the near-neighbor distances in d + 1 dimension
    # TODO: is there a way to make this faster?
    d = [scipy.spatial.distance.chebyshev(i, j) for i, j in zip(y2, y2[index])]

    return np.asarray(d), dist, index, y2


def _embedding_dimension_neighbors(y, metric="chebyshev", window=0, maxnum=None, show=False):
    """Find nearest neighbors of all points in the given array. Finds the nearest neighbors of all points in the given
    array using SciPy's KDTree search.

    Parameters
    ----------
    y : ndarray
        embedded signal: N-dimensional array containing time-delayed vectors.
    delay : int
        Time delay (often denoted 'Tau', sometimes referred to as 'lag'). In practice, it is common
        to have a fixed time lag (corresponding for instance to the sampling rate; Gautama, 2003),
        or to find a suitable value using some algorithmic heuristics (see ``delay_optimal()``).
    dimension_max : int
        The maximum embedding dimension (often denoted 'm' or 'd', sometimes referred to as 'order')
        to test.
    metric : str
        Metric to use for distance computation.  Must be one of "cityblock" (aka the Manhattan metric),
        "chebyshev" (aka the maximum norm metric), or "euclidean". Defaults to 'chebyshev'.
    window : int
        Minimum temporal separation (Theiler window) that should exist between near neighbors.
        This is crucial while computing Lyapunov exponents and the correlation dimension. Defaults to 0.
    maxnum : int
        Maximum number of near neighbors that should be found for each point.
        In rare cases, when there are no neighbors that are at a nonzero distance, this will have to
        be increased (i.e., beyond 2 * window + 3). Defaults to None (optimum).
    show : bool
        Defaults to False.

    Returns
    -------
    index : array
        Array containing indices of near neighbors.
    dist : array
        Array containing near neighbor distances.

    """
    if metric == "chebyshev":
        p = np.inf
    elif metric == "cityblock":
        p = 1
    elif metric == "euclidean":
        p = 2
    else:
        raise ValueError('Unknown metric. Should be one of "cityblock", ' '"euclidean", or "chebyshev".')

    tree = scipy.spatial.cKDTree(y)  # pylint: disable=E1102
    n = len(y)

    if not maxnum:
        maxnum = (window + 1) + 1 + (window + 1)
    else:
        maxnum = max(1, maxnum)

    if maxnum >= n:
        raise ValueError("maxnum is bigger than array length.")

    # Query for k numbers of nearest neighbors
    distances, indices = tree.query(y, k=range(1, maxnum + 2), p=p)

    # Substract the first point
    valid = indices - np.tile(np.arange(n), (indices.shape[1], 1)).T

    # Remove points that are closer than min temporal separation
    valid = np.abs(valid) > window

    # Remove also self reference (d > 0)
    valid = valid & (distances > 0)

    # Get indices to keep
    valid = (np.arange(len(distances)), np.argmax(valid, axis=1))

    distances = distances[valid]
    indices = indices[(valid)]

    if show is True:
        plt.plot(indices, distances)

    return indices, distances


# =============================================================================
# Plotting
# =============================================================================


def _embedding_dimension_plot(
    method,
    dimension_seq,
    min_dimension,
    E1=None,
    E2=None,
    f1=None,
    f2=None,
    f3=None,
    CD=None,
    ax=None,
):

    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = None
    ax.set_title("Optimization of Dimension (d)")
    ax.set_xlabel("Embedding dimension $d$")

    if method in ["correlation", "cd"]:
        ax.set_ylabel("Correlation Dimension (CD)")
        ax.plot(dimension_seq, CD, "o-", label="$CD$", color="#852b01")
    else:
        ax.set_ylabel("$E_1(d)$ and $E_2(d)$")
        if method in ["afnn"]:
            ax.plot(dimension_seq, E1, "o-", label="$E_1(d)$", color="#FF5722")
            ax.plot(dimension_seq, E2, "o-", label="$E_2(d)$", color="#FFC107")

        if method in ["fnn"]:
            ax.plot(dimension_seq, 100 * f1, "o--", label="Test I", color="#FF5722")
            ax.plot(dimension_seq, 100 * f2, "^--", label="Test II", color="#f44336")
            ax.plot(dimension_seq, 100 * f3, "s-", label="Test I + II", color="#852b01")

    ax.axvline(x=min_dimension, color="#E91E63", label="Optimal dimension: " + str(min_dimension))
    ax.legend(loc="upper right")

    return fig