# Source code for neurokit2.complexity.utils_complexity_symbolize

```python
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.vq
import scipy.special

from ..misc import check_random_state
from ..stats import standardize
from .optim_complexity_tolerance import complexity_tolerance

[docs]
def complexity_symbolize(signal, method="mean", c=3, random_state=None, show=False, **kwargs):
"""**Signal Symbolization and Discretization**

Many complexity indices are made to assess the recurrence and predictability of discrete -
symbolic - states. As such, continuous signals must be transformed into such discrete sequences.

For instance, one of the easiest ways is to split the signal values into two categories, above
and below the mean, resulting in a sequence of *A* and *B*. More complex methods have been
developed to that end.

* **Method 'A'** binarizes the signal by higher vs. lower values as compared to the signal's
mean. Equivalent to ``method="mean"`` (``method="median"`` is also valid).
* **Method 'B'** uses values that are within the mean +/- 1 SD band vs. values that are outside
this band.
* **Method 'C'** computes the difference between consecutive samples and binarizes depending on
their sign.
* **Method 'D'** separates consecutive samples whose absolute difference exceeds 1 signal's SD
from the other, smaller changes.
* **Method 'r'** is based on the concept of :func:`*tolerance* <complexity_tolerance>`, and
will separate consecutive samples that exceed a given tolerance threshold, by default
:math:`0.2 * SD`. See :func:`complexity_tolerance` for more details.
* **Binning**: Bins the signal into *c* equal-width bins. If an integer *n* is passed as
``method``, it is used as the number of bins (i.e., it replaces *c*).
* **MEP**: Maximum Entropy Partitioning. Requires to specify *c*.
* **kmeans**: k-means clustering. Requires to specify *c*.

Parameters
----------
signal : Union[list, np.array, pd.Series]
The signal (i.e., a time series) in the form of a vector of values.
method : str or int
Method of symbolization. Can be one of ``"A"`` (default), ``"B"``, ``"C"``, ``"D"``,
``"r"``, ``"Binning"``, ``"MEP"``, ``"NCDF"``, ``"linear"``, ``"uniform"``, ``"kmeans"``,
``"equal"``, or ``None`` to skip the process (for instance, in cases when the binarization
has already been done before).

See the description of each method above for details.
c : int
Number of symbols *c*, used in some algorithms.
random_state : None, int, numpy.random.RandomState or numpy.random.Generator
Seed for the random number generator. See :func:`misc.check_random_state` for further information.
show : bool
If ``True``, plot the signal together with its symbolization (the samples or segments
assigned to each discrete state).
**kwargs
Other arguments to be passed to :func:`complexity_attractor`.

Returns
-------
array
A symbolic sequence made of discrete states (e.g., 0 and 1).

See Also
--------
entropy_shannon, entropy_cumulative_residual, fractal_petrosian

Examples
---------
.. ipython:: python

import neurokit2 as nk

signal = nk.signal_simulate(duration=2, frequency=[5, 12])

# Method "A" is equivalent to "mean"
@savefig p_complexity_symbolize1.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "A", show=True)
@suppress
plt.close()

.. ipython:: python

@savefig p_complexity_symbolize2.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "B", show=True)
@suppress
plt.close()

.. ipython:: python

@savefig p_complexity_symbolize3.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "C", show=True)
@suppress
plt.close()

.. ipython:: python

signal = nk.signal_simulate(duration=2, frequency=[5], noise = 0.1)

@savefig p_complexity_symbolize4.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "D", show=True)
@suppress
plt.close()

.. ipython:: python

@savefig p_complexity_symbolize5.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "r", show=True)
@suppress
plt.close()

.. ipython:: python

@savefig p_complexity_symbolize6.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "binning", c=3, show=True)
@suppress
plt.close()

.. ipython:: python

@savefig p_complexity_symbolize7.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "MEP", c=3, show=True)
@suppress
plt.close()

.. ipython:: python

@savefig p_complexity_symbolize8.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "NCDF", c=3, show=True)
@suppress
plt.close()

.. ipython:: python

@savefig p_complexity_symbolize9.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "linear", c=5, show=True)
@suppress
plt.close()

.. ipython:: python

@savefig p_complexity_symbolize10.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "equal", c=5, show=True)
@suppress
plt.close()

.. ipython:: python

@savefig p_complexity_symbolize11.png scale=100%
symbolic = nk.complexity_symbolize(signal, method = "kmeans", c=5, random_state=42, show=True)
@suppress
plt.close()

"""
# Seed the random generator for reproducible results
rng = check_random_state(random_state)

# Do nothing
if method is None:
symbolic = signal
if show is True:
df = pd.DataFrame({"Signal": signal, "Bin": signal, "Index": np.arange(len(signal))})
df = df.pivot_table(index="Index", columns="Bin", values="Signal")
for i in df.columns:
plt.plot(df[i])

# Binning
elif isinstance(method, int):
c = method
method = "binning"

if isinstance(method, str):
method = method.lower()

if method in ["a", "mean"]:
symbolic = (signal > np.nanmean(signal)).astype(int)
if show is True:
df = pd.DataFrame({"A": signal, "B": signal})
df.loc[df["A"] > np.nanmean(signal), "A"] = np.nan
df.loc[df["B"] <= np.nanmean(signal), "B"] = np.nan
df.plot()
plt.axhline(y=np.nanmean(signal), color="r", linestyle="dotted")
plt.title("Method A")

elif method == "median":
symbolic = (signal > np.nanmedian(signal)).astype(int)
if show is True:
df = pd.DataFrame({"A": signal, "B": signal})
df.loc[df["A"] > np.nanmedian(signal), "A"] = np.nan
df.loc[df["B"] <= np.nanmedian(signal), "B"] = np.nan
df.plot()
plt.axhline(y=np.nanmean(signal), color="r", linestyle="dotted")
plt.title("Binarization by median")

elif method == "b":
m = np.nanmean(signal)
sd = np.nanstd(signal, ddof=1)
symbolic = np.logical_or(signal < m - sd, signal > m + sd).astype(int)
if show is True:
df = pd.DataFrame({"A": signal, "B": signal})
condition = np.logical_or(signal < m - sd, signal > m + sd)
df.loc[condition, "A"] = np.nan
df.loc[~np.isnan(df["A"]), "B"] = np.nan
df.plot()
plt.axhline(y=m - sd, color="r", linestyle="dotted")
plt.axhline(y=m + sd, color="r", linestyle="dotted")
plt.title("Method B")

elif method in ["c", "sign"]:
symbolic = np.signbit(np.diff(signal)).astype(int)
if show is True:
df = pd.DataFrame({"A": signal, "B": signal})
df.loc[np.insert(symbolic, 0, False), "A"] = np.nan
df.loc[~np.isnan(df["A"]), "B"] = np.nan
df.plot()
plt.title("Method C")

elif method == "d":
symbolic = (np.abs(np.diff(signal)) > np.nanstd(signal, ddof=1)).astype(int)
if show is True:
where = np.where(symbolic)[0]
plt.plot(signal, zorder=1 == 1)
plt.scatter(where, signal[where], color="orange", label="Inversion", zorder=2)
plt.title("Method D")

elif method == "r":
symbolic = np.abs(np.diff(signal)) > complexity_tolerance(signal, method="sd")[0]
symbolic = symbolic.astype(int)
if show is True:
where = np.where(symbolic == 1)[0]
plt.plot(signal, zorder=1)
plt.scatter(where, signal[where], color="orange", label="Inversion", zorder=2)
plt.title("Method based on tolerance r")

elif method in ["binning", "mep", "ncdf", "linear", "uniform", "kmeans", "equal"]:
n = len(signal)
if method == "binning":
symbolic = pd.cut(signal, bins=c, labels=False)

elif method == "mep":
Temp = np.hstack((0, np.ceil(np.arange(1, c) * len(signal) / c) - 1)).astype(int)
symbolic = np.digitize(signal, np.sort(signal)[Temp])
elif method == "ncdf":
symbolic = np.digitize(
scipy.special.ndtr(standardize(signal)), np.arange(0, 1, 1 / c)
)
elif method == "linear":
symbolic = np.digitize(
signal, np.arange(np.min(signal), np.max(signal), np.ptp(signal) / c)
)
elif method == "uniform":
symbolic = np.zeros(len(signal))
symbolic[np.argsort(signal)] = np.digitize(np.arange(n), np.arange(0, 2 * n, n / c))
elif method == "kmeans":
centroids, labels = scipy.cluster.vq.kmeans2(signal, c, seed=rng)
labels += 1
xx = np.argsort(centroids) + 1
symbolic = np.zeros(n)
for k in range(1, c + 1):
symbolic[labels == xx[k - 1]] = k
elif method == "equal":
ix = np.argsort(signal)
xx = np.round(np.arange(0, 2 * n, n / c)).astype(int)
symbolic = np.zeros(n)
for k in range(c):
symbolic[ix[xx[k] : xx[k + 1]]] = k + 1

if show is True:
df = pd.DataFrame(
{"Signal": signal, "Bin": symbolic, "Index": np.arange(len(signal))}
)
df = df.pivot_table(index="Index", columns="Bin", values="Signal")
for i in df.columns:
plt.plot(df[i])
plt.title(f"Method: {method} (c={c})")

else:
raise ValueError(
"`method` must be one of 'A', 'B', 'C' or 'D', 'Binning', 'MEP', 'NCDF', 'linear',"