Source code for neurokit2.misc.find_outliers

import numpy as np
import scipy

from ..stats import standardize



[docs]
def find_outliers(data, exclude=2, side="both", method="sd", **kwargs):
    """**Identify outliers (abnormal values)**

    Extreme values identification using different methods, such as:

    * **sd**: Data is :func:`standardized <.standardize>`, i.e., centered and
      scaled, and absolute value beyond a certain SD threshold are considered as outliers.
    * **norm**: Extreme values identified using theoretical percentiles to identify outliers
      beyond a certain theoretical percentile (assuming the data comes from a normal distribution).
      For example, with this method, ``exclude=0.025`` (one-sided) corresponds to the 2.5% lower
      bound of the normal distribution, which corresponds to approx. -1.96 SD. This method is
      related to the **SD** one, but instead of specifying the threshold in SDs, it is specified in
      percentiles.
    * **percentile**: Extreme values identified using percentiles.

    Parameters
    ----------
    data : list or ndarray
        Data array
    exclude : int, float
        Amount of outliers to detect (depends on the chosen method).
    side: str
        Can be ``"both"``, ``"left"`` or ``"right"``. If ``exclude=0.05`` and ``side="both"`` and
        ``method="norm"``, 2.5% of extreme observation of each side will be marked as outliers.
    method: str
        Can be "standardize" or "percentile". The default is "standardize".
    **kwargs : optional
        Other arguments to be passed to :func:`standardize`.

    Returns
    ----------
    outliers : ndarray
        A boolean vector of with ``True`` being the outliers.

    See Also
    ----------
    .standardize

    Example
    ----------
    .. ipython:: python

      import neurokit2 as nk

      data = [-12, 2, 1, 3, 66.6, 2, 1, 3, 2, -42, 2, 4, 1, 12]

      # Outliers beyond 2 SD of the mean
      outliers = nk.find_outliers(data, exclude=2, side="both", method="sd")
      np.where(outliers)[0]

      # Outliers beyond 1 MAD of the Median on one side
      outliers = nk.find_outliers(data, exclude=1, side="left", method="sd", robust=True)
      np.where(outliers)[0]

      # 2.5% theoretical percentiles on each side
      outliers = nk.find_outliers(data, exclude=0.05, method="norm")
      np.where(outliers)[0]

      # Outliers are beyond interquartile range
      outliers = nk.find_outliers(data, exclude=(0.25, 0.75), method="percentile")
      np.where(outliers)[0]

      # Outliers are beyond interdecile range
      outliers = nk.find_outliers(data, exclude=(0.1, 0.9), method="percentile")
      np.where(outliers)[0]

    """
    # Sanity checks
    if side not in ["both", "left", "right"]:
        raise ValueError("side must be 'both', 'left' or 'right'.")

    method = method.lower()
    if method not in ["standardize", "z", "sd", "percentile", "norm"]:
        raise ValueError("method must be 'standardize' or 'percentile'.")

    # Force array
    data = np.array(data)

    # Find thresholds
    if method in ["percentile"]:
        if isinstance(exclude, (list, tuple, np.ndarray)):
            right = np.percentile(data, exclude[1] * 100)
            left = np.percentile(data, exclude[0] * 100)
        else:
            right = np.percentile(data, (1 - (exclude / 2)) * 100)
            left = np.percentile(data, (exclude / 2) * 100)

    elif method in ["sd"]:
        if isinstance(exclude, (list, tuple, np.ndarray)):
            right = exclude[1]
            left = exclude[0]
        else:
            right = exclude
            left = -right
    else:
        if side == "both":
            exclude = exclude / 2
        right = scipy.stats.norm.ppf(1 - exclude)
        left = -right

    if method in ["standardize", "z", "sd", "norm"]:
        data = np.array(standardize(data, **kwargs))

    if side == "both":
        outliers = (data < left) | (data > right)
    elif side == "left":
        outliers = data < left
    elif side == "right":
        outliers = data > right

    return outliers