"""
YASA code for evaluating the agreement between two scorers (e.g., human vs YASA), either at the
epoch-by-epoch level or at the level of summary sleep statistics.
Analyses are influenced by the standardized framework proposed in Menghini et al., 2021, SLEEP.
See the following resources:
- https://doi.org/10.1093/sleep/zsaa170
- https://sri-human-sleep.github.io/sleep-trackers-performance
- https://github.com/SRI-human-sleep/sleep-trackers-performance
"""
import logging
import numpy as np
import pandas as pd
import scipy.stats as sps
import sklearn.metrics as skm
logger = logging.getLogger("yasa")
__all__ = [
"EpochByEpochAgreement",
"SleepStatsAgreement",
]
################################################################################
# EPOCH BY EPOCH
################################################################################
class EpochByEpochAgreement:
"""Evaluate agreement between two hypnograms or two collections of hypnograms.
Evaluation includes averaged agreement scores, one-vs-rest agreement scores, agreement scores
summarized across all sleep and summarized by sleep stage, and various plotting options to
visualize the two hypnograms simultaneously. See examples for more detail.
.. versionadded:: 0.7.0
Parameters
----------
ref_hyps : iterable of :py:class:`yasa.Hypnogram`
A collection of reference hypnograms (i.e., those considered ground-truth).
Each :py:class:`yasa.Hypnogram` in ``ref_hyps`` must have the same
:py:attr:`~yasa.Hypnogram.scorer`.
If a ``dict``, key values are used to generate unique sleep session IDs. If any other
iterable (e.g., ``list`` or ``tuple``), then unique sleep session IDs are automatically
generated.
obs_hyps : iterable of :py:class:`yasa.Hypnogram`
A collection of observed hypnograms (i.e., those to be evaluated).
Each :py:class:`yasa.Hypnogram` in ``obs_hyps`` must have the same
:py:attr:`~yasa.Hypnogram.scorer`, and this scorer must be different from the scorer of
hypnograms in ``ref_hyps``.
If a ``dict``, key values must match those of ``ref_hyps``.
.. important::
It is assumed that the order of hypnograms is the same in ``ref_hyps`` and ``obs_hyps``.
For example, the third hypnogram in ``ref_hyps`` and ``obs_hyps`` must come from the same
sleep session, and they must only differ in that they have different scorers.
.. seealso:: For comparing just two hypnograms, use :py:meth:`yasa.Hypnogram.evaluate`.
Notes
-----
Many steps here are influenced by guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
References
----------
.. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
(2021). A standardized framework for testing the performance of sleep-tracking
technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
zsaa170. https://doi.org/10.1093/sleep/zsaa170
Examples
--------
>>> import yasa
>>> ref_hyps = [yasa.simulate_hypnogram(tib=600, scorer="Human", seed=i) for i in range(10)]
>>> obs_hyps = [h.simulate_similar(scorer="YASA", seed=i) for i, h in enumerate(ref_hyps)]
>>> ebe = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
>>> agr = ebe.get_agreement()
>>> agr.head(5).round(2)
accuracy balanced_acc kappa mcc precision recall f1
sleep_id
1 0.31 0.26 0.07 0.07 0.31 0.31 0.31
2 0.33 0.33 0.14 0.14 0.35 0.33 0.34
3 0.35 0.24 0.06 0.06 0.35 0.35 0.35
4 0.22 0.21 0.01 0.01 0.21 0.22 0.21
5 0.21 0.17 -0.06 -0.06 0.20 0.21 0.21
>>> ebe.get_agreement_bystage().head(12).round(3)
fbeta precision recall support
stage sleep_id
WAKE 1 0.391 0.371 0.413 189.0
2 0.299 0.276 0.326 184.0
3 0.234 0.204 0.275 255.0
4 0.268 0.285 0.252 321.0
5 0.228 0.230 0.227 181.0
6 0.407 0.384 0.433 284.0
7 0.362 0.296 0.467 287.0
8 0.298 0.519 0.209 263.0
9 0.210 0.191 0.233 313.0
10 0.369 0.420 0.329 362.0
N1 1 0.185 0.185 0.185 124.0
2 0.121 0.131 0.112 160.0
>>> ebe.get_confusion_matrix(sleep_id=1)
YASA WAKE N1 N2 N3 REM
Human
WAKE 78 24 50 3 34
N1 23 23 43 15 20
N2 60 58 183 43 139
N3 30 10 50 5 32
REM 19 9 121 50 78
.. plot::
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(figsize=(6, 3), constrained_layout=True)
>>> ebe.plot_hypnograms(sleep_id=10)
.. plot::
>>> fig, ax = plt.subplots(figsize=(6, 3))
>>> ebe.plot_hypnograms(
>>> sleep_id=8, ax=ax, obs_kwargs={"color": "red", "lw": 2, "ls": "dotted"}
>>> )
>>> plt.tight_layout()
.. plot::
>>> session = 8
>>> fig, ax = plt.subplots(figsize=(6.5, 2.5), constrained_layout=True)
>>> style_a = dict(alpha=1, lw=2.5, ls="solid", color="gainsboro", label="Michel")
>>> style_b = dict(alpha=1, lw=2.5, ls="solid", color="cornflowerblue", label="Jouvet")
>>> legend_style = dict(
>>> title="Scorer", frameon=False, ncol=2, loc="lower center", bbox_to_anchor=(0.5, 0.9)
>>> )
>>> ax = ebe.plot_hypnograms(
>>> sleep_id=session, ref_kwargs=style_a, obs_kwargs=style_b, legend=legend_style, ax=ax
>>> )
>>> acc = ebe.get_agreement().multiply(100).at[session, "accuracy"]
>>> ax.text(
>>> 0.01, 1, f"Accuracy = {acc:.0f}%", ha="left", va="bottom", transform=ax.transAxes
>>> )
When comparing only 2 hypnograms, use the :py:meth:`~yasa.Hypnogram.evaluate` method:
>>> hypno_a = yasa.simulate_hypnogram(tib=90, scorer="RaterA", seed=8)
>>> hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=9)
>>> ebe = hypno_a.evaluate(hypno_b)
>>> ebe.get_confusion_matrix()
RaterB WAKE N1 N2 N3
RaterA
WAKE 71 2 20 8
N1 1 0 9 0
N2 12 4 25 0
N3 24 0 1 3
"""
def __init__(self, ref_hyps, obs_hyps):
from yasa.hypno import Hypnogram  # Avoiding circular import, because hypno imports this class
assert hasattr(ref_hyps, "__iter__"), "`ref_hyps` must be an iterable"
assert hasattr(obs_hyps, "__iter__"), "`obs_hyps` must be an iterable"
assert type(ref_hyps) is type(obs_hyps), "`ref_hyps` and `obs_hyps` must be the same type"
assert len(ref_hyps) == len(
obs_hyps
), "`ref_hyps` and `obs_hyps` must have the same number of hypnograms"
if isinstance(ref_hyps, dict):
# If user provides dictionaries, split into sleep IDs and hypnograms
assert (
ref_hyps.keys() == obs_hyps.keys()
), "keys in `ref_hyps` must be the same as keys in `obs_hyps`"
sleep_ids, ref_hyps = zip(*ref_hyps.items())
obs_hyps = tuple(obs_hyps.values())
else:
# Create hypnogram_ids
sleep_ids = tuple(range(1, 1 + len(ref_hyps)))
assert all(
isinstance(hyp, Hypnogram) for hyp in ref_hyps + obs_hyps
), "`ref_hyps` and `obs_hyps` must only contain YASA hypnograms"
assert all(
h.scorer is not None for h in ref_hyps + obs_hyps
), "all hypnograms in `ref_hyps` and `obs_hyps` must have a scorer name"
for h1, h2 in zip((ref_hyps + obs_hyps)[:-1], (ref_hyps + obs_hyps)[1:]):
assert h1.freq == h2.freq, "all hypnograms must have the same freq"
assert h1.labels == h2.labels, "all hypnograms must have the same labels"
assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
assert all(
h1.scorer == h2.scorer for h1, h2 in zip(ref_hyps[:-1], ref_hyps[1:])
), "all `ref_hyps` must have the same scorer"
assert all(
h1.scorer == h2.scorer for h1, h2 in zip(obs_hyps[:-1], obs_hyps[1:])
), "all `obs_hyps` must have the same scorer"
assert all(
h1.scorer != h2.scorer for h1, h2 in zip(ref_hyps, obs_hyps)
), "each `ref_hyps` and `obs_hyps` pair must have unique scorers"
assert all(
h1.n_epochs == h2.n_epochs for h1, h2 in zip(ref_hyps, obs_hyps)
), "each `ref_hyps` and `obs_hyps` pair must have the same n_epochs"
# Convert ref_hyps and obs_hyps to dictionaries with sleep_id keys and hypnogram values
ref_hyps = {s: h for s, h in zip(sleep_ids, ref_hyps)}
obs_hyps = {s: h for s, h in zip(sleep_ids, obs_hyps)}
# Merge all hypnograms into a single MultiIndexed dataframe
ref = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in ref_hyps.items())
obs = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in obs_hyps.items())
data = pd.concat([ref, obs], axis=1)
# Generate some mapping dictionaries to be used later in class methods
skm_labels = np.unique(data).tolist() # all unique YASA integer codes in this hypno
skm2yasa_map = {i: l for i, l in enumerate(skm_labels)} # skm order to YASA integers
yasa2yasa_map = ref_hyps[sleep_ids[0]].mapping_int.copy() # YASA integer to YASA string
# Set attributes
self._data = data
self._sleep_ids = sleep_ids
self._ref_hyps = ref_hyps
self._obs_hyps = obs_hyps
self._ref_scorer = ref_hyps[sleep_ids[0]].scorer
self._obs_scorer = obs_hyps[sleep_ids[0]].scorer
self._skm_labels = skm_labels
self._skm2yasa_map = skm2yasa_map
self._yasa2yasa_map = yasa2yasa_map
def __repr__(self):
# TODO v0.8: Keep only the text between < and >
s = "s" if self.n_sleeps > 1 else ""
return (
f"<EpochByEpochAgreement | Observed hypnogram{s} scored by {self.obs_scorer} "
f"evaluated against reference hypnogram{s} scored by {self.ref_scorer}, "
f"{self.n_sleeps} sleep session{s}>\n"
" - Use `.get_agreement()` to get agreement measures as a pandas DataFrame or Series\n"
" - Use `.plot_hypnograms()` to plot two overlaid hypnograms\n"
"See the online documentation for more details."
)
def __str__(self):
return self.__repr__()
@property
def data(self):
"""A :py:class:`pandas.DataFrame` including all hypnograms."""
return self._data
@property
def n_sleeps(self):
"""The number of unique sleep sessions."""
return len(self._sleep_ids)
@property
def ref_scorer(self):
"""The name of the reference scorer."""
return self._ref_scorer
@property
def obs_scorer(self):
"""The name of the observed scorer."""
return self._obs_scorer
@staticmethod
def multi_scorer(df, scorers):
"""
Compute multiple agreement scores from a 2-column dataframe (an optional 3rd column may
contain sample weights).
This function offers convenience when calculating multiple agreement scores using
:py:meth:`pandas.DataFrame.groupby.apply`. Scikit-learn doesn't include a function that
returns multiple scores, and the GroupBy implementation of ``apply`` in pandas does not
accept multiple functions.
Parameters
----------
df : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with 2 columns and length of *n_samples*.
The first column contains reference values and second column contains observed values.
If a third column is present, it must contain sample weights to be passed to underlying
:py:mod:`sklearn.metrics` functions as ``sample_weight`` where applicable.
scorers : dictionary
The scorers to be used for evaluating agreement. A dictionary with scorer names (str) as
keys and functions as values.
Returns
-------
scores : dict
A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
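Examples
--------
A minimal illustrative sketch with toy integer codes (the column names are arbitrary):
>>> import pandas as pd
>>> import sklearn.metrics as skm
>>> import yasa
>>> df = pd.DataFrame({"ref": [0, 1, 2, 2], "obs": [0, 1, 2, 0]})
>>> scorers = {
...     "accuracy": lambda t, p, w: skm.accuracy_score(t, p, sample_weight=w),
...     "kappa": lambda t, p, w: skm.cohen_kappa_score(t, p, sample_weight=w),
... }
>>> scores = yasa.EpochByEpochAgreement.multi_scorer(df, scorers)
>>> {k: round(v, 2) for k, v in scores.items()}
{'accuracy': 0.75, 'kappa': 0.64}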
"""
assert isinstance(df, pd.DataFrame), "`df` must be a pandas DataFrame"
assert df.shape[1] in [2, 3], "`df` must have either 2 or 3 columns"
assert isinstance(scorers, dict), "`scorers` must be a dictionary"
assert all(
isinstance(k, str) and callable(v) for k, v in scorers.items()
), "Each key of `scorers` must be a string, and each value must be a callable function"
if df.shape[1] == 3:
true, pred, weights = zip(*df.values)
elif df.shape[1] == 2:
true, pred = zip(*df.values)  # Same as (df["col1"], df["col2"]) but a teensy bit faster
weights = None
scores = {s: f(true, pred, weights) for s, f in scorers.items()}
return scores
def get_agreement(self, sample_weight=None, scorers=None):
"""
Return a :py:class:`pandas.DataFrame` of weighted (i.e., averaged) agreement scores.
Parameters
----------
self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
sample_weight : None or :py:class:`pandas.Series`
Sample weights passed to underlying :py:mod:`sklearn.metrics` functions where possible.
If a :py:class:`pandas.Series`, the index must match exactly that of
:py:attr:`~yasa.Hypnogram.data`.
scorers : None, list, or dictionary
The scorers to be used for evaluating agreement. If None (default), default scorers are
used. If a list, the list must contain strings that represent metrics from the sklearn
metrics module (e.g., ``accuracy``, ``precision``). If more customization is desired, a
dictionary can be passed with scorer names (str) as keys and custom functions as values.
The custom functions should take 3 positional arguments (true values, predicted values,
and sample weights).
Returns
-------
agreement : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with agreement metrics as columns and sessions as rows.
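Examples
--------
A short sketch of a custom ``scorers`` dictionary, reusing the ``ebe`` instance from the
class example. Each custom function receives true values, predicted values, and sample
weights as its three positional arguments:
>>> import sklearn.metrics as skm
>>> my_scorers = {"kappa": lambda t, p, w: skm.cohen_kappa_score(t, p, sample_weight=w)}
>>> ebe.get_agreement(scorers=my_scorers)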
"""
assert isinstance(
sample_weight, (type(None), pd.Series)
), "`sample_weight` must be None or pandas Series"
assert isinstance(scorers, (type(None), list, dict))
if isinstance(scorers, list):
assert all(isinstance(x, str) for x in scorers)
elif isinstance(scorers, dict):
assert all(isinstance(k, str) and callable(v) for k, v in scorers.items())
if scorers is None:
# Create dictionary of default scorer functions
scorers = {
"accuracy": lambda t, p, w: skm.accuracy_score(
t, p, normalize=True, sample_weight=w
),
"balanced_acc": lambda t, p, w: skm.balanced_accuracy_score(
t, p, adjusted=False, sample_weight=w
),
"kappa": lambda t, p, w: skm.cohen_kappa_score(
t, p, labels=None, weights=None, sample_weight=w
),
"mcc": lambda t, p, w: skm.matthews_corrcoef(t, p, sample_weight=w),
"precision": lambda t, p, w: skm.precision_score(
t, p, average="weighted", sample_weight=w, zero_division=0
),
"recall": lambda t, p, w: skm.recall_score(
t, p, average="weighted", sample_weight=w, zero_division=0
),
"f1": lambda t, p, w: skm.f1_score(
t, p, average="weighted", sample_weight=w, zero_division=0
),
}
elif isinstance(scorers, list):
# Convert the list to a dictionary of sklearn scorers
scorers = {s: getattr(skm, f"{s}_score") for s in scorers}
# Make a copy of data since weights series might be added to it
df = self.data.copy()
if sample_weight is not None:
assert sample_weight.index.equals(self.data.index), (
"If not `None`, `sample_weight` Series must be a pandas Series with the same index "
"as `self.data`"
)
# Add weights as a third column for multi_scorer to use
df["weights"] = sample_weight
# Get individual-level averaged/weighted agreement scores
agreement = df.groupby(level=0).apply(self.multi_scorer, scorers=scorers).apply(pd.Series)
# Set attribute for later access
self._agreement = agreement
# Convert to Series if just one session being evaluated
if self.n_sleeps == 1:
agreement = agreement.squeeze().rename("agreement")
return agreement
def get_agreement_bystage(self, beta=1.0):
"""
Return a :py:class:`pandas.DataFrame` of unweighted (i.e., one-vs-rest) agreement scores.
Parameters
----------
self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
beta : float
See :py:func:`sklearn.metrics.precision_recall_fscore_support`.
Returns
-------
agreement : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with agreement metrics as columns and a
:py:class:`~pandas.MultiIndex` with session and sleep stage as rows.
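Examples
--------
A short sketch reusing the ``ebe`` instance from the class example. Increasing ``beta``
weights recall more heavily than precision in the returned fbeta scores:
>>> ebe.get_agreement_bystage(beta=2).head(3)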
"""
def scorer(df):
return skm.precision_recall_fscore_support(
*df.values.T, beta=beta, labels=self._skm_labels, average=None, zero_division=0
)
agreement = (
self.data
# Get precision, recall, f1, and support for each individual sleep session
.groupby(level=0)
.apply(scorer)
# Unpack arrays
.explode()
.apply(pd.Series)
# Add metric labels column and prepend it to index, creating MultiIndex
.assign(metric=["precision", "recall", "fbeta", "support"] * self.n_sleeps)
.set_index("metric", append=True)
# Convert stage column names to string labels
.rename_axis(columns="stage")
.rename(columns=self._skm2yasa_map)
.rename(columns=self._yasa2yasa_map)
# Remove all-zero columns (i.e., stages that were not present in the hypnogram)
.pipe(lambda df: df.loc[:, df.any()])
# Reshape so metrics are columns
.stack()
.unstack("metric")
.rename_axis(columns=None)
# Swap MultiIndex levels and sort so stages are in standard YASA order
.swaplevel()
.sort_index(
level="stage",
key=lambda x: x.map(lambda y: list(self._yasa2yasa_map.values()).index(y)),
)
)
# Set attribute for later access
self._agreement_bystage = agreement
# Remove the MultiIndex if just one session being evaluated
if self.n_sleeps == 1:
agreement = agreement.reset_index(level=1, drop=True)
return agreement
def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
"""
Return a ``ref_hyp``/``obs_hyp`` confusion matrix from either a single session or all
sessions concatenated together.
Parameters
----------
self : :py:class:`yasa.EpochByEpochAgreement`
A :py:class:`yasa.EpochByEpochAgreement` instance.
sleep_id : None or a valid sleep ID
If None (default), cross-tabulation is derived from the entire group dataset.
If a valid sleep ID, cross-tabulation is derived using only the reference and observed
scored hypnograms from that sleep session.
agg_func : None or str
If None (default), group results returns a :py:class:`~pandas.DataFrame` complete with
all individual session results. If not None, group results returns a
:py:class:`~pandas.DataFrame` aggregated across sessions where ``agg_func`` is passed as
``func`` parameter in :py:meth:`pandas.DataFrame.groupby.agg`. For example, set
``agg_func="sum"`` to get a single confusion matrix across all epochs that does not take
session into account.
**kwargs : key, value pairs
Additional keyword arguments are passed to :py:func:`sklearn.metrics.confusion_matrix`.
Returns
-------
conf_matr : :py:class:`pandas.DataFrame`
A confusion matrix with stages from the reference scorer as indices and stages from the
test scorer as columns.
Examples
--------
>>> import yasa
>>> ref_hyps = [yasa.simulate_hypnogram(tib=90, scorer="Rater1", seed=i) for i in range(3)]
>>> obs_hyps = [h.simulate_similar(scorer="Rater2", seed=i) for i, h in enumerate(ref_hyps)]
>>> ebe = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
>>> ebe.get_confusion_matrix(sleep_id=2)
Rater2 WAKE N1 N2 N3 REM
Rater1
WAKE 1 2 23 0 0
N1 0 9 13 0 0
N2 0 6 71 0 0
N3 0 13 42 0 0
REM 0 0 0 0 0
>>> ebe.get_confusion_matrix()
Rater2 WAKE N1 N2 N3 REM
sleep_id Rater1
1 WAKE 30 0 3 0 35
N1 3 2 7 0 0
N2 21 12 7 0 4
N3 0 0 0 0 0
REM 2 8 29 0 17
2 WAKE 1 2 23 0 0
N1 0 9 13 0 0
N2 0 6 71 0 0
N3 0 13 42 0 0
REM 0 0 0 0 0
3 WAKE 16 0 7 19 19
N1 0 7 2 0 5
N2 0 10 12 7 5
N3 0 0 16 11 0
REM 0 15 11 18 0
>>> ebe.get_confusion_matrix(agg_func="sum")
Rater2 WAKE N1 N2 N3 REM
Rater1
WAKE 47 2 33 19 54
N1 3 18 22 0 5
N2 21 28 90 7 9
N3 0 13 58 11 0
REM 2 23 40 18 17
"""
assert (
sleep_id is None or sleep_id in self._sleep_ids
), "`sleep_id` must be None or a valid sleep ID"
assert isinstance(agg_func, (type(None), str)), "`agg_func` must be None or a str"
assert not (
(self.n_sleeps == 1 or sleep_id is not None) and agg_func is not None
), "`agg_func` must be None if plotting a single session."
kwargs = {"labels": self._skm_labels} | kwargs
# Generate a DataFrame with a confusion matrix for each session
# Seems easier to just generate this whole thing and then either
# extract a single one or aggregate across them all, depending on user request
confusion_matrices = (
self.data
# Get confusion matrix for each individual sleep session
.groupby(level=0)
.apply(lambda df: skm.confusion_matrix(*df.values.T, **kwargs))
# Expand results matrix out from single cell
.explode()
.apply(pd.Series)
# Convert to MultiIndex with reference scorer as new level
.assign(**{self.ref_scorer: self._skm_labels * self.n_sleeps})
.set_index(self.ref_scorer, append=True)
.rename_axis(columns=self.obs_scorer)
# Convert sleep stage columns and indices to strings
.rename(columns=self._skm2yasa_map)
.rename(columns=self._yasa2yasa_map)
.rename(index=self._skm2yasa_map, level=self.ref_scorer)
.rename(index=self._yasa2yasa_map, level=self.ref_scorer)
)
if self.n_sleeps == 1:
# If just one session, use the only session ID as the key, for simplified returned df
sleep_id = self._sleep_ids[0]
if sleep_id is None:
if agg_func is None:
mat = confusion_matrices
else:
mat = confusion_matrices.groupby(self.ref_scorer, sort=False).agg(agg_func)
else:
mat = confusion_matrices.loc[sleep_id]
return mat
def get_sleep_stats(self):
"""
Return a :py:class:`pandas.DataFrame` of sleep statistics for each hypnogram derived from
both reference and observed scorers.
.. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
.. seealso:: :py:class:`yasa.SleepStatsAgreement`
Parameters
----------
self : :py:class:`yasa.EpochByEpochAgreement`
A :py:class:`yasa.EpochByEpochAgreement` instance.
Returns
-------
sstats : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with sleep statistics as columns and two rows for each
individual (one for reference scorer and another for test scorer).
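Examples
--------
A short sketch reusing the ``ebe`` instance from the class example, where the observed
scorer is named "YASA":
>>> sstats = ebe.get_sleep_stats()
>>> sstats.loc["YASA"].head(2)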
"""
# Get all sleep statistics
ref_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._ref_hyps.items()})
obs_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._obs_hyps.items()})
# Reshape and name axis
ref_sstats = ref_sstats.T.rename_axis("sleep_id")
obs_sstats = obs_sstats.T.rename_axis("sleep_id")
# Convert to MultiIndex with new scorer level
ref_sstats = pd.concat({self.ref_scorer: ref_sstats}, names=["scorer"])
obs_sstats = pd.concat({self.obs_scorer: obs_sstats}, names=["scorer"])
# Concatenate into one DataFrame
sstats = pd.concat([ref_sstats, obs_sstats])
# Remove the MultiIndex if just one session being evaluated
if self.n_sleeps == 1:
sstats = sstats.reset_index(level=1, drop=True)
return sstats
def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, ref_kwargs={}, obs_kwargs={}):
"""Plot the two hypnograms of one session overlapping on the same axis.
.. seealso:: :py:func:`yasa.plot_hypnogram`
Parameters
----------
self : :py:class:`yasa.EpochByEpochAgreement`
A :py:class:`yasa.EpochByEpochAgreement` instance.
sleep_id : a valid sleep ID or None
The sleep session to plot. If multiple sessions are included in the
:py:class:`~yasa.EpochByEpochAgreement` instance, a ``sleep_id`` must be provided. If
only one session is present, ``None`` (default) will plot the two hypnograms of the
only session.
legend : bool or dict
If True (default) or a dictionary, a legend is added. If a dictionary, all key/value
pairs are passed as keyword arguments to the :py:func:`matplotlib.pyplot.legend` call.
ax : :py:class:`matplotlib.axes.Axes` or None
Axis on which to draw the plot, optional.
ref_kwargs : dict
Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the reference
hypnogram.
obs_kwargs : dict
Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the observed
hypnogram.
Returns
-------
ax : :py:class:`matplotlib.axes.Axes`
Matplotlib Axes
Examples
--------
.. plot::
>>> from yasa import simulate_hypnogram
>>> hyp = simulate_hypnogram(scorer="Anthony", seed=19)
>>> ax = hyp.evaluate(hyp.simulate_similar(scorer="Alan", seed=68)).plot_hypnograms()
"""
assert (
sleep_id is None or sleep_id in self._sleep_ids
), "`sleep_id` must be None or a valid sleep ID"
assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
assert isinstance(ref_kwargs, dict), "`ref_kwargs` must be a dictionary"
assert isinstance(obs_kwargs, dict), "`obs_kwargs` must be a dictionary"
assert (
"ax" not in ref_kwargs | obs_kwargs
), "'ax' can't be supplied to `ref_kwargs` or `obs_kwargs`, use the `ax` keyword instead"
assert not (sleep_id is None and self.n_sleeps > 1), (
"Multi-session plotting is not currently supported. `sleep_id` must not be None when "
"multiple sessions are present"
)
# Select the session hypnograms to plot
if sleep_id is None and self.n_sleeps == 1:
ref_hyp = self._ref_hyps[self._sleep_ids[0]]
obs_hyp = self._obs_hyps[self._sleep_ids[0]]
else:
ref_hyp = self._ref_hyps[sleep_id]
obs_hyp = self._obs_hyps[sleep_id]
# Set default plotting kwargs and merge with user kwargs
plot_ref_kwargs = {
"label": self.ref_scorer,
"highlight": None,
"color": "black",
"alpha": 0.8,
}
plot_obs_kwargs = {
"label": self.obs_scorer,
"highlight": None,
"color": "green",
"alpha": 0.8,
"ls": "dashed",
}
plot_ref_kwargs.update(ref_kwargs)
plot_obs_kwargs.update(obs_kwargs)
# Draw the hypnograms
ax = ref_hyp.plot_hypnogram(ax=ax, **plot_ref_kwargs)
ax = obs_hyp.plot_hypnogram(ax=ax, **plot_obs_kwargs)
# Add legend if desired
if legend:
if isinstance(legend, dict):
ax.legend(**legend)
else:
ax.legend()
return ax
def summary(self, by_stage=False, **kwargs):
"""Return group-level agreement scores.
Default aggregated measures are mean absolute deviation (``mad``), ``mean``, ``std``, ``min``, ``median``, and ``max``.
Parameters
----------
self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
by_stage : bool
If ``False`` (default), ``summary`` will include agreement scores derived from
average-based metrics. If ``True``, returned ``summary`` :py:class:`~pandas.DataFrame`
will include agreement scores for each sleep stage, derived from one-vs-rest metrics.
**kwargs : key, value pairs
Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
This can be used to customize the descriptive statistics returned.
Returns
-------
summary : :py:class:`pandas.DataFrame`
A :py:class:`pandas.DataFrame` summarizing agreement scores across the entire dataset
with descriptive statistics.
Examples
--------
>>> ebe = yasa.EpochByEpochAgreement(...)
>>> agreement = ebe.get_agreement()
>>> ebe.summary()
This will give a :py:class:`~pandas.DataFrame` where each row is an agreement metric and
each column is a descriptive statistic (e.g., mean, standard deviation).
To control the descriptive statistics included as columns:
>>> ebe.summary(func=["count", "mean", "sem"])
"""
assert self.n_sleeps > 1, "Summary scores cannot be computed with only one hypnogram pair."
assert isinstance(by_stage, bool), "`by_stage` must be True or False"
if by_stage:
assert hasattr(
self, "_agreement_bystage"
), "Must run `self.get_agreement_bystage` before obtaining by_stage summary results."
else:
assert hasattr(
self, "_agreement"
), "Must run `self.get_agreement` before obtaining summary results."
# Create a function for getting mean absolute deviation
def mad(df):
return (df - df.mean()).abs().mean()
# Merge default and user kwargs
agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
if by_stage:
summary = (
self._agreement_bystage.groupby("stage")
.agg(**agg_kwargs)
.stack(level=0)
.rename_axis(["stage", "metric"])
)
else:
summary = self._agreement.agg(**agg_kwargs).T.rename_axis("metric")
return summary
################################################################################
# SLEEP STATISTICS
################################################################################
class SleepStatsAgreement:
"""
Evaluate agreement between sleep statistics reported by two different scorers.
Evaluation includes bias and limits of agreement (as well as both their confidence intervals),
various plotting options, and calibration functions for correcting biased values from the
observed scorer.
Features include:
* Get summary calculations of bias, limits of agreement, and their confidence intervals.
* Test statistical assumptions of bias, limits of agreement, and their confidence intervals,
and apply corrective procedures when the assumptions are not met.
* Get bias and limits of agreement in a string-formatted table.
* Calibrate new data to correct for biases in observed data.
* Return individual calibration functions.
* Visualize discrepancies for outlier inspection.
* Visualize Bland-Altman plots.
.. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
.. versionadded:: 0.7.0
Parameters
----------
ref_data : :py:class:`pandas.DataFrame`
A :py:class:`pandas.DataFrame` with sleep statistics from the reference scorer.
Rows are unique observations and columns are unique sleep statistics.
obs_data : :py:class:`pandas.DataFrame`
A :py:class:`pandas.DataFrame` with sleep statistics from the observed scorer.
Rows are unique observations and columns are unique sleep statistics.
Shape, index, and columns must be identical to ``ref_data``.
ref_scorer : str
Name of the reference scorer.
obs_scorer : str
Name of the observed scorer.
agreement : float
Multiple of the standard deviation to plot agreement limits. The default is 1.96, which
corresponds to 95% limits of agreement if the differences are normally distributed.
.. note:: ``agreement`` gets adjusted for regression-modeled limits of agreement.
confidence : float
The confidence level used when computing confidence intervals for bias and limits of
agreement. The same confidence level is applied to both standard and bootstrapped
confidence intervals.
alpha : float
Alpha cutoff used for all assumption tests.
verbose : bool or str
Verbose level. If False, only warning and error messages are printed. The logging levels
are 'debug', 'info', 'warning', 'error', and 'critical'. For most users the choice is
between 'info' (``verbose=True``) and 'warning' (``verbose=False``).
Notes
-----
Sleep statistics that are identical between scorers are removed from analysis.
Many steps here are influenced by guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
References
----------
.. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
(2021). A standardized framework for testing the performance of sleep-tracking
technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
zsaa170. https://doi.org/10.1093/sleep/zsaa170
Examples
--------
>>> import pandas as pd
>>> import yasa
>>>
>>> # Generate fake reference and observed datasets with similar sleep statistics
>>> ref_scorer = "Henri"
>>> obs_scorer = "Piéron"
>>> ref_hyps = [yasa.simulate_hypnogram(tib=600, scorer=ref_scorer, seed=i) for i in range(20)]
>>> obs_hyps = [h.simulate_similar(scorer=obs_scorer, seed=i) for i, h in enumerate(ref_hyps)]
>>> # Generate sleep statistics from hypnograms using EpochByEpochAgreement
>>> eea = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
>>> sstats = eea.get_sleep_stats()
>>> ref_sstats = sstats.loc[ref_scorer]
>>> obs_sstats = sstats.loc[obs_scorer]
>>> # Create SleepStatsAgreement instance
>>> ssa = yasa.SleepStatsAgreement(ref_sstats, obs_sstats)
>>> ssa.summary().round(1).head(3)
variable bias_intercept ... uloa_parm
interval center lower upper ... center lower upper
sleep_stat ...
%N1 -5.4 -13.9 3.2 ... 6.1 3.7 8.5
%N2 -27.3 -49.1 -5.6 ... 12.4 7.2 17.6
%N3 -9.1 -23.8 5.5 ... 20.4 12.6 28.3
>>> ssa.get_table().head(3)[["bias", "loa"]]
bias loa
sleep_stat
%N1 0.25 Bias ± 2.46 * (-0.00 + 1.00x)
%N2 -27.34 + 0.55x Bias ± 2.46 * (0.00 + 1.00x)
%N3 1.38 Bias ± 2.46 * (0.00 + 1.00x)
>>> ssa.assumptions.head(3)
unbiased normal constant_bias homoscedastic
sleep_stat
%N1 True True True False
%N2 True True False False
%N3 True True True False
>>> ssa.auto_methods.head(3)
bias loa ci
sleep_stat
%N1 parm regr parm
%N2 regr regr parm
%N3 parm regr parm
>>> ssa.get_table(bias_method="parm", loa_method="parm").head(3)[["bias", "loa"]]
bias loa
sleep_stat
%N1 0.25 -5.55, 6.06
%N2 -0.23 -12.87, 12.40
%N3 1.38 -17.67, 20.44
>>> new_hyps = [h.simulate_similar(scorer="Kelly", seed=i) for i, h in enumerate(obs_hyps)]
>>> new_sstats = pd.Series(new_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
>>> new_sstats = new_sstats[["N1", "TST", "WASO"]]
>>> new_sstats.round(1).head(5)
N1 TST WASO
0 42.5 439.5 147.5
1 84.0 550.0 38.5
2 53.5 489.0 103.0
3 57.0 469.5 120.0
4 71.0 531.0 69.0
>>> new_stats_calibrated = ssa.calibrate(new_sstats, bias_method="auto")
>>> new_stats_calibrated.round(1).head(5)
N1 TST WASO
0 42.9 433.8 150.0
1 84.4 544.2 41.0
2 53.9 483.2 105.5
3 57.4 463.8 122.5
4 71.4 525.2 71.5
.. plot::
>>> import matplotlib.pyplot as plt
>>> ax = ssa.plot_discrepancies_heatmap()
>>> ax.set_title("Sleep statistic discrepancies")
>>> plt.tight_layout()
.. plot::
>>> ssa.plot_blandaltman()
"""
def __init__(
self,
ref_data,
obs_data,
*,
ref_scorer="Reference",
obs_scorer="Observed",
agreement=1.96,
confidence=0.95,
alpha=0.05,
verbose=True,
bootstrap_kwargs={},
):
restricted_bootstrap_kwargs = ["confidence_level", "vectorized", "paired"]
assert isinstance(ref_data, pd.DataFrame), "`ref_data` must be a pandas DataFrame"
assert isinstance(obs_data, pd.DataFrame), "`obs_data` must be a pandas DataFrame"
assert np.array_equal(
ref_data.index, obs_data.index
), "`ref_data` and `obs_data` index values must be identical"
assert (
ref_data.index.name == obs_data.index.name
), "`ref_data` and `obs_data` index names must be identical"
assert np.array_equal(
ref_data.columns, obs_data.columns
), "`ref_data` and `obs_data` column values must be identical"
assert isinstance(ref_scorer, str), "`ref_scorer` must be a string"
assert isinstance(obs_scorer, str), "`obs_scorer` must be a string"
assert ref_scorer != obs_scorer, "`ref_scorer` and `obs_scorer` must be unique"
assert (
isinstance(agreement, (float, int)) and agreement > 0
), "`agreement` must be a number greater than 0"
assert (
isinstance(confidence, (float, int)) and 0 < confidence < 1
), "`confidence` must be a number between 0 and 1"
assert (
isinstance(alpha, (float, int)) and 0 <= alpha <= 1
), "`alpha` must be a number between 0 and 1 inclusive"
assert isinstance(bootstrap_kwargs, dict), "`bootstrap_kwargs` must be a dictionary"
assert all(
k not in restricted_bootstrap_kwargs for k in bootstrap_kwargs
), f"None of {restricted_bootstrap_kwargs} can be set by the user"
# If `ref_data` and `obs_data` indices are unnamed, name them
session_key = "session_id" if ref_data.index.name is None else ref_data.index.name
ref_data.index.name = obs_data.index.name = session_key
# Reshape to long format DataFrame with 2 columns (observed, reference) and MultiIndex
data = (
pd.concat([obs_data, ref_data], keys=[obs_scorer, ref_scorer], names=["scorer"])
.melt(var_name="sleep_stat", ignore_index=False)
.pivot_table(index=["sleep_stat", session_key], columns="scorer", values="value")
.rename_axis(columns=None)
.sort_index()
)
# Get scorer differences (i.e., observed minus reference)
data["difference"] = data[obs_scorer] - data[ref_scorer]
# Remove sleep statistics that have no differences between scorers
stats_rm = data.groupby("sleep_stat")["difference"].any().loc[lambda x: ~x].index.tolist()
data = data.drop(labels=stats_rm)
for s in stats_rm:
logger.warning(f"Removed {s} from evaluation because all scorings were identical.")
# Create grouper and n_sessions variables for convenience
grouper = data.groupby("sleep_stat")
n_sessions = data.index.get_level_values(session_key).nunique()
########################################################################
# Generate parametric Bias and LoA for all sleep stats
########################################################################
# Parametric Bias
parm_vals = grouper["difference"].mean().to_frame("bias_parm")
# Parametric LoA
parm_vals["lloa_parm"], parm_vals["uloa_parm"] = zip(
*grouper["difference"].apply(self._arr_to_loa, agreement=agreement)
)
########################################################################
# Generate standard CIs for parametric Bias and LoA for all sleep stats
########################################################################
# Get critical t and standard error used to calculate parametric CIs for parametric Bias/LoA
t_parm = sps.t.ppf((1 + confidence) / 2, n_sessions - 1)
sem = grouper["difference"].sem(ddof=1)
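# Following Bland & Altman, the standard error of each limit of agreement is roughly
# sqrt(3) times the standard error of the mean difference, hence the sqrt(3) factor below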
# Parametric CIs for parametric Bias and LoA
parm_ci = pd.DataFrame(
{
"bias_parm-lower": parm_vals["bias_parm"] - sem * t_parm,
"bias_parm-upper": parm_vals["bias_parm"] + sem * t_parm,
"lloa_parm-lower": parm_vals["lloa_parm"] - sem * t_parm * np.sqrt(3),
"lloa_parm-upper": parm_vals["lloa_parm"] + sem * t_parm * np.sqrt(3),
"uloa_parm-lower": parm_vals["uloa_parm"] - sem * t_parm * np.sqrt(3),
"uloa_parm-upper": parm_vals["uloa_parm"] + sem * t_parm * np.sqrt(3),
}
)
########################################################################
# Generate regression/modeled (slope and intercept) Bias and LoA for all sleep stats
########################################################################
# Run regression used to (a) model bias and (b) test for proportional/constant bias
bias_regr = grouper[[ref_scorer, "difference"]].apply(self._linregr_dict).apply(pd.Series)
# Get absolute residuals from this regression bc they are used in the next regression
idx = data.index.get_level_values("sleep_stat")
slopes = bias_regr.loc[idx, "slope"].to_numpy()
intercepts = bias_regr.loc[idx, "intercept"].to_numpy()
predicted_values = data[ref_scorer].to_numpy() * slopes + intercepts
data["residuals"] = data[obs_scorer].to_numpy() - predicted_values
data["residuals_abs"] = data["residuals"].abs()
# Run regression used to (a) model LoA and (b) test for heteroscedasticity/homoscedasticity
loa_regr = grouper[[ref_scorer, "residuals_abs"]].apply(self._linregr_dict).apply(pd.Series)
# Stack the two regression dataframes together
regr = pd.concat({"bias": bias_regr, "loa": loa_regr}, axis=0)
########################################################################
# Generate parametric CIs for regression/modeled Bias and LoA for all sleep stats
########################################################################
# Get critical t used used to calculate parametric CIs for regression Bias/LoA
t_regr = sps.t.ppf((1 + confidence) / 2, n_sessions - 2) # dof=n-2 for regression
# Parametric CIs for modeled Bias and LoA
regr_ci = pd.DataFrame(
{
"intercept-lower": regr["intercept"] - regr["intercept_stderr"] * t_regr,
"intercept-upper": regr["intercept"] + regr["intercept_stderr"] * t_regr,
"slope-lower": regr["slope"] - regr["stderr"] * t_regr,
"slope-upper": regr["slope"] + regr["stderr"] * t_regr,
}
)
########################################################################
# Test all statistical assumptions
########################################################################
assumptions = pd.DataFrame(
{
"unbiased": (
grouper["difference"].apply(lambda a: sps.ttest_1samp(a, 0).pvalue).ge(alpha)
),
"normal": grouper["difference"].apply(lambda a: sps.shapiro(a).pvalue).ge(alpha),
"constant_bias": bias_regr["pvalue"].ge(alpha),
"homoscedastic": loa_regr["pvalue"].ge(alpha),
}
)
########################################################################
# Setting attributes
########################################################################
# Merge the parametric and regression values for Bias and LoA
regr_vals = regr.unstack(0)[["slope", "intercept"]]
regr_vals.columns = regr_vals.columns.swaplevel().map("_".join)
vals = parm_vals.join(regr_vals).rename_axis("variable", axis=1)
# Merge the two CI dataframes for easier access
regr_ci = regr_ci.unstack(0)
regr_ci.columns = regr_ci.columns.swaplevel().map("_".join)
ci = parm_ci.join(regr_ci)
ci.columns = pd.MultiIndex.from_tuples(
tuples=ci.columns.str.split("-", expand=True),
names=["variable", "interval"],
)
empty_df = pd.DataFrame().reindex_like(ci)
ci = pd.concat({"parm": ci, "boot": empty_df}, names=["ci_method"], axis=1)
ci = ci.sort_index(axis=1) # Sort MultiIndex columns for cleanliness
# Set attributes
self._agreement = agreement
self._confidence = confidence
self._bootstrap_kwargs = bootstrap_kwargs
self._n_sessions = n_sessions
self._ref_scorer = ref_scorer
self._obs_scorer = obs_scorer
self._data = data
self._assumptions = assumptions
self._regr = regr
self._vals = vals
self._ci = ci
self._bias_method_opts = ["parm", "regr", "auto"]
self._loa_method_opts = ["parm", "regr", "auto"]
self._ci_method_opts = ["parm", "boot", "auto"]
@property
def ref_scorer(self):
"""The name of the reference scorer."""
return self._ref_scorer
@property
def obs_scorer(self):
"""The name of the observed scorer."""
return self._obs_scorer
@property
def n_sessions(self):
"""The number of sessions."""
return self._n_sessions
@property
def data(self):
"""A long-format :py:class:`pandas.DataFrame` containing all raw sleep statistics from
``ref_data`` and ``obs_data``.
"""
return self._data.drop(columns=["difference", "residuals", "residuals_abs"])
@property
def sleep_statistics(self):
"""Return a list of all sleep statistics included in the agreement analyses."""
return self.data.index.get_level_values("sleep_stat").unique().to_list()
@property
def assumptions(self):
"""A :py:class:`pandas.DataFrame` containing boolean values indicating the pass/fail status
of all statistical tests performed to test assumptions.
"""
return self._assumptions
@property
def auto_methods(self):
"""
A :py:class:`pandas.DataFrame` containing the methods applied when ``'auto'`` is selected.
"""
return pd.concat(
[
self.assumptions["constant_bias"].map({True: "parm", False: "regr"}).rename("bias"),
self.assumptions["homoscedastic"].map({True: "parm", False: "regr"}).rename("loa"),
self.assumptions["normal"].map({True: "parm", False: "boot"}).rename("ci"),
],
axis=1,
)
def __repr__(self):
# TODO v0.8: Keep only the text between < and >
return (
f"<SleepStatsAgreement | Observed scorer ('{self.obs_scorer}') evaluated against "
f"reference scorer ('{self.ref_scorer}'), {self.n_sessions} sleep sessions>\n"
" - Use `.summary()` to get a dataframe of bias and limits of agreement for each sleep "
"statistic\n"
" - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
"See the online documentation for more details."
)
def __str__(self):
return self.__repr__()
############################################################################
# Define some utility functions, mostly to aid with the use of df.apply and stats.bootstrap
############################################################################
@staticmethod
def _arr_to_loa(x, agreement):
"""Return a tuple with lower and upper limits of agreement."""
mean = np.mean(x)
bound = agreement * np.std(x, ddof=1)
return mean - bound, mean + bound
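# Worked example with hypothetical numbers: for differences with mean 2.0 and SD 5.0 at
# agreement=1.96, the bound is 1.96 * 5.0 = 9.8, giving LoA of (-7.8, 11.8)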
@staticmethod
def _linregr_dict(*args, **kwargs):
"""
A wrapper around :py:func:`scipy.stats.linregress` that returns a dictionary instead of a
named tuple. In the normally returned object, ``intercept_stderr`` is an extra field that is
not included when converting the named tuple, so this allows it to be included when using
something like groupby.
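A minimal sketch with perfectly linear toy data:
>>> import yasa
>>> yasa.SleepStatsAgreement._linregr_dict([1, 2, 3], [2, 4, 6])["slope"]
2.0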
"""
regr = sps.linregress(*args, **kwargs)
return {
"slope": regr.slope,
"intercept": regr.intercept,
"rvalue": regr.rvalue,
"pvalue": regr.pvalue,
"stderr": regr.stderr,
"intercept_stderr": regr.intercept_stderr,
}
def _generate_bootstrap_ci(self, sleep_stats):
"""
Internal method to generate bootstrapped confidence intervals for bias and LoA.
This operates in-place by concatenating bootstrapped CIs to existing parametric CIs.
Note that parametric CIs are generated by default during init (because they are quicker).
Parameters
----------
sleep_stats : list
A list of sleep statistics to bootstrap confidence intervals for.
"""
assert isinstance(sleep_stats, list), "`sleep_stats` must be a list"
assert len(sleep_stats) == len(set(sleep_stats)), "elements of `sleep_stats` must be unique"
assert all(
isinstance(ss, str) for ss in sleep_stats
), "all elements of `sleep_stats` must be strings"
assert all(
ss in self.sleep_statistics for ss in sleep_stats
), f"all elements of `sleep_stats` must be one of {self.sleep_statistics}"
# Update bootstrap keyword arguments with defaults
bs_kwargs = {
"n_resamples": 1000,
"method": "BCa",
"confidence_level": self._confidence, # should not change from parametric level
"vectorized": False, # should stay False, bc of how the custom get_vars function works
"paired": True, # should stay True, especially if method is BCa
} | self._bootstrap_kwargs
def get_vars(ref_arr, diff_arr, rabs_arr):
"""A function to get all variables at once and avoid redundant stats.bootstrap calls."""
bias_parm = np.mean(diff_arr)
lloa_parm, uloa_parm = self._arr_to_loa(diff_arr, self._agreement)
bias_slope, bias_inter = sps.linregress(ref_arr, diff_arr)[:2]
# Note this is NOT recalculating residuals each time for the next regression
loa_slope, loa_inter = sps.linregress(ref_arr, rabs_arr)[:2]
return bias_parm, lloa_parm, uloa_parm, bias_inter, bias_slope, loa_inter, loa_slope
# !! Column order MUST match the order of arrays boot_stats expects as INPUT
# !! Variable order MUST match the order of floats boot_stats returns as OUTPUT
interval_order = ["lower", "upper"]
column_order = ["Reference", "difference", "residuals_abs"]
variable_order = [
"bias_parm",
"lloa_parm",
"uloa_parm",
"bias_intercept",
"bias_slope",
"loa_intercept",
"loa_slope",
]
boot_ci = (
self._data.loc[
sleep_stats, column_order
] # Extract the relevant sleep stats and columns
.groupby("sleep_stat") # Group so the bootstrapping is applied once to each sleep stat
# Apply the bootstrap function, where tuple(df.to_numpy().T) convert the 3 columns
# of the passed dataframe to a tuple of 3 1D arrays
.apply(lambda df: sps.bootstrap(tuple(df.to_numpy().T), get_vars, **bs_kwargs))
.map(lambda res: res.confidence_interval) # Pull high/low CIs out of the results object
.explode() # Break high and low CIs into separate rows
.to_frame("value") # Convert to dataframe and name column
.assign(interval=interval_order * len(sleep_stats)) # Add a column indicating interval
.explode("value") # Break low CI variables and high CI variables out of arrays
.assign(variable=variable_order * len(sleep_stats) * 2)  # Add a column indicating variable
.pivot(columns=["variable", "interval"], values="value") # Go long to wide format
.sort_index(axis=1) # Sort MultiIndex columns for cleanliness
)
# Merge with existing CI dataframe
self._ci["boot"] = self._ci["boot"].fillna(boot_ci)
def get_table(self, bias_method="auto", loa_method="auto", ci_method="auto", fstrings={}):
"""
Return a :py:class:`~pandas.DataFrame` with bias, limits of agreement, and their
confidence intervals rendered as formatted strings (``bias``, ``bias_ci``, ``loa``, and
``loa_ci`` columns).
Parameters
----------
bias_method : str
If ``'parm'`` (i.e., parametric), bias is always represented as the mean difference
(observed minus reference).
If ``'regr'`` (i.e., regression), bias is always represented as a regression equation.
If ``'auto'`` (default), bias is represented as a regression equation for sleep
statistics where the score differences are proportionally biased and as the mean
difference otherwise.
loa_method : str
If ``'parm'`` (i.e., parametric), limits of agreement are always represented as
bias +/- 1.96 standard deviations (where 1.96 can be adjusted through the ``agreement``
parameter).
If ``'regr'`` (i.e., regression), limits of agreement are always represented as a
regression equation.
If ``'auto'`` (default), limits of agreement are represented as a regression equation
for sleep statistics where the score differences are proportionally biased and as
bias +/- 1.96 standard deviation otherwise.
ci_method : str
If ``'parm'`` (i.e., parametric), confidence intervals are always represented using a
standard t-distribution.
If ``'boot'`` (i.e., bootstrap), confidence intervals are always represented using a
bootstrap resampling procedure.
If ``'auto'`` (default), confidence intervals are represented using a bootstrap
resampling procedure for sleep statistics where the distribution of score differences is
non-normal and using a standard t-distribution otherwise.
fstrings : dict
Optional custom format strings for cells. If provided, the dictionary must contain the
same keys as the default format strings (``'bias_parm'``, ``'bias_regr'``, ``'loa_parm'``,
``'loa_regr'``, and their ``'_ci'`` counterparts), with values given as Python format
strings evaluated against the :py:meth:`~yasa.SleepStatsAgreement.summary` values.
Returns
-------
table : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` of string representations for bias, limits of agreement,
and their confidence intervals for all sleep statistics.
"""
assert isinstance(bias_method, str), "`bias_method` must be a string"
assert (
bias_method in self._bias_method_opts
), f"`bias_method` must be one of {self._bias_method_opts}"
assert isinstance(loa_method, str), "`loa_method` must be a string"
assert (
loa_method in self._loa_method_opts
), f"`loa_method` must be one of {self._loa_method_opts}"
assert isinstance(fstrings, dict), "`fstrings` must be a dictionary"
# Agreement gets adjusted when LoA is modeled
loa_regr_agreement = self._agreement * np.sqrt(np.pi / 2)
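# For normally distributed residuals, E|residual| = sigma * sqrt(2 / pi), so scaling the
# modeled mean absolute residual by sqrt(pi / 2) recovers a standard-deviation-like spread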
if not fstrings:
fstrings = {
"bias_parm": "{bias_parm_center:.2f}",
"bias_regr": "{bias_intercept_center:.2f} + {bias_slope_center:.2f}x",
"loa_parm": "{lloa_parm_center:.2f}, {uloa_parm_center:.2f}",
"loa_regr": (
"Bias \u00B1 {loa_regr_agreement:.2f} "
"* ({loa_intercept_center:.2f} + {loa_slope_center:.2f}x)"
),
"bias_parm_ci": ("[{bias_parm_lower:.2f}, {bias_parm_upper:.2f}]"),
"bias_regr_ci": (
"[{bias_intercept_lower:.2f}, {bias_intercept_upper:.2f}], "
"[{bias_slope_lower:.2f}, {bias_slope_upper:.2f}]"
),
"loa_parm_ci": (
"[{lloa_parm_lower:.2f}, {lloa_parm_upper:.2f}], "
"[{uloa_parm_lower:.2f}, {uloa_parm_upper:.2f}]"
),
"loa_regr_ci": (
"[{loa_intercept_lower:.2f}, {loa_intercept_upper:.2f}], "
"[{loa_slope_lower:.2f}, {loa_slope_upper:.2f}]"
),
}
values = self.summary(ci_method=ci_method)
values.columns = values.columns.map("_".join) # Convert MultiIndex columns to Index
# Add a column of regr agreement so it can be used as variable
values["loa_regr_agreement"] = loa_regr_agreement
def format_all_str(row, fstrings_dict):
return {var: fstr.format(**row) for var, fstr in fstrings_dict.items()}
all_strings = values.apply(format_all_str, fstrings_dict=fstrings, axis=1).apply(pd.Series)
if bias_method == "auto":
bias_parm_idx = self.auto_methods.query("bias == 'parm'").index.tolist()
elif bias_method == "parm":
bias_parm_idx = self.sleep_statistics
elif bias_method == "regr":
bias_parm_idx = []
if loa_method == "auto":
loa_parm_idx = self.auto_methods.query("loa == 'parm'").index.tolist()
elif loa_method == "parm":
loa_parm_idx = self.sleep_statistics
elif loa_method == "regr":
loa_parm_idx = []
bias_regr_idx = [ss for ss in self.sleep_statistics if ss not in bias_parm_idx]
loa_regr_idx = [ss for ss in self.sleep_statistics if ss not in loa_parm_idx]
bias_parm = all_strings.loc[bias_parm_idx, ["bias_parm", "bias_parm_ci"]]
bias_regr = all_strings.loc[bias_regr_idx, ["bias_regr", "bias_regr_ci"]]
bias_parm.columns = bias_parm.columns.str.replace("_parm", "")
bias_regr.columns = bias_regr.columns.str.replace("_regr", "")
bias = pd.concat([bias_parm, bias_regr])
loa_parm = all_strings.loc[loa_parm_idx, ["loa_parm", "loa_parm_ci"]]
loa_regr = all_strings.loc[loa_regr_idx, ["loa_regr", "loa_regr_ci"]]
loa_parm.columns = loa_parm.columns.str.replace("_parm", "")
loa_regr.columns = loa_regr.columns.str.replace("_regr", "")
loa = pd.concat([loa_parm, loa_regr])
table = bias.join(loa, validate="1:1").sort_index(axis=0)
return table
def summary(self, ci_method="auto"):
"""
Return a :py:class:`~pandas.DataFrame` that includes all calculated metrics:
* Parametric bias
* Parametric lower and upper limits of agreement
* Regression intercept and slope for modeled bias
* Regression intercept and slope for modeled limits of agreement
* Lower and upper confidence intervals for all metrics
Parameters
----------
ci_method : str
If ``'parm'`` (i.e., parametric), confidence intervals are always represented using a
standard t-distribution.
If ``'boot'`` (i.e., bootstrap), confidence intervals are always represented using a
bootstrap resampling procedure.
If ``'auto'`` (default), confidence intervals are represented using a bootstrap
resampling procedure for sleep statistics where the distribution of score differences is
non-normal and using a standard t-distribution otherwise.
Returns
-------
summary : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with a column for each variable (parametric and
regression-modeled bias and limits of agreement) and each interval (center, lower,
and upper confidence bounds), for all sleep statistics.
"""
assert isinstance(ci_method, str), "`ci_method` must be a string"
assert ci_method in self._ci_method_opts, f"`ci_method` must be in {self._ci_method_opts}"
# Make sure relevant sleep statistics have bootstrapped CIs, and generate them if not
if ci_method in ["boot", "auto"]:
if ci_method == "boot":
sleep_stats_to_boot = self.sleep_statistics
elif ci_method == "auto":
sleep_stats_to_boot = self.auto_methods.query("ci == 'boot'").index.tolist()
# Remove any sleep stats that already have bootstrapped CIs (e.g., if "boot" is called after "auto")
sleep_stats_booted = self._ci["boot"].dropna().index
sleep_stats_to_boot = [s for s in sleep_stats_to_boot if s not in sleep_stats_booted]
if sleep_stats_to_boot:
self._generate_bootstrap_ci(sleep_stats=sleep_stats_to_boot)
if ci_method == "auto":
parm_idx = self.auto_methods.query("ci == 'parm'").index.to_list()
boot_idx = [ss for ss in self.sleep_statistics if ss not in parm_idx]
parm_vals = self._ci.loc[parm_idx, "parm"]
boot_vals = self._ci.loc[boot_idx, "boot"]
ci_vals = pd.concat([parm_vals, boot_vals])
else:
ci_vals = self._ci[ci_method]
# Add an extra level to values columns, indicating they are the center interval
centr_vals = pd.concat({"center": self._vals}, names=["interval"], axis=1).swaplevel(axis=1)
summary = centr_vals.join(ci_vals, how="left", validate="1:1").astype(float)
return summary.sort_index(axis=1)
def calibrate(self, data, bias_method="auto", adjust_all=False):
"""
Calibrate a :py:class:`~pandas.DataFrame` of sleep statistics from a new scorer based on
observed biases in ``obs_data``/``obs_scorer``.
Parameters
----------
data : :py:class:`pandas.DataFrame`
A :py:class:`pandas.DataFrame` with sleep statistics from an observed scorer.
Rows are unique observations and columns are unique sleep statistics.
bias_method : str
If ``'parm'``, sleep statistics are always adjusted based on parametric bias.
If ``'regr'``, sleep statistics are always adjusted based on regression-modeled bias.
If ``'auto'`` (default), bias sleep statistics are adjusted by either ``'parm'`` or
``'regr'``, depending on assumption violations.
.. seealso:: :py:meth:`~yasa.SleepStatsAgreement.summary`
adjust_all : bool
If False (default), only adjust values for sleep statistics that showed a statistically
significant bias in the ``obs_data``. If True, adjust values for all sleep statistics.
Returns
-------
calibrated_data : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with calibrated sleep statistics.
.. seealso:: :py:meth:`~yasa.SleepStatsAgreement.get_calibration_func`
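Examples
--------
A short sketch reusing ``ssa`` and ``new_sstats`` from the class example:
>>> ssa.calibrate(new_sstats)
>>> ssa.calibrate(new_sstats, adjust_all=True)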
"""
assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
assert all(
col in self.sleep_statistics for col in data
), f"all columns of `data` must be valid sleep statistics: {self.sleep_statistics}"
assert isinstance(bias_method, str), "`bias_method` must be a string"
assert (
bias_method in self._bias_method_opts
), f"`bias_method` must be one of {self._bias_method_opts}"
assert isinstance(adjust_all, bool), "`adjust_all` must be True or False"
parm_adjusted = data + self._vals["bias_parm"]
regr_adjusted = data * self._vals["bias_slope"] + self._vals["bias_intercept"]
if bias_method == "parm":
calibrated_data = parm_adjusted
elif bias_method == "regr":
calibrated_data = regr_adjusted
elif bias_method == "auto":
parm_idx = self.auto_methods.query("bias == 'parm'").index.to_list()
regr_idx = [ss for ss in self.sleep_statistics if ss not in parm_idx]
calibrated_data = parm_adjusted[parm_idx].join(regr_adjusted[regr_idx]).dropna(axis=1)
if not adjust_all:
# Put the raw values back for sleep stats that don't show statistical bias
unbiased_sstats = self.assumptions.query("unbiased == True").index.to_list()
calibrated_data[unbiased_sstats] = data[unbiased_sstats]
return calibrated_data
def get_calibration_func(self, sleep_stat):
"""
Return a function for calibrating a specific sleep statistic, based on observed biases in
``obs_data``/``obs_scorer``.
.. seealso:: :py:meth:`~yasa.SleepStatsAgreement.calibrate`
Examples
--------
>>> ssa = yasa.SleepStatsAgreement(...)
>>> calibrate_rem = ssa.get_calibration_func("REM")
>>> new_obs_rem_vals = np.array([50, 40, 30, 20])
>>> calibrate_rem(new_obs_rem_vals)
array([50, 40, 30, 20])
>>> calibrate_rem(new_obs_rem_vals, adjust_all=True)
array([42.825, 32.825, 22.825, 12.825])
>>> calibrate_rem(new_obs_rem_vals, adjust_all=True, method="regr")
array([ -9.33878878, -9.86815607, -10.39752335, -10.92689064])
"""
assert isinstance(sleep_stat, str), "`sleep_stat` must be a string"
assert sleep_stat in self.sleep_statistics, "`sleep_stat` must be a valid sleep statistic"
columns = ["bias_parm", "bias_slope", "bias_intercept"]
parm, slope, intercept = self._vals.loc[sleep_stat, columns]
auto_method = self.auto_methods.at[sleep_stat, "bias"]
not_biased = self.assumptions.at[sleep_stat, "unbiased"]
def calibration_func(x, method="auto", adjust_all=False):
"""Calibrate values for sleep statistic.
Parameters
----------
x : array
Values to be calibrated
method : str
Method of bias calculation for calibration (``'parm'``, ``'regr'``, or ``'auto'``).
adjust_all : bool
If False, only adjust sleep stat if observed bias was statistically significant.
Returns
-------
x_calibrated : :py:class:`numpy.array`
An array of calibrated x values.
"""
x = np.asarray(x)
method = auto_method if method == "auto" else method
if not_biased and not adjust_all:  # Return input if sleep stat is not statistically biased
return x
elif method == "parm":
return x + parm
elif method == "regr":
return x * slope + intercept
return calibration_func