"""
YASA code for evaluating the agreement between two scorers (e.g., human vs YASA), either at the
epoch-by-epoch level or at the level of summary sleep statistics.
Analyses are influenced by the standardized framework proposed in Menghini et al., 2021, SLEEP.
See the following resources:
- https://doi.org/10.1093/sleep/zsaa170
- https://sri-human-sleep.github.io/sleep-trackers-performance
- https://github.com/SRI-human-sleep/sleep-trackers-performance
"""
import logging
import numpy as np
import pandas as pd
import scipy.stats as sps
import sklearn.metrics as skm
logger = logging.getLogger("yasa")
__all__ = [
"EpochByEpochAgreement",
"SleepStatsAgreement",
]
################################################################################
# EPOCH BY EPOCH
################################################################################
class EpochByEpochAgreement:
"""Evaluate agreement between two hypnograms or two collections of hypnograms.
Evaluation includes averaged agreement scores, one-vs-rest agreement scores, agreement scores
summarized across all sleep and summarized by sleep stage, and various plotting options to
visualize the two hypnograms simultaneously. See examples for more detail.
.. versionadded:: 0.7.0
Parameters
----------
ref_hyps : iterable of :py:class:`yasa.Hypnogram`
A collection of reference hypnograms (i.e., those considered ground-truth).
Each :py:class:`yasa.Hypnogram` in ``ref_hyps`` must have the same
:py:attr:`~yasa.Hypnogram.scorer`.
If a ``dict``, key values are used to generate unique sleep session IDs. If any other
iterable (e.g., ``list`` or ``tuple``), then unique sleep session IDs are automatically
generated.
obs_hyps : iterable of :py:class:`yasa.Hypnogram`
A collection of observed hypnograms (i.e., those to be evaluated).
Each :py:class:`yasa.Hypnogram` in ``obs_hyps`` must have the same
:py:attr:`~yasa.Hypnogram.scorer`, and this scorer must be different from the scorer of
hypnograms in ``ref_hyps``.
If a ``dict``, key values must match those of ``ref_hyps``.
.. important::
It is assumed that the order of hypnograms is the same in ``ref_hyps`` and ``obs_hyps``.
For example, the third hypnogram in ``ref_hyps`` and ``obs_hyps`` must come from the same
sleep session, and they must only differ in that they have different scorers.
.. seealso:: For comparing just two hypnograms, use :py:meth:`yasa.Hypnogram.evaluate`.
Notes
-----
Many steps here are influenced by guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
References
----------
.. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
(2021). A standardized framework for testing the performance of sleep-tracking
technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
zsaa170. https://doi.org/10.1093/sleep/zsaa170
Examples
--------
>>> import yasa
>>> ref_hyps = [yasa.simulate_hypnogram(tib=600, scorer="Human", seed=i) for i in range(10)]
>>> obs_hyps = [h.simulate_similar(scorer="YASA", seed=i) for i, h in enumerate(ref_hyps)]
>>> ebe = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
>>> agr = ebe.get_agreement()
>>> agr.head(5).round(2)
accuracy balanced_acc kappa mcc precision recall f1
sleep_id
1 0.31 0.26 0.07 0.07 0.31 0.31 0.31
2 0.33 0.33 0.14 0.14 0.35 0.33 0.34
3 0.35 0.24 0.06 0.06 0.35 0.35 0.35
4 0.22 0.21 0.01 0.01 0.21 0.22 0.21
5 0.21 0.17 -0.06 -0.06 0.20 0.21 0.21
>>> ebe.get_agreement_bystage().head(12).round(3)
fbeta precision recall support
stage sleep_id
WAKE 1 0.391 0.371 0.413 189.0
2 0.299 0.276 0.326 184.0
3 0.234 0.204 0.275 255.0
4 0.268 0.285 0.252 321.0
5 0.228 0.230 0.227 181.0
6 0.407 0.384 0.433 284.0
7 0.362 0.296 0.467 287.0
8 0.298 0.519 0.209 263.0
9 0.210 0.191 0.233 313.0
10 0.369 0.420 0.329 362.0
N1 1 0.185 0.185 0.185 124.0
2 0.121 0.131 0.112 160.0
>>> ebe.get_confusion_matrix(sleep_id=1)
YASA WAKE N1 N2 N3 REM
Human
WAKE 78 24 50 3 34
N1 23 23 43 15 20
N2 60 58 183 43 139
N3 30 10 50 5 32
REM 19 9 121 50 78
.. plot::
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(figsize=(6, 3), constrained_layout=True)
>>> ebe.plot_hypnograms(sleep_id=10)
.. plot::
>>> fig, ax = plt.subplots(figsize=(6, 3))
>>> ebe.plot_hypnograms(
>>> sleep_id=8, ax=ax, obs_kwargs={"color": "red", "lw": 2, "ls": "dotted"}
>>> )
>>> plt.tight_layout()
.. plot::
>>> session = 8
>>> fig, ax = plt.subplots(figsize=(6.5, 2.5), constrained_layout=True)
>>> style_a = dict(alpha=1, lw=2.5, ls="solid", color="gainsboro", label="Michel")
>>> style_b = dict(alpha=1, lw=2.5, ls="solid", color="cornflowerblue", label="Jouvet")
>>> legend_style = dict(
>>> title="Scorer", frameon=False, ncol=2, loc="lower center", bbox_to_anchor=(0.5, 0.9)
>>> )
>>> ax = ebe.plot_hypnograms(
>>> sleep_id=session, ref_kwargs=style_a, obs_kwargs=style_b, legend=legend_style, ax=ax
>>> )
>>> acc = ebe.get_agreement().multiply(100).at[session, "accuracy"]
>>> ax.text(
>>> 0.01, 1, f"Accuracy = {acc:.0f}%", ha="left", va="bottom", transform=ax.transAxes
>>> )
When comparing only 2 hypnograms, use the :py:meth:`~yasa.Hypnogram.evaluate` method:
>>> hypno_a = yasa.simulate_hypnogram(tib=90, scorer="RaterA", seed=8)
>>> hypno_b = hypno_a.simulate_similar(scorer="RaterB", seed=9)
>>> ebe = hypno_a.evaluate(hypno_b)
>>> ebe.get_confusion_matrix()
RaterB WAKE N1 N2 N3
RaterA
WAKE 71 2 20 8
N1 1 0 9 0
N2 12 4 25 0
N3 24 0 1 3
"""
def __init__(self, ref_hyps, obs_hyps):
from yasa.hypno import Hypnogram  # Avoiding circular import, because hypno imports this class
assert hasattr(ref_hyps, "__iter__"), "`ref_hyps` must be an iterable"
assert hasattr(obs_hyps, "__iter__"), "`obs_hyps` must be an iterable"
assert type(ref_hyps) is type(obs_hyps), "`ref_hyps` and `obs_hyps` must be the same type"
assert len(ref_hyps) == len(
obs_hyps
), "`ref_hyps` and `obs_hyps` must have the same number of hypnograms"
if isinstance(ref_hyps, dict):
# If user provides dictionaries, split into sleep IDs and hypnograms
assert (
ref_hyps.keys() == obs_hyps.keys()
), "keys in `ref_hyps` must be the same as keys in `obs_hyps`"
sleep_ids, ref_hyps = zip(*ref_hyps.items())
obs_hyps = tuple(obs_hyps.values())
else:
# Create hypnogram_ids
sleep_ids = tuple(range(1, 1 + len(ref_hyps)))
assert all(
isinstance(hyp, Hypnogram) for hyp in ref_hyps + obs_hyps
), "`ref_hyps` and `obs_hyps` must only contain YASA hypnograms"
assert all(
h.scorer is not None for h in ref_hyps + obs_hyps
), "all hypnograms in `ref_hyps` and `obs_hyps` must have a scorer name"
for h1, h2 in zip((ref_hyps + obs_hyps)[:-1], (ref_hyps + obs_hyps)[1:]):
assert h1.freq == h2.freq, "all hypnograms must have the same freq"
assert h1.labels == h2.labels, "all hypnograms must have the same labels"
assert h1.mapping == h2.mapping, "all hypnograms must have the same mapping"
assert h1.n_stages == h2.n_stages, "all hypnograms must have the same n_stages"
assert all(
h1.scorer == h2.scorer for h1, h2 in zip(ref_hyps[:-1], ref_hyps[1:])
), "all `ref_hyps` must have the same scorer"
assert all(
h1.scorer == h2.scorer for h1, h2 in zip(obs_hyps[:-1], obs_hyps[1:])
), "all `obs_hyps` must have the same scorer"
assert all(
h1.scorer != h2.scorer for h1, h2 in zip(ref_hyps, obs_hyps)
), "each `ref_hyps` and `obs_hyps` pair must have unique scorers"
assert all(
h1.n_epochs == h2.n_epochs for h1, h2 in zip(ref_hyps, obs_hyps)
), "each `ref_hyps` and `obs_hyps` pair must have the same n_epochs"
# Convert ref_hyps and obs_hyps to dictionaries with sleep_id keys and hypnogram values
ref_hyps = {s: h for s, h in zip(sleep_ids, ref_hyps)}
obs_hyps = {s: h for s, h in zip(sleep_ids, obs_hyps)}
# Merge all hypnograms into a single MultiIndexed dataframe
ref = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in ref_hyps.items())
obs = pd.concat(pd.concat({s: h.as_int()}, names=["sleep_id"]) for s, h in obs_hyps.items())
data = pd.concat([ref, obs], axis=1)
# Generate some mapping dictionaries to be used later in class methods
skm_labels = np.unique(data).tolist() # all unique YASA integer codes in this hypno
skm2yasa_map = {i: l for i, l in enumerate(skm_labels)} # skm order to YASA integers
yasa2yasa_map = ref_hyps[sleep_ids[0]].mapping_int.copy() # YASA integer to YASA string
# Set attributes
self._data = data
self._sleep_ids = sleep_ids
self._ref_hyps = ref_hyps
self._obs_hyps = obs_hyps
self._ref_scorer = ref_hyps[sleep_ids[0]].scorer
self._obs_scorer = obs_hyps[sleep_ids[0]].scorer
self._skm_labels = skm_labels
self._skm2yasa_map = skm2yasa_map
self._yasa2yasa_map = yasa2yasa_map
def __repr__(self):
# TODO v0.8: Keep only the text between < and >
s = "s" if self.n_sleeps > 1 else ""
return (
f"<EpochByEpochAgreement | Observed hypnogram{s} scored by {self.obs_scorer} "
f"evaluated against reference hypnogram{s} scored by {self.ref_scorer}, "
f"{self.n_sleeps} sleep session{s}>\n"
" - Use `.get_agreement()` to get agreement measures as a pandas DataFrame or Series\n"
" - Use `.plot_hypnograms()` to plot two overlaid hypnograms\n"
"See the online documentation for more details."
)
def __str__(self):
return self.__repr__()
@property
def data(self):
"""A :py:class:`pandas.DataFrame` including all hypnograms."""
return self._data
@property
def n_sleeps(self):
"""The number of unique sleep sessions."""
return len(self._sleep_ids)
@property
def ref_scorer(self):
"""The name of the reference scorer."""
return self._ref_scorer
@property
def obs_scorer(self):
"""The name of the observed scorer."""
return self._obs_scorer
@staticmethod
def multi_scorer(df, scorers):
"""
Compute multiple agreement scores from a 2-column dataframe (an optional 3rd column may
contain sample weights).
This function offers convenience when calculating multiple agreement scores using
:py:meth:`pandas.DataFrame.groupby.apply`. Scikit-learn doesn't include a function that
returns multiple scores, and the GroupBy implementation of ``apply`` in pandas does not
accept multiple functions.
Parameters
----------
df : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with 2 columns and length of *n_samples*.
The first column contains reference values and second column contains observed values.
If a third column is present, it must contain sample weights to be passed to underlying
:py:mod:`sklearn.metrics` functions as ``sample_weight`` where applicable.
scorers : dictionary
The scorers to be used for evaluating agreement. A dictionary with scorer names (str) as
keys and functions as values.
Returns
-------
scores : dict
A dictionary with scorer names (``str``) as keys and scores (``float``) as values.
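Examples
--------
A minimal illustrative sketch with toy integer codes (the column names are arbitrary):
>>> import pandas as pd
>>> import sklearn.metrics as skm
>>> import yasa
>>> df = pd.DataFrame({"ref": [0, 1, 2, 2], "obs": [0, 1, 2, 0]})
>>> scorers = {
...     "accuracy": lambda t, p, w: skm.accuracy_score(t, p, sample_weight=w),
...     "kappa": lambda t, p, w: skm.cohen_kappa_score(t, p, sample_weight=w),
... }
>>> scores = yasa.EpochByEpochAgreement.multi_scorer(df, scorers)
>>> {k: round(v, 2) for k, v in scores.items()}
{'accuracy': 0.75, 'kappa': 0.64}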
"""
assert isinstance(df, pd.DataFrame), "`df` must be a pandas DataFrame"
assert df.shape[1] in [2, 3], "`df` must have either 2 or 3 columns"
assert isinstance(scorers, dict), "`scorers` must be a dictionary"
assert all(
isinstance(k, str) and callable(v) for k, v in scorers.items()
), "Each key of `scorers` must be a string, and each value must be a callable function"
if df.shape[1] == 3:
true, pred, weights = zip(*df.values)
elif df.shape[1] == 2:
true, pred = zip(*df.values)  # Same as (df["col1"], df["col2"]) but a teensy bit faster
weights = None
scores = {s: f(true, pred, weights) for s, f in scorers.items()}
return scores
def get_agreement(self, sample_weight=None, scorers=None):
"""
Return a :py:class:`pandas.DataFrame` of weighted (i.e., averaged) agreement scores.
Parameters
----------
self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
sample_weight : None or :py:class:`pandas.Series`
Sample weights passed to underlying :py:mod:`sklearn.metrics` functions where possible.
If a :py:class:`pandas.Series`, the index must match exactly that of
:py:attr:`~yasa.Hypnogram.data`.
scorers : None, list, or dictionary
The scorers to be used for evaluating agreement. If None (default), default scorers are
used. If a list, the list must contain strings that represent metrics from the sklearn
metrics module (e.g., ``accuracy``, ``precision``). If more customization is desired, a
dictionary can be passed with scorer names (str) as keys and custom functions as values.
The custom functions should take 3 positional arguments (true values, predicted values,
and sample weights).
Returns
-------
agreement : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with agreement metrics as columns and sessions as rows.
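Examples
--------
A short sketch of a custom ``scorers`` dictionary, reusing the ``ebe`` instance from the
class example. Each custom function receives true values, predicted values, and sample
weights as its three positional arguments:
>>> import sklearn.metrics as skm
>>> my_scorers = {"kappa": lambda t, p, w: skm.cohen_kappa_score(t, p, sample_weight=w)}
>>> ebe.get_agreement(scorers=my_scorers)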
"""
assert isinstance(
sample_weight, (type(None), pd.Series)
), "`sample_weight` must be None or pandas Series"
assert isinstance(scorers, (type(None), list, dict))
if isinstance(scorers, list):
assert all(isinstance(x, str) for x in scorers)
elif isinstance(scorers, dict):
assert all(isinstance(k, str) and callable(v) for k, v in scorers.items())
if scorers is None:
# Create dictionary of default scorer functions
scorers = {
"accuracy": lambda t, p, w: skm.accuracy_score(
t, p, normalize=True, sample_weight=w
),
"balanced_acc": lambda t, p, w: skm.balanced_accuracy_score(
t, p, adjusted=False, sample_weight=w
),
"kappa": lambda t, p, w: skm.cohen_kappa_score(
t, p, labels=None, weights=None, sample_weight=w
),
"mcc": lambda t, p, w: skm.matthews_corrcoef(t, p, sample_weight=w),
"precision": lambda t, p, w: skm.precision_score(
t, p, average="weighted", sample_weight=w, zero_division=0
),
"recall": lambda t, p, w: skm.recall_score(
t, p, average="weighted", sample_weight=w, zero_division=0
),
"f1": lambda t, p, w: skm.f1_score(
t, p, average="weighted", sample_weight=w, zero_division=0
),
}
elif isinstance(scorers, list):
# Convert the list to a dictionary of sklearn scorers
scorers = {s: getattr(skm, f"{s}_score") for s in scorers}
# Make a copy of data since weights series might be added to it
df = self.data.copy()
if sample_weight is not None:
assert sample_weight.index.equals(self.data.index), (
"If not `None`, `sample_weight` Series must be a pandas Series with the same index "
"as `self.data`"
)
# Add weights as a third column for multi_scorer to use
df["weights"] = sample_weight
# Get individual-level averaged/weighted agreement scores
agreement = df.groupby(level=0).apply(self.multi_scorer, scorers=scorers).apply(pd.Series)
# Set attribute for later access
self._agreement = agreement
# Convert to Series if just one session being evaluated
if self.n_sleeps == 1:
agreement = agreement.squeeze().rename("agreement")
return agreement
def get_agreement_bystage(self, beta=1.0):
"""
Return a :py:class:`pandas.DataFrame` of unweighted (i.e., one-vs-rest) agreement scores.
Parameters
----------
self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
beta : float
See :py:func:`sklearn.metrics.precision_recall_fscore_support`.
Returns
-------
agreement : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with agreement metrics as columns and a
:py:class:`~pandas.MultiIndex` with session and sleep stage as rows.
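Examples
--------
A short sketch reusing the ``ebe`` instance from the class example. Increasing ``beta``
weights recall more heavily than precision in the returned fbeta scores:
>>> ebe.get_agreement_bystage(beta=2).head(3)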
"""
def scorer(df):
return skm.precision_recall_fscore_support(
*df.values.T, beta=beta, labels=self._skm_labels, average=None, zero_division=0
)
agreement = (
self.data
# Get precision, recall, f1, and support for each individual sleep session
.groupby(level=0)
.apply(scorer)
# Unpack arrays
.explode()
.apply(pd.Series)
# Add metric labels column and prepend it to index, creating MultiIndex
.assign(metric=["precision", "recall", "fbeta", "support"] * self.n_sleeps)
.set_index("metric", append=True)
# Convert stage column names to string labels
.rename_axis(columns="stage")
.rename(columns=self._skm2yasa_map)
.rename(columns=self._yasa2yasa_map)
# Remove all-zero columns (i.e., stages that were not present in the hypnogram)
.pipe(lambda df: df.loc[:, df.any()])
# Reshape so metrics are columns
.stack()
.unstack("metric")
.rename_axis(columns=None)
# Swap MultiIndex levels and sort so stages are in standard YASA order
.swaplevel()
.sort_index(
level="stage",
key=lambda x: x.map(lambda y: list(self._yasa2yasa_map.values()).index(y)),
)
)
# Set attribute for later access
self._agreement_bystage = agreement
# Remove the MultiIndex if just one session being evaluated
if self.n_sleeps == 1:
agreement = agreement.reset_index(level=1, drop=True)
return agreement
def get_confusion_matrix(self, sleep_id=None, agg_func=None, **kwargs):
"""
Return a ``ref_hyp``/``obs_hyp`` confusion matrix from either a single session or all
sessions concatenated together.
Parameters
----------
self : :py:class:`yasa.EpochByEpochAgreement`
A :py:class:`yasa.EpochByEpochAgreement` instance.
sleep_id : None or a valid sleep ID
If None (default), cross-tabulation is derived from the entire group dataset.
If a valid sleep ID, cross-tabulation is derived using only the reference and observed
scored hypnograms from that sleep session.
agg_func : None or str
If None (default), group results returns a :py:class:`~pandas.DataFrame` complete with
all individual session results. If not None, group results returns a
:py:class:`~pandas.DataFrame` aggregated across sessions where ``agg_func`` is passed as
``func`` parameter in :py:meth:`pandas.DataFrame.groupby.agg`. For example, set
``agg_func="sum"`` to get a single confusion matrix across all epochs that does not take
session into account.
**kwargs : key, value pairs
Additional keyword arguments are passed to :py:func:`sklearn.metrics.confusion_matrix`.
Returns
-------
conf_matr : :py:class:`pandas.DataFrame`
A confusion matrix with stages from the reference scorer as indices and stages from the
test scorer as columns.
Examples
--------
>>> import yasa
>>> ref_hyps = [yasa.simulate_hypnogram(tib=90, scorer="Rater1", seed=i) for i in range(3)]
>>> obs_hyps = [h.simulate_similar(scorer="Rater2", seed=i) for i, h in enumerate(ref_hyps)]
>>> ebe = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
>>> ebe.get_confusion_matrix(sleep_id=2)
Rater2 WAKE N1 N2 N3 REM
Rater1
WAKE 1 2 23 0 0
N1 0 9 13 0 0
N2 0 6 71 0 0
N3 0 13 42 0 0
REM 0 0 0 0 0
>>> ebe.get_confusion_matrix()
Rater2 WAKE N1 N2 N3 REM
sleep_id Rater1
1 WAKE 30 0 3 0 35
N1 3 2 7 0 0
N2 21 12 7 0 4
N3 0 0 0 0 0
REM 2 8 29 0 17
2 WAKE 1 2 23 0 0
N1 0 9 13 0 0
N2 0 6 71 0 0
N3 0 13 42 0 0
REM 0 0 0 0 0
3 WAKE 16 0 7 19 19
N1 0 7 2 0 5
N2 0 10 12 7 5
N3 0 0 16 11 0
REM 0 15 11 18 0
>>> ebe.get_confusion_matrix(agg_func="sum")
Rater2 WAKE N1 N2 N3 REM
Rater1
WAKE 47 2 33 19 54
N1 3 18 22 0 5
N2 21 28 90 7 9
N3 0 13 58 11 0
REM 2 23 40 18 17
"""
assert (
sleep_id is None or sleep_id in self._sleep_ids
), "`sleep_id` must be None or a valid sleep ID"
assert isinstance(agg_func, (type(None), str)), "`agg_func` must be None or a str"
assert not (
(self.n_sleeps == 1 or sleep_id is not None) and agg_func is not None
), "`agg_func` must be None if plotting a single session."
kwargs = {"labels": self._skm_labels} | kwargs
# Generate a DataFrame with a confusion matrix for each session
# Seems easier to just generate this whole thing and then either
# extract a single one or aggregate across them all, depending on user request
confusion_matrices = (
self.data
# Get confusion matrix for each individual sleep session
.groupby(level=0)
.apply(lambda df: skm.confusion_matrix(*df.values.T, **kwargs))
# Expand results matrix out from single cell
.explode()
.apply(pd.Series)
# Convert to MultiIndex with reference scorer as new level
.assign(**{self.ref_scorer: self._skm_labels * self.n_sleeps})
.set_index(self.ref_scorer, append=True)
.rename_axis(columns=self.obs_scorer)
# Convert sleep stage columns and indices to strings
.rename(columns=self._skm2yasa_map)
.rename(columns=self._yasa2yasa_map)
.rename(index=self._skm2yasa_map, level=self.ref_scorer)
.rename(index=self._yasa2yasa_map, level=self.ref_scorer)
)
if self.n_sleeps == 1:
# If just one session, use the only session ID as the key, for simplified returned df
sleep_id = self._sleep_ids[0]
if sleep_id is None:
if agg_func is None:
mat = confusion_matrices
else:
mat = confusion_matrices.groupby(self.ref_scorer, sort=False).agg(agg_func)
else:
mat = confusion_matrices.loc[sleep_id]
return mat
def get_sleep_stats(self):
"""
Return a :py:class:`pandas.DataFrame` of sleep statistics for each hypnogram derived from
both reference and observed scorers.
.. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
.. seealso:: :py:class:`yasa.SleepStatsAgreement`
Parameters
----------
self : :py:class:`yasa.EpochByEpochAgreement`
A :py:class:`yasa.EpochByEpochAgreement` instance.
Returns
-------
sstats : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with sleep statistics as columns and two rows for each
individual (one for reference scorer and another for test scorer).
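Examples
--------
A short sketch reusing the ``ebe`` instance from the class example, where the observed
scorer is named "YASA":
>>> sstats = ebe.get_sleep_stats()
>>> sstats.loc["YASA"].head(2)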
"""
# Get all sleep statistics
ref_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._ref_hyps.items()})
obs_sstats = pd.DataFrame({s: h.sleep_statistics() for s, h in self._obs_hyps.items()})
# Reshape and name axis
ref_sstats = ref_sstats.T.rename_axis("sleep_id")
obs_sstats = obs_sstats.T.rename_axis("sleep_id")
# Convert to MultiIndex with new scorer level
ref_sstats = pd.concat({self.ref_scorer: ref_sstats}, names=["scorer"])
obs_sstats = pd.concat({self.obs_scorer: obs_sstats}, names=["scorer"])
# Concatenate into one DataFrame
sstats = pd.concat([ref_sstats, obs_sstats])
# Remove the MultiIndex if just one session being evaluated
if self.n_sleeps == 1:
sstats = sstats.reset_index(level=1, drop=True)
return sstats
def plot_hypnograms(self, sleep_id=None, legend=True, ax=None, ref_kwargs={}, obs_kwargs={}):
"""Plot the two hypnograms of one session overlapping on the same axis.
.. seealso:: :py:func:`yasa.plot_hypnogram`
Parameters
----------
self : :py:class:`yasa.EpochByEpochAgreement`
A :py:class:`yasa.EpochByEpochAgreement` instance.
sleep_id : a valid sleep ID or None
The sleep session to plot. If multiple sessions are included in the
:py:class:`~yasa.EpochByEpochAgreement` instance, a ``sleep_id`` must be provided. If
only one session is present, ``None`` (default) will plot the two hypnograms of the
only session.
legend : bool or dict
If True (default) or a dictionary, a legend is added. If a dictionary, all key/value
pairs are passed as keyword arguments to the :py:func:`matplotlib.pyplot.legend` call.
ax : :py:class:`matplotlib.axes.Axes` or None
Axis on which to draw the plot, optional.
ref_kwargs : dict
Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the reference
hypnogram.
obs_kwargs : dict
Keyword arguments passed to :py:func:`yasa.plot_hypnogram` when plotting the observed
hypnogram.
Returns
-------
ax : :py:class:`matplotlib.axes.Axes`
Matplotlib Axes
Examples
--------
.. plot::
>>> from yasa import simulate_hypnogram
>>> hyp = simulate_hypnogram(scorer="Anthony", seed=19)
>>> ax = hyp.evaluate(hyp.simulate_similar(scorer="Alan", seed=68)).plot_hypnograms()
"""
assert (
sleep_id is None or sleep_id in self._sleep_ids
), "`sleep_id` must be None or a valid sleep ID"
assert isinstance(legend, (bool, dict)), "`legend` must be True, False, or a dictionary"
assert isinstance(ref_kwargs, dict), "`ref_kwargs` must be a dictionary"
assert isinstance(obs_kwargs, dict), "`obs_kwargs` must be a dictionary"
assert (
"ax" not in ref_kwargs | obs_kwargs
), "'ax' can't be supplied to `ref_kwargs` or `obs_kwargs`, use the `ax` keyword instead"
assert not (sleep_id is None and self.n_sleeps > 1), (
"Multi-session plotting is not currently supported. `sleep_id` must not be None when "
"multiple sessions are present"
)
# Select the session hypnograms to plot
if sleep_id is None and self.n_sleeps == 1:
ref_hyp = self._ref_hyps[self._sleep_ids[0]]
obs_hyp = self._obs_hyps[self._sleep_ids[0]]
else:
ref_hyp = self._ref_hyps[sleep_id]
obs_hyp = self._obs_hyps[sleep_id]
# Set default plotting kwargs and merge with user kwargs
plot_ref_kwargs = {
"label": self.ref_scorer,
"highlight": None,
"color": "black",
"alpha": 0.8,
}
plot_obs_kwargs = {
"label": self.obs_scorer,
"highlight": None,
"color": "green",
"alpha": 0.8,
"ls": "dashed",
}
plot_ref_kwargs.update(ref_kwargs)
plot_obs_kwargs.update(obs_kwargs)
# Draw the hypnograms
ax = ref_hyp.plot_hypnogram(ax=ax, **plot_ref_kwargs)
ax = obs_hyp.plot_hypnogram(ax=ax, **plot_obs_kwargs)
# Add legend if desired
if legend:
if isinstance(legend, dict):
ax.legend(**legend)
else:
ax.legend()
return ax
def summary(self, by_stage=False, **kwargs):
"""Return group-level agreement scores.
Default aggregated measures are mean absolute deviation (``mad``), ``mean``, ``std``, ``min``, ``median``, and ``max``.
Parameters
----------
self : :py:class:`~yasa.evaluation.EpochByEpochAgreement`
A :py:class:`~yasa.evaluation.EpochByEpochAgreement` instance.
by_stage : bool
If ``False`` (default), ``summary`` will include agreement scores derived from
average-based metrics. If ``True``, returned ``summary`` :py:class:`~pandas.DataFrame`
will include agreement scores for each sleep stage, derived from one-vs-rest metrics.
**kwargs : key, value pairs
Additional keyword arguments are passed to :py:meth:`pandas.DataFrame.groupby.agg`.
This can be used to customize the descriptive statistics returned.
Returns
-------
summary : :py:class:`pandas.DataFrame`
A :py:class:`pandas.DataFrame` summarizing agreement scores across the entire dataset
with descriptive statistics.
Examples
--------
>>> ebe = yasa.EpochByEpochAgreement(...)
>>> agreement = ebe.get_agreement()
>>> ebe.summary()
This will give a :py:class:`~pandas.DataFrame` where each row is an agreement metric and
each column is a descriptive statistic (e.g., mean, standard deviation).
To control the descriptive statistics included as columns:
>>> ebe.summary(func=["count", "mean", "sem"])
"""
assert self.n_sleeps > 1, "Summary scores cannot be computed with only one hypnogram pair."
assert isinstance(by_stage, bool), "`by_stage` must be True or False"
if by_stage:
assert hasattr(
self, "_agreement_bystage"
), "Must run `self.get_agreement_bystage` before obtaining by_stage summary results."
else:
assert hasattr(
self, "_agreement"
), "Must run `self.get_agreement` before obtaining summary results."
# Create a function for getting mean absolute deviation
def mad(df):
return (df - df.mean()).abs().mean()
# Merge default and user kwargs
agg_kwargs = {"func": [mad, "mean", "std", "min", "median", "max"]} | kwargs
if by_stage:
summary = (
self._agreement_bystage.groupby("stage")
.agg(**agg_kwargs)
.stack(level=0)
.rename_axis(["stage", "metric"])
)
else:
summary = self._agreement.agg(**agg_kwargs).T.rename_axis("metric")
return summary
################################################################################
# SLEEP STATISTICS
################################################################################
class SleepStatsAgreement:
"""
Evaluate agreement between sleep statistics reported by two different scorers.
Evaluation includes bias and limits of agreement (as well as both their confidence intervals),
various plotting options, and calibration functions for correcting biased values from the
observed scorer.
Features include:
* Get summary calculations of bias, limits of agreement, and their confidence intervals.
* Test statistical assumptions of bias, limits of agreement, and their confidence intervals,
and apply corrective procedures when the assumptions are not met.
* Get bias and limits of agreement in a string-formatted table.
* Calibrate new data to correct for biases in observed data.
* Return individual calibration functions.
* Visualize discrepancies for outlier inspection.
* Visualize Bland-Altman plots.
.. seealso:: :py:meth:`yasa.Hypnogram.sleep_statistics`
.. versionadded:: 0.7.0
Parameters
----------
ref_data : :py:class:`pandas.DataFrame`
A :py:class:`pandas.DataFrame` with sleep statistics from the reference scorer.
Rows are unique observations and columns are unique sleep statistics.
obs_data : :py:class:`pandas.DataFrame`
A :py:class:`pandas.DataFrame` with sleep statistics from the observed scorer.
Rows are unique observations and columns are unique sleep statistics.
Shape, index, and columns must be identical to ``ref_data``.
ref_scorer : str
Name of the reference scorer.
obs_scorer : str
Name of the observed scorer.
agreement : float
Multiple of the standard deviation to plot agreement limits. The default is 1.96, which
corresponds to 95% limits of agreement if the differences are normally distributed.
.. note:: ``agreement`` gets adjusted for regression-modeled limits of agreement.
confidence : float
The confidence level used when computing confidence intervals for bias and limits of
agreement. The same confidence level is applied to both standard and bootstrapped
confidence intervals.
alpha : float
Alpha cutoff used for all assumption tests.
verbose : bool or str
Verbose level. If False, only warning and error messages are printed. The logging levels
are 'debug', 'info', 'warning', 'error', and 'critical'. For most users the choice is
between 'info' (``verbose=True``) and 'warning' (``verbose=False``).
Notes
-----
Sleep statistics that are identical between scorers are removed from analysis.
Many steps here are influenced by guidelines proposed in Menghini et al., 2021 [Menghini2021]_.
See https://sri-human-sleep.github.io/sleep-trackers-performance/AnalyticalPipeline_v1.0.0.html
References
----------
.. [Menghini2021] Menghini, L., Cellini, N., Goldstone, A., Baker, F. C., & de Zambotti, M.
(2021). A standardized framework for testing the performance of sleep-tracking
technology: step-by-step guidelines and open-source code. SLEEP, 44(2),
zsaa170. https://doi.org/10.1093/sleep/zsaa170
Examples
--------
>>> import pandas as pd
>>> import yasa
>>>
>>> # Generate fake reference and observed datasets with similar sleep statistics
>>> ref_scorer = "Henri"
>>> obs_scorer = "Piéron"
>>> ref_hyps = [yasa.simulate_hypnogram(tib=600, scorer=ref_scorer, seed=i) for i in range(20)]
>>> obs_hyps = [h.simulate_similar(scorer=obs_scorer, seed=i) for i, h in enumerate(ref_hyps)]
>>> # Generate sleep statistics from hypnograms using EpochByEpochAgreement
>>> eea = yasa.EpochByEpochAgreement(ref_hyps, obs_hyps)
>>> sstats = eea.get_sleep_stats()
>>> ref_sstats = sstats.loc[ref_scorer]
>>> obs_sstats = sstats.loc[obs_scorer]
>>> # Create SleepStatsAgreement instance
>>> ssa = yasa.SleepStatsAgreement(ref_sstats, obs_sstats)
>>> ssa.summary().round(1).head(3)
variable bias_intercept ... uloa_parm
interval center lower upper ... center lower upper
sleep_stat ...
%N1 -5.4 -13.9 3.2 ... 6.1 3.7 8.5
%N2 -27.3 -49.1 -5.6 ... 12.4 7.2 17.6
%N3 -9.1 -23.8 5.5 ... 20.4 12.6 28.3
>>> ssa.get_table().head(3)[["bias", "loa"]]
bias loa
sleep_stat
%N1 0.25 Bias ± 2.46 * (-0.00 + 1.00x)
%N2 -27.34 + 0.55x Bias ± 2.46 * (0.00 + 1.00x)
%N3 1.38 Bias ± 2.46 * (0.00 + 1.00x)
>>> ssa.assumptions.head(3)
unbiased normal constant_bias homoscedastic
sleep_stat
%N1 True True True False
%N2 True True False False
%N3 True True True False
>>> ssa.auto_methods.head(3)
bias loa ci
sleep_stat
%N1 parm regr parm
%N2 regr regr parm
%N3 parm regr parm
>>> ssa.get_table(bias_method="parm", loa_method="parm").head(3)[["bias", "loa"]]
bias loa
sleep_stat
%N1 0.25 -5.55, 6.06
%N2 -0.23 -12.87, 12.40
%N3 1.38 -17.67, 20.44
>>> new_hyps = [h.simulate_similar(scorer="Kelly", seed=i) for i, h in enumerate(obs_hyps)]
>>> new_sstats = pd.Series(new_hyps).map(lambda h: h.sleep_statistics()).apply(pd.Series)
>>> new_sstats = new_sstats[["N1", "TST", "WASO"]]
>>> new_sstats.round(1).head(5)
N1 TST WASO
0 42.5 439.5 147.5
1 84.0 550.0 38.5
2 53.5 489.0 103.0
3 57.0 469.5 120.0
4 71.0 531.0 69.0
>>> new_stats_calibrated = ssa.calibrate(new_sstats, bias_method="auto")
>>> new_stats_calibrated.round(1).head(5)
N1 TST WASO
0 42.9 433.8 150.0
1 84.4 544.2 41.0
2 53.9 483.2 105.5
3 57.4 463.8 122.5
4 71.4 525.2 71.5
.. plot::
>>> import matplotlib.pyplot as plt
>>> ax = ssa.plot_discrepancies_heatmap()
>>> ax.set_title("Sleep statistic discrepancies")
>>> plt.tight_layout()
.. plot::
>>> ssa.plot_blandaltman()
"""
def __init__(
self,
ref_data,
obs_data,
*,
ref_scorer="Reference",
obs_scorer="Observed",
agreement=1.96,
confidence=0.95,
alpha=0.05,
verbose=True,
bootstrap_kwargs={},
):
restricted_bootstrap_kwargs = ["confidence_level", "vectorized", "paired"]
assert isinstance(ref_data, pd.DataFrame), "`ref_data` must be a pandas DataFrame"
assert isinstance(obs_data, pd.DataFrame), "`obs_data` must be a pandas DataFrame"
assert np.array_equal(
ref_data.index, obs_data.index
), "`ref_data` and `obs_data` index values must be identical"
assert (
ref_data.index.name == obs_data.index.name
), "`ref_data` and `obs_data` index names must be identical"
assert np.array_equal(
ref_data.columns, obs_data.columns
), "`ref_data` and `obs_data` column values must be identical"
assert isinstance(ref_scorer, str), "`ref_scorer` must be a string"
assert isinstance(obs_scorer, str), "`obs_scorer` must be a string"
assert ref_scorer != obs_scorer, "`ref_scorer` and `obs_scorer` must be unique"
assert (
isinstance(agreement, (float, int)) and agreement > 0
), "`agreement` must be a number greater than 0"
assert (
isinstance(confidence, (float, int)) and 0 < confidence < 1
), "`confidence` must be a number between 0 and 1"
assert (
isinstance(alpha, (float, int)) and 0 <= alpha <= 1
), "`alpha` must be a number between 0 and 1 inclusive"
assert isinstance(bootstrap_kwargs, dict), "`bootstrap_kwargs` must be a dictionary"
assert all(
k not in restricted_bootstrap_kwargs for k in bootstrap_kwargs
), f"None of {restricted_bootstrap_kwargs} can be set by the user"
# If `ref_data` and `obs_data` indices are unnamed, name them
session_key = "session_id" if ref_data.index.name is None else ref_data.index.name
ref_data.index.name = obs_data.index.name = session_key
# Reshape to long format DataFrame with 2 columns (observed, reference) and MultiIndex
data = (
pd.concat([obs_data, ref_data], keys=[obs_scorer, ref_scorer], names=["scorer"])
.melt(var_name="sleep_stat", ignore_index=False)
.pivot_table(index=["sleep_stat", session_key], columns="scorer", values="value")
.rename_axis(columns=None)
.sort_index()
)
# Get scorer differences (i.e., observed minus reference)
data["difference"] = data[obs_scorer] - data[ref_scorer]
# Remove sleep statistics that have no differences between scorers
stats_rm = data.groupby("sleep_stat")["difference"].any().loc[lambda x: ~x].index.tolist()
data = data.drop(labels=stats_rm)
for s in stats_rm:
logger.warning(f"Removed {s} from evaluation because all scorings were identical.")
# Create grouper and n_sessions variables for convenience
grouper = data.groupby("sleep_stat")
n_sessions = data.index.get_level_values(session_key).nunique()
########################################################################
# Generate parametric Bias and LoA for all sleep stats
########################################################################
# Parametric Bias
parm_vals = grouper["difference"].mean().to_frame("bias_parm")
# Parametric LoA
parm_vals["lloa_parm"], parm_vals["uloa_parm"] = zip(
*grouper["difference"].apply(self._arr_to_loa, agreement=agreement)
)
########################################################################
# Generate standard CIs for parametric Bias and LoA for all sleep stats
########################################################################
# Get critical t and standard error used to calculate parametric CIs for parametric Bias/LoA
t_parm = sps.t.ppf((1 + confidence) / 2, n_sessions - 1)
sem = grouper["difference"].sem(ddof=1)
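# Following Bland & Altman, the standard error of each limit of agreement is roughly
# sqrt(3) times the standard error of the mean difference, hence the sqrt(3) factor below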
# Parametric CIs for parametric Bias and LoA
parm_ci = pd.DataFrame(
{
"bias_parm-lower": parm_vals["bias_parm"] - sem * t_parm,
"bias_parm-upper": parm_vals["bias_parm"] + sem * t_parm,
"lloa_parm-lower": parm_vals["lloa_parm"] - sem * t_parm * np.sqrt(3),
"lloa_parm-upper": parm_vals["lloa_parm"] + sem * t_parm * np.sqrt(3),
"uloa_parm-lower": parm_vals["uloa_parm"] - sem * t_parm * np.sqrt(3),
"uloa_parm-upper": parm_vals["uloa_parm"] + sem * t_parm * np.sqrt(3),
}
)
########################################################################
# Generate regression/modeled (slope and intercept) Bias and LoA for all sleep stats
########################################################################
# Run regression used to (a) model bias and (b) test for proportional/constant bias
bias_regr = grouper[[ref_scorer, "difference"]].apply(self._linregr_dict).apply(pd.Series)
# Get absolute residuals from this regression bc they are used in the next regression
idx = data.index.get_level_values("sleep_stat")
slopes = bias_regr.loc[idx, "slope"].to_numpy()
intercepts = bias_regr.loc[idx, "intercept"].to_numpy()
predicted_values = data[ref_scorer].to_numpy() * slopes + intercepts
data["residuals"] = data[obs_scorer].to_numpy() - predicted_values
data["residuals_abs"] = data["residuals"].abs()
# Run regression used to (a) model LoA and (b) test for heteroscedasticity/homoscedasticity
loa_regr = grouper[[ref_scorer, "residuals_abs"]].apply(self._linregr_dict).apply(pd.Series)
# Stack the two regression dataframes together
regr = pd.concat({"bias": bias_regr, "loa": loa_regr}, axis=0)
########################################################################
# Generate parametric CIs for regression/modeled Bias and LoA for all sleep stats
########################################################################
# Get critical t used used to calculate parametric CIs for regression Bias/LoA
t_regr = sps.t.ppf((1 + confidence) / 2, n_sessions - 2) # dof=n-2 for regression
# Parametric CIs for modeled Bias and LoA
regr_ci = pd.DataFrame(
{
"intercept-lower": regr["intercept"] - regr["intercept_stderr"] * t_regr,
"intercept-upper": regr["intercept"] + regr["intercept_stderr"] * t_regr,
"slope-lower": regr["slope"] - regr["stderr"] * t_regr,
"slope-upper": regr["slope"] + regr["stderr"] * t_regr,
}
)
########################################################################
# Test all statistical assumptions
########################################################################
assumptions = pd.DataFrame(
{
"unbiased": (
grouper["difference"].apply(lambda a: sps.ttest_1samp(a, 0).pvalue).ge(alpha)
),
"normal": grouper["difference"].apply(lambda a: sps.shapiro(a).pvalue).ge(alpha),
"constant_bias": bias_regr["pvalue"].ge(alpha),
"homoscedastic": loa_regr["pvalue"].ge(alpha),
}
)
########################################################################
# Setting attributes
########################################################################
# Merge the parametric and regression values for Bias and LoA
regr_vals = regr.unstack(0)[["slope", "intercept"]]
regr_vals.columns = regr_vals.columns.swaplevel().map("_".join)
vals = parm_vals.join(regr_vals).rename_axis("variable", axis=1)
# Merge the two CI dataframes for easier access
regr_ci = regr_ci.unstack(0)
regr_ci.columns = regr_ci.columns.swaplevel().map("_".join)
ci = parm_ci.join(regr_ci)
ci.columns = pd.MultiIndex.from_tuples(
tuples=ci.columns.str.split("-", expand=True),
names=["variable", "interval"],
)
empty_df = pd.DataFrame().reindex_like(ci)
ci = pd.concat({"parm": ci, "boot": empty_df}, names=["ci_method"], axis=1)
ci = ci.sort_index(axis=1) # Sort MultiIndex columns for cleanliness
# Set attributes
self._agreement = agreement
self._confidence = confidence
self._bootstrap_kwargs = bootstrap_kwargs
self._n_sessions = n_sessions
self._ref_scorer = ref_scorer
self._obs_scorer = obs_scorer
self._data = data
self._assumptions = assumptions
self._regr = regr
self._vals = vals
self._ci = ci
self._bias_method_opts = ["parm", "regr", "auto"]
self._loa_method_opts = ["parm", "regr", "auto"]
self._ci_method_opts = ["parm", "boot", "auto"]
@property
def ref_scorer(self):
"""The name of the reference scorer."""
return self._ref_scorer
@property
def obs_scorer(self):
"""The name of the observed scorer."""
return self._obs_scorer
@property
def n_sessions(self):
"""The number of sessions."""
return self._n_sessions
@property
def data(self):
"""A long-format :py:class:`pandas.DataFrame` containing all raw sleep statistics from
``ref_data`` and ``obs_data``.
"""
return self._data.drop(columns=["difference", "residuals", "residuals_abs"])
@property
def sleep_statistics(self):
"""Return a list of all sleep statistics included in the agreement analyses."""
return self.data.index.get_level_values("sleep_stat").unique().to_list()
@property
def assumptions(self):
"""A :py:class:`pandas.DataFrame` containing boolean values indicating the pass/fail status
of all statistical tests performed to test assumptions.
"""
return self._assumptions
@property
def auto_methods(self):
"""
A :py:class:`pandas.DataFrame` containing the methods applied when ``'auto'`` is selected.
"""
return pd.concat(
[
self.assumptions["constant_bias"].map({True: "parm", False: "regr"}).rename("bias"),
self.assumptions["homoscedastic"].map({True: "parm", False: "regr"}).rename("loa"),
self.assumptions["normal"].map({True: "parm", False: "boot"}).rename("ci"),
],
axis=1,
)
def __repr__(self):
# TODO v0.8: Keep only the text between < and >
return (
f"<SleepStatsAgreement | Observed scorer ('{self.obs_scorer}') evaluated against "
f"reference scorer ('{self.ref_scorer}'), {self.n_sessions} sleep sessions>\n"
" - Use `.summary()` to get a dataframe of bias and limits of agreement for each sleep "
"statistic\n"
" - Use `.plot_blandaltman()` to get a Bland-Altman-plot grid for sleep statistics\n"
"See the online documentation for more details."
)
def __str__(self):
return self.__repr__()
############################################################################
# Define some utility functions, mostly to aid with the use of df.apply and stats.bootstrap
############################################################################
@staticmethod
def _arr_to_loa(x, agreement):
"""Return a tuple with lower and upper limits of agreement."""
mean = np.mean(x)
bound = agreement * np.std(x, ddof=1)
return mean - bound, mean + bound
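# Worked example with hypothetical numbers: for differences with mean 2.0 and SD 5.0 at
# agreement=1.96, the bound is 1.96 * 5.0 = 9.8, giving LoA of (-7.8, 11.8)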
@staticmethod
def _linregr_dict(*args, **kwargs):
"""
A wrapper around :py:func:`scipy.stats.linregress` that returns a dictionary instead of a
named tuple. In the normally returned object, ``intercept_stderr`` is an extra field that is
not included when converting the named tuple, so this allows it to be included when using
something like groupby.
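A minimal sketch with perfectly linear toy data:
>>> import yasa
>>> yasa.SleepStatsAgreement._linregr_dict([1, 2, 3], [2, 4, 6])["slope"]
2.0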
"""
regr = sps.linregress(*args, **kwargs)
return {
"slope": regr.slope,
"intercept": regr.intercept,
"rvalue": regr.rvalue,
"pvalue": regr.pvalue,
"stderr": regr.stderr,
"intercept_stderr": regr.intercept_stderr,
}
def _generate_bootstrap_ci(self, sleep_stats):
"""
Internal method to generate bootstrapped confidence intervals for bias and LoA.
This operates in-place by concatenating bootstrapped CIs to existing parametric CIs.
Note that parametric CIs are generated by default during init (because they are quicker).
Parameters
----------
sleep_stats : list
A list of sleep statistics to bootstrap confidence intervals for.
"""
assert isinstance(sleep_stats, list), "`sleep_stats` must be a list"
assert len(sleep_stats) == len(set(sleep_stats)), "elements of `sleep_stats` must be unique"
assert all(
isinstance(ss, str) for ss in sleep_stats
), "all elements of `sleep_stats` must be strings"
assert all(
ss in self.sleep_statistics for ss in sleep_stats
), f"all elements of `sleep_stats` must be one of {self.sleep_statistics}"
# Update bootstrap keyword arguments with defaults
bs_kwargs = {
"n_resamples": 1000,
"method": "BCa",
"confidence_level": self._confidence, # should not change from parametric level
"vectorized": False, # should stay False, bc of how the custom get_vars function works
"paired": True, # should stay True, especially if method is BCa
} | self._bootstrap_kwargs
def get_vars(ref_arr, diff_arr, rabs_arr):
"""A function to get all variables at once and avoid redundant stats.bootstrap calls."""
bias_parm = np.mean(diff_arr)
lloa_parm, uloa_parm = self._arr_to_loa(diff_arr, self._agreement)
bias_slope, bias_inter = sps.linregress(ref_arr, diff_arr)[:2]
# Note this is NOT recalculating residuals each time for the next regression
loa_slope, loa_inter = sps.linregress(ref_arr, rabs_arr)[:2]
return bias_parm, lloa_parm, uloa_parm, bias_inter, bias_slope, loa_inter, loa_slope
# !! Column order MUST match the order of arrays boot_stats expects as INPUT
# !! Variable order MUST match the order of floats boot_stats returns as OUTPUT
interval_order = ["lower", "upper"]
column_order = ["Reference", "difference", "residuals_abs"]
variable_order = [
"bias_parm",
"lloa_parm",
"uloa_parm",
"bias_intercept",
"bias_slope",
"loa_intercept",
"loa_slope",
]
boot_ci = (
self._data.loc[
sleep_stats, column_order
] # Extract the relevant sleep stats and columns
.groupby("sleep_stat") # Group so the bootstrapping is applied once to each sleep stat
# Apply the bootstrap function, where tuple(df.to_numpy().T) convert the 3 columns
# of the passed dataframe to a tuple of 3 1D arrays
.apply(lambda df: sps.bootstrap(tuple(df.to_numpy().T), get_vars, **bs_kwargs))
.map(lambda res: res.confidence_interval) # Pull high/low CIs out of the results object
.explode() # Break high and low CIs into separate rows
.to_frame("value") # Convert to dataframe and name column
.assign(interval=interval_order * len(sleep_stats)) # Add a column indicating interval
.explode("value") # Break low CI variables and high CI variables out of arrays
.assign(variable=variable_order * len(sleep_stats) * 2)  # Add a column indicating variable
.pivot(columns=["variable", "interval"], values="value") # Go long to wide format
.sort_index(axis=1) # Sort MultiIndex columns for cleanliness
)
# Merge with existing CI dataframe
self._ci["boot"] = self._ci["boot"].fillna(boot_ci)
def get_table(self, bias_method="auto", loa_method="auto", ci_method="auto", fstrings={}):
"""
Return a :py:class:`~pandas.DataFrame` with bias, limits of agreement, and their
confidence intervals rendered as formatted strings (``bias``, ``bias_ci``, ``loa``, and
``loa_ci`` columns).
Parameters
----------
bias_method : str
If ``'parm'`` (i.e., parametric), bias is always represented as the mean difference
(observed minus reference).
If ``'regr'`` (i.e., regression), bias is always represented as a regression equation.
If ``'auto'`` (default), bias is represented as a regression equation for sleep
statistics where the score differences are proportionally biased and as the mean
difference otherwise.
loa_method : str
If ``'parm'`` (i.e., parametric), limits of agreement are always represented as
bias +/- 1.96 standard deviations (where 1.96 can be adjusted through the ``agreement``
parameter).
If ``'regr'`` (i.e., regression), limits of agreement are always represented as a
regression equation.
If ``'auto'`` (default), limits of agreement are represented as a regression equation
for sleep statistics where the score differences are proportionally biased and as
bias +/- 1.96 standard deviation otherwise.
ci_method : str
If ``'parm'`` (i.e., parametric), confidence intervals are always represented using a
standard t-distribution.
If ``'boot'`` (i.e., bootstrap), confidence intervals are always represented using a
bootstrap resampling procedure.
If ``'auto'`` (default), confidence intervals are represented using a bootstrap
resampling procedure for sleep statistics where the distribution of score differences is
non-normal and using a standard t-distribution otherwise.
fstrings : dict
Optional custom format strings for cells. If provided, the dictionary must contain the
same keys as the default format strings (``'bias_parm'``, ``'bias_regr'``, ``'loa_parm'``,
``'loa_regr'``, and their ``'_ci'`` counterparts), with values given as Python format
strings evaluated against the :py:meth:`~yasa.SleepStatsAgreement.summary` values.
Returns
-------
table : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` of string representations for bias, limits of agreement,
and their confidence intervals for all sleep statistics.
"""
assert isinstance(bias_method, str), "`bias_method` must be a string"
assert (
bias_method in self._bias_method_opts
), f"`bias_method` must be one of {self._bias_method_opts}"
assert isinstance(loa_method, str), "`loa_method` must be a string"
assert (
loa_method in self._loa_method_opts
), f"`loa_method` must be one of {self._loa_method_opts}"
assert isinstance(fstrings, dict), "`fstrings` must be a dictionary"
# Agreement gets adjusted when LoA is modeled
loa_regr_agreement = self._agreement * np.sqrt(np.pi / 2)
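# For normally distributed residuals, E|residual| = sigma * sqrt(2 / pi), so scaling the
# modeled mean absolute residual by sqrt(pi / 2) recovers a standard-deviation-like spread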
if not fstrings:
fstrings = {
"bias_parm": "{bias_parm_center:.2f}",
"bias_regr": "{bias_intercept_center:.2f} + {bias_slope_center:.2f}x",
"loa_parm": "{lloa_parm_center:.2f}, {uloa_parm_center:.2f}",
"loa_regr": (
"Bias \u00B1 {loa_regr_agreement:.2f} "
"* ({loa_intercept_center:.2f} + {loa_slope_center:.2f}x)"
),
"bias_parm_ci": ("[{bias_parm_lower:.2f}, {bias_parm_upper:.2f}]"),
"bias_regr_ci": (
"[{bias_intercept_lower:.2f}, {bias_intercept_upper:.2f}], "
"[{bias_slope_lower:.2f}, {bias_slope_upper:.2f}]"
),
"loa_parm_ci": (
"[{lloa_parm_lower:.2f}, {lloa_parm_upper:.2f}], "
"[{uloa_parm_lower:.2f}, {uloa_parm_upper:.2f}]"
),
"loa_regr_ci": (
"[{loa_intercept_lower:.2f}, {loa_intercept_upper:.2f}], "
"[{loa_slope_lower:.2f}, {loa_slope_upper:.2f}]"
),
}
values = self.summary(ci_method=ci_method)
values.columns = values.columns.map("_".join) # Convert MultiIndex columns to Index
# Add a column of regr agreement so it can be used as variable
values["loa_regr_agreement"] = loa_regr_agreement
def format_all_str(row, fstrings_dict):
return {var: fstr.format(**row) for var, fstr in fstrings_dict.items()}
all_strings = values.apply(format_all_str, fstrings_dict=fstrings, axis=1).apply(pd.Series)
if bias_method == "auto":
bias_parm_idx = self.auto_methods.query("bias == 'parm'").index.tolist()
elif bias_method == "parm":
bias_parm_idx = self.sleep_statistics
elif bias_method == "regr":
bias_parm_idx = []
if loa_method == "auto":
loa_parm_idx = self.auto_methods.query("loa == 'parm'").index.tolist()
elif loa_method == "parm":
loa_parm_idx = self.sleep_statistics
elif loa_method == "regr":
loa_parm_idx = []
bias_regr_idx = [ss for ss in self.sleep_statistics if ss not in bias_parm_idx]
loa_regr_idx = [ss for ss in self.sleep_statistics if ss not in loa_parm_idx]
bias_parm = all_strings.loc[bias_parm_idx, ["bias_parm", "bias_parm_ci"]]
bias_regr = all_strings.loc[bias_regr_idx, ["bias_regr", "bias_regr_ci"]]
bias_parm.columns = bias_parm.columns.str.replace("_parm", "")
bias_regr.columns = bias_regr.columns.str.replace("_regr", "")
bias = pd.concat([bias_parm, bias_regr])
loa_parm = all_strings.loc[loa_parm_idx, ["loa_parm", "loa_parm_ci"]]
loa_regr = all_strings.loc[loa_regr_idx, ["loa_regr", "loa_regr_ci"]]
loa_parm.columns = loa_parm.columns.str.replace("_parm", "")
loa_regr.columns = loa_regr.columns.str.replace("_regr", "")
loa = pd.concat([loa_parm, loa_regr])
table = bias.join(loa, validate="1:1").sort_index(axis=0)
return table
def summary(self, ci_method="auto"):
"""
Return a :py:class:`~pandas.DataFrame` that includes all calculated metrics:
* Parametric bias
* Parametric lower and upper limits of agreement
* Regression intercept and slope for modeled bias
* Regression intercept and slope for modeled limits of agreement
* Lower and upper confidence intervals for all metrics
Parameters
----------
ci_method : str
If ``'parm'`` (i.e., parametric), confidence intervals are always represented using a
standard t-distribution.
If ``'boot'`` (i.e., bootstrap), confidence intervals are always represented using a
bootstrap resampling procedure.
If ``'auto'`` (default), confidence intervals are represented using a bootstrap
resampling procedure for sleep statistics where the distribution of score differences is
non-normal and using a standard t-distribution otherwise.
Returns
-------
summary : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with a column for each variable (parametric and
regression-modeled bias and limits of agreement) and each interval (center, lower,
and upper confidence bounds), for all sleep statistics.
"""
assert isinstance(ci_method, str), "`ci_method` must be a string"
assert ci_method in self._ci_method_opts, f"`ci_method` must be in {self._ci_method_opts}"
# Make sure relevant sleep statistics have bootstrapped CIs, and generate them if not
if ci_method in ["boot", "auto"]:
if ci_method == "boot":
sleep_stats_to_boot = self.sleep_statistics
elif ci_method == "auto":
sleep_stats_to_boot = self.auto_methods.query("ci == 'boot'").index.tolist()
# Remove any sleep stats that already have bootstrapped CIs (e.g., if "boot" is called after "auto")
sleep_stats_booted = self._ci["boot"].dropna().index
sleep_stats_to_boot = [s for s in sleep_stats_to_boot if s not in sleep_stats_booted]
if sleep_stats_to_boot:
self._generate_bootstrap_ci(sleep_stats=sleep_stats_to_boot)
if ci_method == "auto":
parm_idx = self.auto_methods.query("ci == 'parm'").index.to_list()
boot_idx = [ss for ss in self.sleep_statistics if ss not in parm_idx]
parm_vals = self._ci.loc[parm_idx, "parm"]
boot_vals = self._ci.loc[boot_idx, "boot"]
ci_vals = pd.concat([parm_vals, boot_vals])
else:
ci_vals = self._ci[ci_method]
# Add an extra level to values columns, indicating they are the center interval
centr_vals = pd.concat({"center": self._vals}, names=["interval"], axis=1).swaplevel(axis=1)
summary = centr_vals.join(ci_vals, how="left", validate="1:1").astype(float)
return summary.sort_index(axis=1)
def calibrate(self, data, bias_method="auto", adjust_all=False):
"""
Calibrate a :py:class:`~pandas.DataFrame` of sleep statistics from a new scorer based on
observed biases in ``obs_data``/``obs_scorer``.
Parameters
----------
data : :py:class:`pandas.DataFrame`
A :py:class:`pandas.DataFrame` with sleep statistics from an observed scorer.
Rows are unique observations and columns are unique sleep statistics.
bias_method : str
If ``'parm'``, sleep statistics are always adjusted based on parametric bias.
If ``'regr'``, sleep statistics are always adjusted based on regression-modeled bias.
If ``'auto'`` (default), bias sleep statistics are adjusted by either ``'parm'`` or
``'regr'``, depending on assumption violations.
.. seealso:: :py:meth:`~yasa.SleepStatsAgreement.summary`
adjust_all : bool
If False (default), only adjust values for sleep statistics that showed a statistically
significant bias in the ``obs_data``. If True, adjust values for all sleep statistics.
Returns
-------
calibrated_data : :py:class:`pandas.DataFrame`
A :py:class:`~pandas.DataFrame` with calibrated sleep statistics.
.. seealso:: :py:meth:`~yasa.SleepStatsAgreement.get_calibration_func`
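Examples
--------
A short sketch reusing ``ssa`` and ``new_sstats`` from the class example:
>>> ssa.calibrate(new_sstats)
>>> ssa.calibrate(new_sstats, adjust_all=True)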
"""
assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
assert all(
col in self.sleep_statistics for col in data
), f"all columns of `data` must be valid sleep statistics: {self.sleep_statistics}"
assert isinstance(bias_method, str), "`bias_method` must be a string"
assert (
bias_method in self._bias_method_opts
), f"`bias_method` must be one of {self._bias_method_opts}"
assert isinstance(adjust_all, bool), "`adjust_all` must be True or False"
parm_adjusted = data + self._vals["bias_parm"]
regr_adjusted = data * self._vals["bias_slope"] + self._vals["bias_intercept"]
if bias_method == "parm":
calibrated_data = parm_adjusted
elif bias_method == "regr":
calibrated_data = regr_adjusted
elif bias_method == "auto":
parm_idx = self.auto_methods.query("bias == 'parm'").index.to_list()
regr_idx = [ss for ss in self.sleep_statistics if ss not in parm_idx]
calibrated_data = parm_adjusted[parm_idx].join(regr_adjusted[regr_idx]).dropna(axis=1)
if not adjust_all:
# Put the raw values back for sleep stats that don't show statistical bias
unbiased_sstats = self.assumptions.query("unbiased == True").index.to_list()
calibrated_data[unbiased_sstats] = data[unbiased_sstats]
return calibrated_data
def get_calibration_func(self, sleep_stat):
"""
Return a function for calibrating a specific sleep statistic, based on observed biases in
``obs_data``/``obs_scorer``.
.. seealso:: :py:meth:`~yasa.SleepStatsAgreement.calibrate`
Examples
--------
>>> ssa = yasa.SleepStatsAgreement(...)
>>> calibrate_rem = ssa.get_calibration_func("REM")
>>> new_obs_rem_vals = np.array([50, 40, 30, 20])
>>> calibrate_rem(new_obs_rem_vals)
array([50, 40, 30, 20])
>>> calibrate_rem(new_obs_rem_vals, adjust_all=True)
array([42.825, 32.825, 22.825, 12.825])
>>> calibrate_rem(new_obs_rem_vals, adjust_all=True, method="regr")
array([ -9.33878878, -9.86815607, -10.39752335, -10.92689064])
"""
assert isinstance(sleep_stat, str), "`sleep_stat` must be a string"
assert sleep_stat in self.sleep_statistics, "`sleep_stat` must be a valid sleep statistic"
columns = ["bias_parm", "bias_slope", "bias_intercept"]
parm, slope, intercept = self._vals.loc[sleep_stat, columns]
auto_method = self.auto_methods.at[sleep_stat, "bias"]
not_biased = self.assumptions.at[sleep_stat, "unbiased"]
def calibration_func(x, method="auto", adjust_all=False):
"""Calibrate values for sleep statistic.
Parameters
----------
x : array
Values to be calibrated
method : str
Method of bias calculation for calibration (``'parm'``, ``'regr'``, or ``'auto'``).
adjust_all : bool
If False, only adjust sleep stat if observed bias was statistically significant.
Returns
-------
x_calibrated : :py:class:`numpy.array`
An array of calibrated x values.
"""
x = np.asarray(x)
method = auto_method if method == "auto" else method
if not_biased and not adjust_all:  # Return input if sleep stat is not statistically biased
return x
elif method == "parm":
return x + parm
elif method == "regr":
return x * slope + intercept
return calibration_func