diff --git a/docs/descriptives.rst b/docs/descriptives.rst
new file mode 100644
index 0000000..d6aaf0f
--- /dev/null
+++ b/docs/descriptives.rst
@@ -0,0 +1,13 @@
+Descriptive statistics
+======================
+
+Functions
+---------
+
+.. autofunction:: yli.auto_descriptives
+
+Result classes
+--------------
+
+.. autoclass:: yli.descriptives.AutoDescriptivesResult
+ :members:
diff --git a/docs/index.rst b/docs/index.rst
index 0ae28de..9dd97b7 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -6,6 +6,7 @@ scipy-yli API reference
:caption: Contents:
general.rst
+ descriptives.rst
sig_tests.rst
regress.rst
io.rst
diff --git a/yli/__init__.py b/yli/__init__.py
index 89415d7..476f901 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -16,6 +16,7 @@
from .bayes_factors import bayesfactor_afbf
from .config import config
+from .descriptives import auto_descriptives
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
from .regress import PenalisedLogit, logit_then_regress, regress, vif
diff --git a/yli/descriptives.py b/yli/descriptives.py
new file mode 100644
index 0000000..eeb9a8e
--- /dev/null
+++ b/yli/descriptives.py
@@ -0,0 +1,140 @@
+# scipy-yli: Helpful SciPy utilities and recipes
+# Copyright © 2022 Lee Yingtong Li (RunasSudo)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import pandas as pd
+
+from .config import config
+from .utils import check_nan
+
+def auto_descriptives(df, cols, *, ordinal_range=None, ordinal_iqr=None):
+    """
+    Automatically compute descriptive summary statistics
+
+    The statistics computed are:
+
+    * For a categorical variable (bool, category or object dtype) – Counts of values
+    * For a continuous variable (any other numeric dtype) – Mean and standard deviation
+    * For an ordinal variable (numeric, listed in *ordinal_range* or *ordinal_iqr*) – Median and range or IQR
+
+    There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
+
+    :param df: Data to summarise
+    :type df: DataFrame
+    :param cols: Columns in *df* for the variables to summarise
+    :type cols: List[str]
+    :param ordinal_range: Columns in *df* to treat as ordinal, and report median and range (default: none)
+    :type ordinal_range: List[str]
+    :param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR (default: none)
+    :type ordinal_iqr: List[str]
+    :raises TypeError: If the dtype of a column in *cols* is neither categorical nor numeric
+
+    :rtype: :class:`yli.descriptives.AutoDescriptivesResult`
+    """
+
+    # None (rather than []) is the default so that no mutable object is shared between calls
+    ordinal_range = () if ordinal_range is None else ordinal_range
+    ordinal_iqr = () if ordinal_iqr is None else ordinal_iqr
+
+    result_data = []
+    result_labels = []
+
+    def add_row(label, summary, n_missing):
+        # Labels are stored as (plaintext label, HTML label); both are the same text here
+        result_labels.append((label, label))
+        result_data.append((summary, n_missing))
+
+    for col in cols:
+        data_cleaned = df[col].dropna()
+        dtype = data_cleaned.dtype
+        n_missing = len(df) - len(data_cleaned)
+
+        if isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_bool_dtype(dtype) or pd.api.types.is_object_dtype(dtype):
+            # Categorical data: colon-separated counts, in the same (sorted) order as the values in the label
+            values = sorted(data_cleaned.unique())
+            add_row(
+                '{}, {}'.format(col, ':'.join(str(v) for v in values)),
+                ':'.join(str((data_cleaned == v).sum()) for v in values),
+                n_missing,
+            )
+        elif pd.api.types.is_numeric_dtype(dtype):
+            # Checked after bool (which pandas also counts as numeric); accepts e.g. int32/float32 too
+            if col in ordinal_range:
+                # Ordinal data (report range)
+                add_row(
+                    '{}, median (range)'.format(col),
+                    '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
+                    n_missing,
+                )
+            elif col in ordinal_iqr:
+                # Ordinal data (report IQR)
+                add_row(
+                    '{}, median (IQR)'.format(col),
+                    '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
+                    n_missing,
+                )
+            else:
+                # Continuous data
+                add_row(
+                    '{}, μ (SD)'.format(col),
+                    '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
+                    n_missing,
+                )
+        else:
+            # TypeError is a subclass of Exception, so callers catching Exception are unaffected
+            raise TypeError('Unsupported dtype for auto_descriptives, {}'.format(dtype))
+
+    return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels)
+
+class AutoDescriptivesResult:
+    """
+    Result of automatically computed descriptive summary statistics
+
+    See :func:`yli.auto_descriptives`.
+
+    Results data stored within instances of this class is not intended to be directly accessed.
+    """
+
+    def __init__(self, *, result_data, result_labels):
+        # List of tuples (variable summary, missing count)
+        self._result_data = result_data
+        # List of tuples (plaintext label, HTML label)
+        self._result_labels = result_labels
+
+    def __repr__(self):
+        # Use the summary table as the repr only when config.repr_is_summary is set
+        if config.repr_is_summary:
+            return self.summary()
+        return super().__repr__()
+
+    def _repr_html_(self):
+        # NOTE(review): table markup reconstructed after the tags were lost in extraction - confirm
+        result = '<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>'
+        for data, label in zip(self._result_data, self._result_labels):
+            # One row per variable: HTML label, summary statistic, missing count
+            result += '<tr><th>{}</th><td>{}</td><td>{}</td></tr>'.format(label[1], data[0], data[1])
+        result += '</tbody></table>'
+        return result
+
+    def summary(self):
+        """
+        Return a stringified summary of the descriptive statistics
+
+        :rtype: str
+        """
+
+        # Build a table indexed by the plaintext labels and let pandas format it
+        result_labels_fmt = [r[0] for r in self._result_labels]
+        table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])
+        return str(table)
diff --git a/yli/sig_tests.py b/yli/sig_tests.py
index 528988b..364f63e 100644
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@@ -722,7 +722,7 @@ class AutoBinaryResult:
# List of tuples (first group summary, second group summary, test result)
self._result_data = result_data
- # List of row labels for the independente variables
+ # List of tuples (plaintext label, HTML label)
self._result_labels = result_labels
def __repr__(self):