Implement yli.auto_descriptives
This commit is contained in:
parent
c3eef0efea
commit
e268f385be
13
docs/descriptives.rst
Normal file
13
docs/descriptives.rst
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
Descriptive statistics
|
||||||
|
======================
|
||||||
|
|
||||||
|
Functions
|
||||||
|
---------
|
||||||
|
|
||||||
|
.. autofunction:: yli.auto_descriptives
|
||||||
|
|
||||||
|
Result classes
|
||||||
|
--------------
|
||||||
|
|
||||||
|
.. autoclass:: yli.descriptives.AutoDescriptivesResult
|
||||||
|
:members:
|
@ -6,6 +6,7 @@ scipy-yli API reference
|
|||||||
:caption: Contents:
|
:caption: Contents:
|
||||||
|
|
||||||
general.rst
|
general.rst
|
||||||
|
descriptives.rst
|
||||||
sig_tests.rst
|
sig_tests.rst
|
||||||
regress.rst
|
regress.rst
|
||||||
io.rst
|
io.rst
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
|
|
||||||
from .bayes_factors import bayesfactor_afbf
|
from .bayes_factors import bayesfactor_afbf
|
||||||
from .config import config
|
from .config import config
|
||||||
|
from .descriptives import auto_descriptives
|
||||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||||
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
||||||
from .regress import PenalisedLogit, logit_then_regress, regress, vif
|
from .regress import PenalisedLogit, logit_then_regress, regress, vif
|
||||||
|
140
yli/descriptives.py
Normal file
140
yli/descriptives.py
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
# scipy-yli: Helpful SciPy utilities and recipes
|
||||||
|
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from .config import config
|
||||||
|
from .utils import check_nan
|
||||||
|
|
||||||
|
def auto_descriptives(df, cols, *, ordinal_range=None, ordinal_iqr=None):
    """
    Automatically compute descriptive summary statistics

    The statistics computed are:

    * For a categorical variable – Counts of values
    * For a continuous variable – Mean and standard deviation
    * For an ordinal variable – Median and range or IQR

    A column is treated as categorical if its dtype is ``bool``, ``boolean``, ``category`` or ``object``, and as numeric if its dtype is ``float64``, ``int64``, ``Float64`` or ``Int64``. A numeric column is treated as continuous unless it is listed in *ordinal_range* or *ordinal_iqr* (*ordinal_range* takes precedence if a column is listed in both).

    There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.

    :param df: Data to summarise
    :type df: DataFrame
    :param cols: Columns in *df* for the variables to summarise
    :type cols: List[str]
    :param ordinal_range: Columns in *df* to treat as ordinal, and report median and range (default: no columns)
    :type ordinal_range: List[str]
    :param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR (default: no columns)
    :type ordinal_iqr: List[str]

    :raises Exception: If a column in *cols* has a dtype other than those listed above

    :rtype: :class:`yli.descriptives.AutoDescriptivesResult`
    """

    # Function-scope import: only needed to escape data-derived text in the HTML labels
    import html

    # None stands in for "no ordinal columns", avoiding shared mutable default arguments
    ordinal_range = () if ordinal_range is None else ordinal_range
    ordinal_iqr = () if ordinal_iqr is None else ordinal_iqr

    # List of tuples (variable summary, missing count)
    result_data = []
    # List of tuples (plaintext label, HTML label)
    result_labels = []

    for col in cols:
        data_cleaned = df[col].dropna()
        # Rows dropped by dropna() are reported as the missing count
        n_missing = len(df) - len(data_cleaned)

        if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
            # Categorical data
            values = sorted(data_cleaned.unique())

            # Value counts, in the same (sorted) order as the values in the label
            label = '{}, {}'.format(col, ':'.join(str(v) for v in values))
            # Column name and category values come from the data, so escape them for HTML
            label_html = html.escape(label)
            summary = ':'.join(str((data_cleaned == v).sum()) for v in values)
        elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
            if col in ordinal_range:
                # Ordinal data (report range)
                label = '{}, median (range)'.format(col)
                label_html = html.escape(label)
                summary = '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max())
            elif col in ordinal_iqr:
                # Ordinal data (report IQR)
                label = '{}, median (IQR)'.format(col)
                label_html = html.escape(label)
                summary = '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75))
            else:
                # Continuous data
                label = '{}, μ (SD)'.format(col)
                # Only the column name is escaped; the <i> markup is intentional
                label_html = '{}, <i>μ</i> (SD)'.format(html.escape(str(col)))
                summary = '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std())
        else:
            raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))

        result_labels.append((label, label_html))
        result_data.append((summary, n_missing))

    return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels)
|
||||||
|
|
||||||
|
class AutoDescriptivesResult:
    """
    Result of automatically computed descriptive summary statistics

    See :func:`yli.auto_descriptives`.

    Results data stored within instances of this class is not intended to be directly accessed.
    """

    def __init__(self, *, result_data, result_labels):
        # List of tuples (variable summary, missing count)
        self._result_data = result_data
        # List of tuples (plaintext label, HTML label)
        self._result_labels = result_labels

    def __repr__(self):
        # When config.repr_is_summary is set, the repr is the human-readable summary table
        if config.repr_is_summary:
            return self.summary()
        return super().__repr__()

    def _repr_html_(self):
        """
        Return the results as an HTML table

        One row is emitted per variable: the HTML label, the summary, and the missing count.
        The HTML label is inserted verbatim, so it must already be valid HTML.

        :rtype: str
        """

        rows = ''.join(
            '<tr><th>{}</th><td>{}</td><td>{}</td></tr>'.format(label[1], data[0], data[1])
            for data, label in zip(self._result_data, self._result_labels)
        )

        return '<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>' + rows + '</tbody></table>'

    def summary(self):
        """
        Return a stringified summary of the descriptive statistics

        :rtype: str
        """

        # Format data for output, using the plaintext labels as the row index
        result_labels_fmt = [r[0] for r in self._result_labels]
        table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])
        return str(table)
|
@ -722,7 +722,7 @@ class AutoBinaryResult:
|
|||||||
|
|
||||||
# List of tuples (first group summary, second group summary, test result)
|
# List of tuples (first group summary, second group summary, test result)
|
||||||
self._result_data = result_data
|
self._result_data = result_data
|
||||||
# List of row labels for the independent variables
|
# List of tuples (plaintext label, HTML label)
|
||||||
self._result_labels = result_labels
|
self._result_labels = result_labels
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
Loading…
Reference in New Issue
Block a user