Implement yli.auto_descriptives

2022-11-10 21:20:06 +11:00 · 2022-11-10 21:20:06 +11:00 · e268f385be
commit e268f385be
parent c3eef0efea
5 changed files with 156 additions and 1 deletions
--- a/docs/descriptives.rst
+++ b/docs/descriptives.rst
@ -0,0 +1,13 @@
 Descriptive statistics
 ======================
 Functions
 ---------
 .. autofunction:: yli.auto_descriptives
 Result classes
 --------------
 .. autoclass:: yli.descriptives.AutoDescriptivesResult
 	:members:
--- a/docs/index.rst
+++ b/docs/index.rst
@ -6,6 +6,7 @@ scipy-yli API reference
 	:caption: Contents:
 	general.rst
 	descriptives.rst
 	sig_tests.rst
 	regress.rst
 	io.rst
--- a/yli/__init__.py
+++ b/yli/__init__.py
@ -16,6 +16,7 @@
 from .bayes_factors import bayesfactor_afbf
 from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
 from .regress import PenalisedLogit, logit_then_regress, regress, vif
--- a/yli/descriptives.py
+++ b/yli/descriptives.py
@ -0,0 +1,140 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
 #   Copyright © 2022  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
 #   the Free Software Foundation, either version 3 of the License, or
 #   (at your option) any later version.
 #
 #   This program is distributed in the hope that it will be useful,
 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #   GNU Affero General Public License for more details.
 #
 #   You should have received a copy of the GNU Affero General Public License
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 import pandas as pd
 from .config import config
 from .utils import check_nan
 def auto_descriptives(df, cols, *, ordinal_range=(), ordinal_iqr=()):
 	"""
 	Automatically compute descriptive summary statistics
 	The statistics computed are:
 	* For a categorical variable – Counts of values
 	* For a continuous variable – Mean and standard deviation
 	* For an ordinal variable – Median and range or IQR
 	Columns of dtype bool, boolean, category or object are treated as categorical. Columns of any integer or floating-point dtype are treated as continuous, unless they are listed in *ordinal_range* or *ordinal_iqr*.
 	There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
 	:param df: Data to summarise
 	:type df: DataFrame
 	:param cols: Columns in *df* for the variables to summarise
 	:type cols: List[str]
 	:param ordinal_range: Columns in *df* to treat as ordinal, and report median and range
 	:type ordinal_range: List[str]
 	:param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR
 	:type ordinal_iqr: List[str]
 	:raises Exception: If a column in *cols* has a dtype which is neither categorical nor numeric
 	:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
 	"""
 	# Imported locally so that this function does not rely on changes to the module-level imports
 	import html
 	result_data = []
 	result_labels = []
 	for col in cols:
 		data_cleaned = df[col].dropna()
 		dtype = data_cleaned.dtype
 		# Number of rows dropped because the value was missing
 		num_missing = len(df) - len(data_cleaned)
 		if dtype in ('bool', 'boolean', 'category', 'object'):
 			# Categorical data: report the count of each observed value, in sorted order
 			values = sorted(data_cleaned.unique())
 			descriptor = ':'.join(str(v) for v in values)
 			# Category values are arbitrary user data, so escape them for the HTML label
 			descriptor_html = html.escape(descriptor)
 			summary = ':'.join(str((data_cleaned == v).sum()) for v in values)
 		elif pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
 			# Any integer or floating-point width is accepted (e.g. int32, float32), not only the 64-bit dtypes
 			if col in ordinal_range:
 				# Ordinal data (report range)
 				descriptor = descriptor_html = 'median (range)'
 				summary = '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max())
 			elif col in ordinal_iqr:
 				# Ordinal data (report IQR)
 				descriptor = descriptor_html = 'median (IQR)'
 				summary = '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75))
 			else:
 				# Continuous data
 				descriptor = 'μ (SD)'
 				descriptor_html = '<i>μ</i> (SD)'
 				summary = '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std())
 		else:
 			raise Exception('Unsupported dtype for auto_descriptives, {}'.format(dtype))
 		# Tuple of (plaintext label, HTML label); the column name is escaped only in the HTML variant
 		result_labels.append((
 			'{}, {}'.format(col, descriptor),
 			'{}, {}'.format(html.escape(str(col)), descriptor_html),
 		))
 		# Tuple of (variable summary, missing count)
 		result_data.append((summary, num_missing))
 	return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels)
 class AutoDescriptivesResult:
 	"""
 	Container for the descriptive summary statistics of a set of variables
 	Instances are returned by :func:`yli.auto_descriptives`.
 	The stored results are an implementation detail; use :meth:`summary` or the HTML representation rather than accessing them directly.
 	"""
 	def __init__(self, *, result_data, result_labels):
 		# One tuple (plaintext label, HTML label) per summarised variable
 		self._result_labels = result_labels
 		# One tuple (variable summary, missing count) per summarised variable
 		self._result_data = result_data
 	def __repr__(self):
 		# Use the default object representation unless summaries are enabled in the configuration
 		if not config.repr_is_summary:
 			return super().__repr__()
 		return self.summary()
 	def _repr_html_(self):
 		# HTML labels are inserted as-is, as they may contain markup
 		pieces = ['<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>']
 		for row, row_label in zip(self._result_data, self._result_labels):
 			html_label = row_label[1]
 			pieces.append('<tr><th>{}</th><td>{}</td><td>{}</td></tr>'.format(html_label, row[0], row[1]))
 		pieces.append('</tbody></table>')
 		return ''.join(pieces)
 	def summary(self):
 		"""
 		Return a stringified summary of the descriptive statistics
 		:rtype: str
 		"""
 		# Rows are indexed by the plaintext labels
 		plain_labels = [row_label[0] for row_label in self._result_labels]
 		frame = pd.DataFrame(self._result_data, columns=['', 'Missing'], index=plain_labels)
 		return str(frame)
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@ -722,7 +722,7 @@ class AutoBinaryResult:
 		# List of tuples (first group summary, second group summary, test result)
 		self._result_data = result_data
-		# List of row labels for the independente variables
+		# List of tuples (plaintext label, HTML label)
 		self._result_labels = result_labels
 	def __repr__(self):