From e268f385be712fb4cbe2d218b8939e83f8f1116c Mon Sep 17 00:00:00 2001
From: RunasSudo <runassudo@yingtongli.me>
Date: Thu, 10 Nov 2022 21:20:06 +1100
Subject: [PATCH] Implement yli.auto_descriptives

---
 docs/descriptives.rst |  13 ++++
 docs/index.rst        |   1 +
 yli/__init__.py       |   1 +
 yli/descriptives.py   | 140 ++++++++++++++++++++++++++++++++++++++++++
 yli/sig_tests.py      |   2 +-
 5 files changed, 156 insertions(+), 1 deletion(-)
 create mode 100644 docs/descriptives.rst
 create mode 100644 yli/descriptives.py

diff --git a/docs/descriptives.rst b/docs/descriptives.rst
new file mode 100644
index 0000000..d6aaf0f
--- /dev/null
+++ b/docs/descriptives.rst
@@ -0,0 +1,13 @@
+Descriptive statistics
+======================
+
+Functions
+---------
+
+.. autofunction:: yli.auto_descriptives
+
+Result classes
+--------------
+
+.. autoclass:: yli.descriptives.AutoDescriptivesResult
+	:members:
diff --git a/docs/index.rst b/docs/index.rst
index 0ae28de..9dd97b7 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -6,6 +6,7 @@ scipy-yli API reference
 	:caption: Contents:
 	
 	general.rst
+	descriptives.rst
 	sig_tests.rst
 	regress.rst
 	io.rst
diff --git a/yli/__init__.py b/yli/__init__.py
index 89415d7..476f901 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -16,6 +16,7 @@
 
 from .bayes_factors import bayesfactor_afbf
 from .config import config
+from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
 from .regress import PenalisedLogit, logit_then_regress, regress, vif
diff --git a/yli/descriptives.py b/yli/descriptives.py
new file mode 100644
index 0000000..eeb9a8e
--- /dev/null
+++ b/yli/descriptives.py
@@ -0,0 +1,140 @@
+#   scipy-yli: Helpful SciPy utilities and recipes
+#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import pandas as pd
+
+from .config import config
+from .utils import check_nan
+
+def auto_descriptives(df, cols, *, ordinal_range=(), ordinal_iqr=()):
+	"""
+	Automatically compute descriptive summary statistics
+	
+	The statistics computed are:
+	
+	* For a categorical variable – Counts of values
+	* For a continuous variable – Mean and standard deviation
+	* For an ordinal variable – Median and range or IQR
+	
+	Columns are classified by dtype: *bool*, *boolean*, *category* and *object* columns are treated as categorical; *float64*, *int64*, *Float64* and *Int64* columns are treated as continuous unless listed in *ordinal_range* or *ordinal_iqr*.
+	
+	There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
+	
+	Column names and category values are escaped in the labels used for the HTML representation.
+	
+	:param df: Data to summarise
+	:type df: DataFrame
+	:param cols: Columns in *df* for the variables to summarise
+	:type cols: List[str]
+	:param ordinal_range: Columns in *df* to treat as ordinal, and report median and range
+	:type ordinal_range: List[str]
+	:param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR
+	:type ordinal_iqr: List[str]
+	
+	:raises Exception: If a column has a dtype that is neither categorical nor one of the supported numeric dtypes
+	:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
+	"""
+	
+	# Function-scope import, used only to escape labels for the HTML table
+	import html
+	
+	result_data = []
+	result_labels = []
+	
+	# One row of the result per requested column
+	for col in cols:
+		data_cleaned = df[col].dropna()
+		# Rows removed by dropna are reported as the missing count for this variable
+		n_missing = len(df) - len(data_cleaned)
+		
+		if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
+			# Categorical data: colon-separated value counts, in sorted order of the distinct values
+			values = sorted(data_cleaned.unique())
+			label = '{}, {}'.format(col, ':'.join(str(v) for v in values))
+			label_html = html.escape(label, quote=False)
+			summary = ':'.join(str((data_cleaned == v).sum()) for v in values)
+		elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
+			# Numeric data: ordinal if listed by the caller, otherwise continuous
+			if col in ordinal_range:
+				# Ordinal data (report range)
+				label = '{}, median (range)'.format(col)
+				label_html = html.escape(label, quote=False)
+				summary = '{:.2f} ({:.2f}–{:.2f})'.format(
+					data_cleaned.median(), data_cleaned.min(), data_cleaned.max()
+				)
+			elif col in ordinal_iqr:
+				# Ordinal data (report IQR)
+				label = '{}, median (IQR)'.format(col)
+				label_html = html.escape(label, quote=False)
+				summary = '{:.2f} ({:.2f}–{:.2f})'.format(
+					data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)
+				)
+			else:
+				# Continuous data
+				label = '{}, μ (SD)'.format(col)
+				# Escape only the column name so that the <i> markup survives
+				label_html = '{}, <i>μ</i> (SD)'.format(html.escape(str(col), quote=False))
+				summary = '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std())
+		else:
+			# Any other dtype is rejected rather than guessed at
+			raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))
+		
+		# Labels are (plaintext, HTML); data are (variable summary, missing count)
+		result_labels.append((label, label_html))
+		result_data.append((summary, n_missing))
+	
+	return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels)
+
+class AutoDescriptivesResult:
+	"""
+	Result of automatically computed descriptive summary statistics
+	
+	See :func:`yli.auto_descriptives`.
+	
+	Results data stored within instances of this class is not intended to be directly accessed.
+	"""
+	
+	def __init__(self, *, result_data, result_labels):
+		# List of tuples (variable summary, missing count)
+		self._result_data = result_data
+		# List of tuples (plaintext label, HTML label)
+		self._result_labels = result_labels
+	
+	def __repr__(self):
+		if config.repr_is_summary:
+			return self.summary()
+		return super().__repr__()
+	
+	def _repr_html_(self):
+		# HTML table with one row per summarised variable; the HTML label is inserted as-is
+		rows = ''.join(
+			'<tr><th>{}</th><td>{}</td><td>{}</td></tr>'.format(label[1], data[0], data[1])
+			for data, label in zip(self._result_data, self._result_labels)
+		)
+		header = '<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>'
+		return header + rows + '</tbody></table>'
+	
+	def summary(self):
+		"""
+		Return a stringified summary of the descriptive statistics
+		
+		:rtype: str
+		"""
+		
+		# Plaintext labels index the rows; columns are the summary and the missing count
+		result_labels_fmt = [r[0] for r in self._result_labels]
+		table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])
+		return str(table)
diff --git a/yli/sig_tests.py b/yli/sig_tests.py
index 528988b..364f63e 100644
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@@ -722,7 +722,7 @@ class AutoBinaryResult:
 		
 		# List of tuples (first group summary, second group summary, test result)
 		self._result_data = result_data
-		# List of row labels for the independente variables
+		# List of tuples (plaintext label, HTML label)
 		self._result_labels = result_labels
 	
 	def __repr__(self):