scipy-yli/yli/descriptives.py
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
from scipy import stats
import seaborn as sns
from .config import config
from .utils import as_numeric, check_nan
def auto_descriptives(df, cols, *, ordinal_range=[]):
"""
Automatically compute descriptive summary statistics
The statistics computed are:

* For a categorical variable – Counts of values
* For a continuous variable – Mean and standard deviation
* For an ordinal variable – Median and IQR (default) or range

There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
:param df: Data to summarise
:type df: DataFrame
:param cols: Columns in *df* for the variables to summarise
:type cols: List[str]
:param ordinal_range: Columns of ordinal variables in *df* to report median and range (rather than IQR)
:type ordinal_range: List[str]
:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
"""
result_data = []
result_labels = []
for col in cols:
data_cleaned = df[col].dropna()
if data_cleaned.dtype == 'category' and data_cleaned.cat.ordered and data_cleaned.cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Ordinal numeric data
data_cleaned = data_cleaned.astype('float64')
if col in ordinal_range:
# Report range
result_labels.append((
'{}, median (range)'.format(col),
'{}, median (range)'.format(col),
))
result_data.append((
'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
len(df) - len(data_cleaned)
))
else:
# Report IQR
result_labels.append((
'{}, median (IQR)'.format(col),
'{}, median (IQR)'.format(col),
))
result_data.append((
'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
len(df) - len(data_cleaned)
))
elif data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data
# FIXME: Sort order
values = sorted(data_cleaned.unique())
# Value counts
result_labels.append((
'{}, {}'.format(col, ':'.join(str(v) for v in values)),
'{}, {}'.format(col, ':'.join(str(v) for v in values)),
))
result_data.append((
':'.join(str((data_cleaned == v).sum()) for v in values),
len(df) - len(data_cleaned)
))
elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Continuous data
result_labels.append((
'{}, μ (SD)'.format(col),
'{}, <i>μ</i> (SD)'.format(col),
))
result_data.append((
'{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
len(df) - len(data_cleaned)
))
else:
raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))
return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels)
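
# A minimal usage sketch of auto_descriptives (not part of the library API).
# The DataFrame and column names below are hypothetical, chosen only to exercise
# the continuous, categorical and ordinal-numeric branches above.
def _example_auto_descriptives():
	df = pd.DataFrame({
		'Age': [32.0, 45.0, None, 61.0, 58.0],                      # continuous (1 missing)
		'Sex': ['F', 'M', 'F', 'M', None],                          # categorical (1 missing)
		'Severity': pd.Categorical([1, 2, 2, 3, 1], ordered=True),  # ordinal numeric
	})
	# Report median and range (rather than IQR) for 'Severity'
	return auto_descriptives(df, ['Age', 'Sex', 'Severity'], ordinal_range=['Severity'])
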
class AutoDescriptivesResult:
"""
Result of automatically computed descriptive summary statistics
See :func:`yli.auto_descriptives`.
Results data stored within instances of this class is not intended to be directly accessed.
"""
def __init__(self, *, result_data, result_labels):
# List of tuples (variable summary, missing count)
self._result_data = result_data
# List of tuples (plaintext label, HTML label)
self._result_labels = result_labels
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
result = '<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>'
for data, label in zip(self._result_data, self._result_labels):
result += '<tr><th>{}</th><td>{}</td><td>{}</td></tr>'.format(label[1], data[0], data[1])
result += '</tbody></table>'
return result
def summary(self):
"""
Return a stringified summary of the descriptive statistics
:rtype: str
"""
# Format data for output
result_labels_fmt = [r[0] for r in self._result_labels]
table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])
return str(table)
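
# Display sketch (uses the hypothetical helper defined above): summary() returns
# the plain-text table, while evaluating the result object in Jupyter renders the
# HTML version via _repr_html_.
def _example_print_descriptives():
	result = _example_auto_descriptives()
	print(result.summary())  # one row per variable, plus a 'Missing' count column
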
def auto_correlations(df, cols):
"""
Automatically compute pairwise correlations between the given variables

Ordinal variables are coded as ranks, categorical variables with 2 categories are coded as 0/1, and categorical variables with more than 2 categories are coded as dummy variables. Pearson correlation coefficients are then computed pairwise on the coded data.

:param df: Data to compute correlations for
:type df: DataFrame
:param cols: Columns in *df* for the variables to compute correlations for
:type cols: List[str]
:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
"""
def _col_to_numeric(col):
if col.dtype == 'category' and col.cat.ordered:
# Ordinal variable
# Factorise if required
col, _ = as_numeric(col)
# Code as ranks
col[col >= 0] = stats.rankdata(col[col >= 0])
# Put NaNs back
col = col.astype('float64')
col[col < 0] = pd.NA
return col
else:
# FIXME: Bools, binary, etc.
return col
# Code columns as numeric/ranks/etc. as appropriate
df_coded = pd.DataFrame(index=df.index)
for col_name in cols:
col = df[col_name]
if col.dtype == 'category' and col.cat.ordered:
# Ordinal variable
# Factorise if required
col, _ = as_numeric(col)
# Code as ranks
col[col >= 0] = stats.rankdata(col[col >= 0])
# Put NaNs back
col = col.astype('float64')
col[col < 0] = pd.NA
df_coded[col_name] = col
elif col.dtype in ('bool', 'boolean', 'category', 'object'):
cat_values = col.dropna().unique()
if len(cat_values) == 2:
# Categorical variable with 2 categories
# Code as 0/1/NA
cat_values = sorted(cat_values)
col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
df_coded[col_name] = col
else:
# Categorical variable with >2 categories
# Create dummy variables
dummies = pd.get_dummies(col, prefix=col_name)
df_coded = df_coded.join(dummies)
else:
# Numeric variable, etc.
df_coded[col_name] = col
# Compute pairwise correlation
df_corr = pd.DataFrame(index=df_coded.columns, columns=df_coded.columns, dtype='float64')
for i, col1 in enumerate(df_coded.columns):
for col2 in df_coded.columns[:i]:
statistic = stats.pearsonr(df_coded[col1], df_coded[col2]).statistic
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
# Correlation with itself is always 1
df_corr.loc[col1, col1] = 1
return AutoCorrelationsResult(df_corr)
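
# A minimal sketch of auto_correlations on hypothetical data: 'Severity'
# (ordered categorical) is coded as ranks, 'Sex' (2 categories) as 0/1, and
# 'Site' (3 categories) as dummy variables, so the correlation matrix has one
# row/column per coded variable.
def _example_auto_correlations():
	df = pd.DataFrame({
		'Age': [32.0, 45.0, 50.0, 61.0, 58.0],
		'Sex': ['F', 'M', 'F', 'M', 'F'],
		'Severity': pd.Categorical([1, 2, 2, 3, 1], ordered=True),
		'Site': ['A', 'B', 'C', 'A', 'B'],
	})
	return auto_correlations(df, ['Age', 'Sex', 'Severity', 'Site'])
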
class AutoCorrelationsResult:
"""
Result of automatically computed pairwise correlations

See :func:`yli.auto_correlations`.

The correlation matrix is stored as a DataFrame in the *correlations* attribute.
"""
def __init__(self, correlations):
self.correlations = correlations
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
df_repr = self.correlations._repr_html_()
# Insert caption
idx_endopen = df_repr.index('>', df_repr.index('<table'))
df_repr = df_repr[:idx_endopen+1] + '<caption>Correlation Matrix</caption>' + df_repr[idx_endopen+1:]
return df_repr
def summary(self):
"""
Return a stringified summary of the correlation matrix
:rtype: str
"""
return 'Correlation Matrix\n\n' + str(self.correlations)
def plot(self):
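"""
Plot the correlation matrix as a heatmap
"""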
sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')
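
# Usage sketch (assumes matplotlib, which seaborn requires, is available):
# print the correlation matrix and draw it as a heatmap, using the hypothetical
# helper defined above.
def _example_plot_correlations():
	import matplotlib.pyplot as plt
	result = _example_auto_correlations()
	print(result.summary())  # plain-text correlation matrix
	result.plot()            # heatmap on the current matplotlib axes
	plt.show()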