#   scipy-yli: Helpful SciPy utilities and recipes
#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import pandas as pd

from .config import config
from .utils import check_nan

def auto_descriptives(df, cols, *, ordinal_range=None, ordinal_iqr=None):
	"""
	Automatically compute descriptive summary statistics
	
	The statistics computed are:
	
	* For a categorical variable – Counts of values
	* For a continuous variable – Mean and standard deviation
	* For an ordinal variable – Median and range or IQR
	
	A column is treated as categorical if its dtype is *bool*, *boolean*, *category* or *object*. A column with any integer or floating-point dtype is treated as continuous, unless it is named in *ordinal_range* or *ordinal_iqr*. These two arguments have no effect on categorical columns.
	
	There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
	
	:param df: Data to summarise
	:type df: DataFrame
	:param cols: Columns in *df* for the variables to summarise
	:type cols: List[str]
	:param ordinal_range: Columns in *df* to treat as ordinal, and report median and range (default: none)
	:type ordinal_range: List[str]
	:param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR (default: none)
	:type ordinal_iqr: List[str]
	
	:raises Exception: If a column has a dtype which is neither categorical nor integer/floating-point
	
	:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
	"""
	
	# Function-scope import so the module-level import block is left untouched
	from html import escape
	
	# None (rather than a shared mutable list) is the "no ordinal columns" default
	if ordinal_range is None:
		ordinal_range = ()
	if ordinal_iqr is None:
		ordinal_iqr = ()
	
	result_data = []
	result_labels = []
	
	for col in cols:
		data_cleaned = df[col].dropna()
		dtype = data_cleaned.dtype
		
		# Number of nan values which were dropped for this variable
		num_missing = len(df) - len(data_cleaned)
		
		if dtype in ('bool', 'boolean', 'category', 'object'):
			# Categorical data
			# FIXME: Sort order
			values = sorted(data_cleaned.unique())
			
			# Value counts, in the same order as the values listed in the label
			label_plain = '{}, {}'.format(col, ':'.join(str(v) for v in values))
			# Column names and category values are arbitrary text, so escape them for the HTML label
			label_html = escape(label_plain)
			summary = ':'.join(str((data_cleaned == v).sum()) for v in values)
		elif pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
			# Numeric data of any integer/float width, including nullable extension dtypes
			if col in ordinal_range:
				# Ordinal data (report range)
				stat_plain = stat_html = 'median (range)'
				summary = '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max())
			elif col in ordinal_iqr:
				# Ordinal data (report IQR)
				stat_plain = stat_html = 'median (IQR)'
				summary = '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75))
			else:
				# Continuous data
				stat_plain = 'μ (SD)'
				stat_html = '<i>μ</i> (SD)'
				summary = '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std())
			
			label_plain = '{}, {}'.format(col, stat_plain)
			# Only the column name is escaped; the statistic name may contain intentional markup
			label_html = '{}, {}'.format(escape(str(col)), stat_html)
		else:
			raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))
		
		result_labels.append((label_plain, label_html))
		result_data.append((summary, num_missing))
	
	return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels)

class AutoDescriptivesResult:
	"""
	Container for automatically computed descriptive summary statistics
	
	Produced by :func:`yli.auto_descriptives`. Holds one row per summarised variable, which can be rendered as plain text via :meth:`summary` or as an HTML table via ``_repr_html_``.
	
	Results data stored within instances of this class is not intended to be directly accessed.
	"""
	
	def __init__(self, *, result_data, result_labels):
		# One (plaintext label, HTML label) tuple per variable
		self._result_labels = result_labels
		# One (variable summary, missing count) tuple per variable, in the same order as the labels
		self._result_data = result_data
	
	def __repr__(self):
		# When so configured, the repr is the human-readable summary rather than the default object repr
		return self.summary() if config.repr_is_summary else super().__repr__()
	
	def _repr_html_(self):
		# Render a three-column HTML table: label, summary statistic, missing count
		table_open = '<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>'
		table_close = '</tbody></table>'
		row_template = '<tr><th>{}</th><td>{}</td><td>{}</td></tr>'
		
		# The HTML variant of each label (index 1) is used for the row header
		rows = [
			row_template.format(labels[1], values[0], values[1])
			for values, labels in zip(self._result_data, self._result_labels)
		]
		
		return table_open + ''.join(rows) + table_close
	
	def summary(self):
		"""
		Return a stringified summary of the descriptive statistics
		
		:rtype: str
		"""
		
		# The plaintext variant of each label (index 0) becomes the row index
		plaintext_labels = [labels[0] for labels in self._result_labels]
		frame = pd.DataFrame(self._result_data, columns=['', 'Missing'], index=plaintext_labels)
		return str(frame)