In auto_descriptives, autodetect ordinal variables based on category dtype

2022-12-03 22:19:24 +11:00 · 2022-12-03 22:19:24 +11:00 · 0fa261498a
commit 0fa261498a
parent 5633a191f1
1 changed files with 37 additions and 36 deletions
--- a/yli/descriptives.py
+++ b/yli/descriptives.py
@@ -19,7 +19,7 @@ import pandas as pd
 from .config import config
 from .utils import check_nan

-def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
+def auto_descriptives(df, cols, *, ordinal_range=[]):
 	"""
 	Automatically compute descriptive summary statistics
 	
@@ -27,7 +27,7 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
 	
 	* For a categorical variable – Counts of values
 	* For a continuous variable – Mean and standard deviation
-	* For an ordinal variable – Median and range or IQR
+	* For an ordinal variable – Median and IQR (default) or range
 	
 	There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
 	
@@ -35,10 +35,8 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
 	:type df: DataFrame
 	:param cols: Columns in *df* for the variables to summarise
 	:type cols: List[str]
-	:param ordinal_range: Columns in *df* to treat as ordinal, and report median and range
+	:param ordinal_range: Columns of ordinal variables in *df* to report median and range (rather than IQR)
 	:type ordinal_range: List[str]
-	:param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR
-	:type ordinal_iqr: List[str]
 	
 	:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
 	"""
@@ -49,7 +47,31 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
 	for col in cols:
 		data_cleaned = df[col].dropna()
 		
-		if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
+		if data_cleaned.dtype == 'category' and data_cleaned.cat.ordered and data_cleaned.cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
+			# Ordinal numeric data
+			data_cleaned = data_cleaned.astype('float64')
+			
+			if col in ordinal_range:
+				# Report range
+				result_labels.append((
+					'{}, median (range)'.format(col),
+					'{}, median (range)'.format(col),
+				))
+				result_data.append((
+					'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
+					len(df) - len(data_cleaned)
+				))
+			else:
+				# Report IQR
+				result_labels.append((
+					'{}, median (IQR)'.format(col),
+					'{}, median (IQR)'.format(col),
+				))
+				result_data.append((
+					'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
+					len(df) - len(data_cleaned)
+				))
+		elif data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
 			# Categorical data
 			# FIXME: Sort order
 			values = sorted(data_cleaned.unique())
@@ -64,36 +86,15 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
 				len(df) - len(data_cleaned)
 			))
 		elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
-			if col in ordinal_range:
-				# Ordinal data (report range)
-				result_labels.append((
-					'{}, median (range)'.format(col),
-					'{}, median (range)'.format(col),
-				))
-				result_data.append((
-					'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
-					len(df) - len(data_cleaned)
-				))
-			elif col in ordinal_iqr:
-				# Ordinal data (report IQR)
-				result_labels.append((
-					'{}, median (IQR)'.format(col),
-					'{}, median (IQR)'.format(col),
-				))
-				result_data.append((
-					'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
-					len(df) - len(data_cleaned)
-				))
-			else:
-				# Continuous data
-				result_labels.append((
-					'{}, μ (SD)'.format(col),
-					'{}, <i>μ</i> (SD)'.format(col),
-				))
-				result_data.append((
-					'{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
-					len(df) - len(data_cleaned)
-				))
+			# Continuous data
+			result_labels.append((
+				'{}, μ (SD)'.format(col),
+				'{}, <i>μ</i> (SD)'.format(col),
+			))
+			result_data.append((
+				'{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
+				len(df) - len(data_cleaned)
+			))
 		else:
 			raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))