Allow cols_for_formula and reference category detection to support explicit contrasts

2022-10-15 01:09:40 +11:00 · 2022-10-15 01:09:40 +11:00 · d1249914ff
commit d1249914ff
parent 0391877296
3 changed files with 133 additions and 31 deletions
--- a/tests/test_regress.py
+++ b/tests/test_regress.py
@ -21,6 +21,7 @@ import pandas as pd
 import statsmodels.api as sm

 import yli
+from yli.regress import CategoricalTerm

 def test_regress_ols_ol11_4():
 	"""Compare yli.regress for Ott & Longnecker (2016) example 11.4/11.7"""
@ -122,6 +123,31 @@ def test_regress_logit_ol12_23():
 	assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
 	assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)

+def test_regress_logit_ol10_18():
+	"""Compare odds ratios via yli.regress for Ott & Longnecker (2016) example 10.18"""
+	
+	data = [
+		(False, False, 250),
+		(True, False, 750),
+		(False, True, 400),
+		(True, True, 1600)
+	]
+	
+	df = pd.DataFrame({
+		'Response': np.repeat([d[0] for d in data], [d[2] for d in data]),
+		'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
+	})
+	
+	result = yli.regress(sm.Logit, df, 'Stress', 'Response')
+	
+	assert isinstance(result.terms['Response'], CategoricalTerm)
+	assert result.terms['Response'].ref_category == False
+	
+	expbeta = np.exp(result.terms['Response'].categories['True'].beta)
+	assert expbeta.point == approx(1.333, abs=0.001)
+	assert expbeta.ci_lower == approx(1.113, abs=0.001)
+	assert expbeta.ci_upper == approx(1.596, abs=0.001)
+
 def test_regress_penalisedlogit_kleinman():
 	"""Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html"""
 	
--- a/yli/regress.py
+++ b/yli/regress.py
@ -27,7 +27,7 @@ import itertools

 from .bayes_factors import BayesFactor, bayesfactor_afbf
 from .sig_tests import FTestResult
-from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category
+from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category, parse_patsy_term

 def vif(df, formula=None, nan_policy='warn'):
 	"""
@ -38,7 +38,7 @@ def vif(df, formula=None, nan_policy='warn'):
 	
 	if formula:
 		# Only consider columns in the formula
-		df = df[cols_for_formula(formula)]
+		df = df[cols_for_formula(formula, df)]
 	
 	# Check for/clean NaNs
 	df = check_nan(df, nan_policy)
@ -352,7 +352,7 @@ def regress(
 			exp = False
 	
 	# Check for/clean NaNs
-	df = df[[dep] + cols_for_formula(formula)]
+	df = df[[dep] + cols_for_formula(formula, df)]
 	df = check_nan(df, nan_policy)
 	
 	# Ensure numeric type for dependent variable
@ -386,25 +386,22 @@ def regress(
 			# Intercept term (single term)
 			term = '(Intercept)'
 			terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
-		elif '[T.' in raw_name:
-			# Categorical term
-			term = raw_name[:raw_name.index('[T.')]
-			category = raw_name[raw_name.index('[T.')+3:raw_name.index(']')]
-			
-			patsy_factor = term
-			if term.startswith('C('):
-				term = term[2:-1]
-			
-			# Add a new categorical term if not exists
-			if term not in terms:
-				ref_category = formula_factor_ref_category(formula, df, patsy_factor)
-				terms[term] = CategoricalTerm({}, ref_category)
-			
-			terms[term].categories[category] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
 		else:
-			# Single term
-			term = raw_name
-			terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
+			# Parse if required
+			factor, column, contrast = parse_patsy_term(formula, df, raw_name)
+			
+			if contrast is not None:
+				# Categorical term
+				# Add a new categorical term if not exists
+				if column not in terms:
+					ref_category = formula_factor_ref_category(formula, df, factor)
+					terms[column] = CategoricalTerm({}, ref_category)
+				
+				terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
+			else:
+				# Single term
+				term = raw_name
+				terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
 	
 	# Fit null model (for llnull)
 	if hasattr(result, 'llnull'):
--- a/yli/utils.py
+++ b/yli/utils.py
@ -130,7 +130,7 @@ class Estimate:
 # --------------------------
 # Patsy formula manipulation

-def cols_for_formula(formula):
+def cols_for_formula(formula, df):
 	"""Return the columns corresponding to the Patsy formula"""
 	
 	# Parse the formula
@ -141,24 +141,103 @@ def cols_for_formula(formula):
 	for term in model_desc.rhs_termlist:
 		for factor in term.factors:
 			name = factor.name()
-			if '(' in name:
-				# FIXME: Is there a better way of doing this?
-				# FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y))
-				name = name[name.index('(')+1:name.index(')')]
+			if name.startswith('C('):
+				# Contrasts expression
+				# Get the corresponding factor_info
+				factor_info = formula_get_factor_info(formula, df, name)
+				
+				# Evaluate the factor
+				categorical_box = factor_info.factor.eval(factor_info.state, df)
+				
+				# Get the column name
+				name = categorical_box.data.name
 			
 			cols.add(name)
 	
 	return list(cols)

-def formula_factor_ref_category(formula, df, factor):
-	"""Get the reference category for a term in a Patsy formula referring to a categorical factor"""
+def formula_get_factor_info(formula, df, factor):
+	"""Get the FactorInfo for a factor in a Patsy formula"""
 	
 	# Parse the formula
 	design_info = patsy.dmatrix(formula, df).design_info
 	
 	# Get the corresponding factor_info
 	factor_info = next(v for k, v in design_info.factor_infos.items() if k.name() == factor)
+	return factor_info

-	# FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y))
-	categories = factor_info.categories
-	return categories[0]
+def formula_factor_ref_category(formula, df, factor):
+	"""Get the reference category for a term in a Patsy formula referring to a categorical factor"""
+	
+	if '(' in factor and not factor.startswith('C('):
+		raise Exception('Attempted to get reference category for unknown expression type "{}"'.format(factor))
+	
+	# Get the factor_info
+	factor_info = formula_get_factor_info(formula, df, factor)
+	
+	if '(' not in factor:
+		# C(...) is not specified, so must be default
+		return factor_info.categories[0]
+	
+	# Evaluate the factor
+	categorical_box = factor_info.factor.eval(factor_info.state, df)
+	
+	if categorical_box.contrast is None or categorical_box.contrast is patsy.Treatment:
+		# Default Treatment contrast with default reference group: first category
+		return factor_info.categories[0]
+	
+	if isinstance(categorical_box.contrast, patsy.Treatment):
+		if categorical_box.contrast.reference is None:
+			# Default reference group: first category
+			return factor_info.categories[0]
+		
+		# Specified reference group
+		return categorical_box.contrast.reference
+	
+	raise Exception('Attempted to get reference category for unknown contrast type {}'.format(categorical_box.contrast.__class__.__name__))
+
+def parse_patsy_term(formula, df, term):
+	"""
+	Parse a Patsy term into its component parts
+	
+	Returns: factor, column, contrast
+	e.g. "C(x, Treatment(y))[T.z]" -> "C(x, Treatment(y))", "x", "z"
+	"""
+	
+	if '(' not in term:
+		if '[' in term:
+			if '[T.' not in term:
+				raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term))
+			
+			# Treatment contrast term
+			factor = term[:term.index('[T.')]
+			contrast = term[term.index('[T.')+3:term.index(']')]
+			
+			return factor, factor, contrast
+		else:
+			# Nothing special
+			return term, term, None
+	
+	# Term contains '('
+	
+	if not term.startswith('C('):
+		raise Exception('Attempted to parse term for unknown expression type "{}"'.format(term))
+	
+	if '[' in term:
+		if '[T.' not in term:
+			raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term))
+		
+		# Treatment contrast term
+		factor = term[:term.index('[T.')]
+		contrast = term[term.index('[T.')+3:term.index(']')]
+	else:
+		# Not a treatment contrast (I think this is impossible?)
+		raise Exception('Attempted to parse unsupported contrast-like term with no contrasts')
+	
+	factor_inner = factor[factor.index('(')+1:factor.rindex(')')]
+	if ',' in factor_inner:
+		column = factor_inner[:factor_inner.index(',')]
+	else:
+		column = factor_inner
+	
+	return factor, column, contrast