Allow cols_for_formula and reference category detection to support explicit contrasts

2022-10-15 01:09:40 +11:00 · 2022-10-15 01:09:40 +11:00 · d1249914ff
commit d1249914ff
parent 0391877296
3 changed files with 133 additions and 31 deletions
--- a/tests/test_regress.py
+++ b/tests/test_regress.py
@ -21,6 +21,7 @@ import pandas as pd
 import statsmodels.api as sm
 import yli
 from yli.regress import CategoricalTerm
 def test_regress_ols_ol11_4():
 	"""Compare yli.regress for Ott & Longnecker (2016) example 11.4/11.7"""
@ -122,6 +123,31 @@ def test_regress_logit_ol12_23():
 	assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
 	assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
 def test_regress_logit_ol10_18():
 	"""Check the odds ratio from yli.regress against Ott & Longnecker (2016) example 10.18"""
 	# Each row is (Response, Stress, number of observations with that combination)
 	table = [
 		(False, False, 250),
 		(True, False, 750),
 		(False, True, 400),
 		(True, True, 1600)
 	]
 	# Expand the table to one data frame row per observation
 	counts = [count for _, _, count in table]
 	df = pd.DataFrame({
 		'Response': np.repeat([response for response, _, _ in table], counts),
 		'Stress': np.repeat([stress for _, stress, _ in table], counts)
 	})
 	result = yli.regress(sm.Logit, df, 'Stress', 'Response')
 	# The boolean predictor should be reported as categorical, with False as the reference level
 	response_term = result.terms['Response']
 	assert isinstance(response_term, CategoricalTerm)
 	assert response_term.ref_category == False
 	# Exponentiate the coefficient for the True level to obtain the odds ratio
 	odds_ratio = np.exp(response_term.categories['True'].beta)
 	assert odds_ratio.point == approx(1.333, abs=0.001)
 	assert odds_ratio.ci_lower == approx(1.113, abs=0.001)
 	assert odds_ratio.ci_upper == approx(1.596, abs=0.001)
 def test_regress_penalisedlogit_kleinman():
 	"""Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html"""
--- a/yli/regress.py
+++ b/yli/regress.py
@ -27,7 +27,7 @@ import itertools
 from .bayes_factors import BayesFactor, bayesfactor_afbf
 from .sig_tests import FTestResult
-from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category
+from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category, parse_patsy_term
 def vif(df, formula=None, nan_policy='warn'):
 	"""
@ -38,7 +38,7 @@ def vif(df, formula=None, nan_policy='warn'):
 	if formula:
 		# Only consider columns in the formula
-		df = df[cols_for_formula(formula)]
+		df = df[cols_for_formula(formula, df)]
 	# Check for/clean NaNs
 	df = check_nan(df, nan_policy)
@ -352,7 +352,7 @@ def regress(
 			exp = False
 	# Check for/clean NaNs
-	df = df[[dep] + cols_for_formula(formula)]
+	df = df[[dep] + cols_for_formula(formula, df)]
 	df = check_nan(df, nan_policy)
 	# Ensure numeric type for dependent variable
@ -386,25 +386,22 @@ def regress(
 			# Intercept term (single term)
 			term = '(Intercept)'
 			terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
 		elif '[T.' in raw_name:
 			# Categorical term
 			term = raw_name[:raw_name.index('[T.')]
 			category = raw_name[raw_name.index('[T.')+3:raw_name.index(']')]
 			patsy_factor = term
 			if term.startswith('C('):
 				term = term[2:-1]
 			# Add a new categorical term if not exists
 			if term not in terms:
 				ref_category = formula_factor_ref_category(formula, df, patsy_factor)
 				terms[term] = CategoricalTerm({}, ref_category)
 			terms[term].categories[category] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
 		else:
-			# Single term
+			# Parse if required
-			term = raw_name
+			factor, column, contrast = parse_patsy_term(formula, df, raw_name)
-			terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
+			
 			if contrast is not None:
 				# Categorical term
 				# Add a new categorical term if not exists
 				if column not in terms:
 					ref_category = formula_factor_ref_category(formula, df, factor)
 					terms[column] = CategoricalTerm({}, ref_category)
 				terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
 			else:
 				# Single term
 				term = raw_name
 				terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
 	# Fit null model (for llnull)
 	if hasattr(result, 'llnull'):
--- a/yli/utils.py
+++ b/yli/utils.py
@ -130,7 +130,7 @@ class Estimate:
 # --------------------------
 # Patsy formula manipulation
-def cols_for_formula(formula):
+def cols_for_formula(formula, df):
 	"""Return the columns corresponding to the Patsy formula"""
 	# Parse the formula
@ -141,24 +141,103 @@ def cols_for_formula(formula):
 	for term in model_desc.rhs_termlist:
 		for factor in term.factors:
 			name = factor.name()
-			if '(' in name:
+			if name.startswith('C('):
-				# FIXME: Is there a better way of doing this?
+				# Contrasts expression
-				# FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y))
+				# Get the corresponding factor_info
-				name = name[name.index('(')+1:name.index(')')]
+				factor_info = formula_get_factor_info(formula, df, name)
 				# Evaluate the factor
 				categorical_box = factor_info.factor.eval(factor_info.state, df)
 				# Get the column name
 				name = categorical_box.data.name
 			cols.add(name)
 	return list(cols)
-def formula_factor_ref_category(formula, df, factor):
+def formula_get_factor_info(formula, df, factor):
-	"""Get the reference category for a term in a Patsy formula referring to a categorical factor"""
+	"""Get the FactorInfo for a factor in a Patsy formula"""
 	# Parse the formula
 	design_info = patsy.dmatrix(formula, df).design_info
 	# Get the corresponding factor_info
 	factor_info = next(v for k, v in design_info.factor_infos.items() if k.name() == factor)
 	return factor_info
 def formula_factor_ref_category(formula, df, factor):
 	"""Get the reference category for a term in a Patsy formula referring to a categorical factor"""
-	# FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y))
+	if '(' in factor and not factor.startswith('C('):
-	categories = factor_info.categories
+		raise Exception('Attempted to get reference category for unknown expression type "{}"'.format(factor))
-	return categories[0]
+	
 	# Get the factor_info
 	factor_info = formula_get_factor_info(formula, df, factor)
 	if '(' not in factor:
 		# C(...) is not specified, so must be default
 		return factor_info.categories[0]
 	# Evaluate the factor
 	categorical_box = factor_info.factor.eval(factor_info.state, df)
 	if categorical_box.contrast is None or categorical_box.contrast is patsy.Treatment:
 		# Default Treatment contrast with default reference group: first category
 		return factor_info.categories[0]
 	if isinstance(categorical_box.contrast, patsy.Treatment):
 		if categorical_box.contrast.reference is None:
 			# Default reference group: first category
 			return factor_info.categories[0]
 		# Specified reference group
 		return categorical_box.contrast.reference
 	raise Exception('Attempted to get reference category for unknown contrast type {}'.format(categorical_box.contrast.__class__.__name__))
 def parse_patsy_term(formula, df, term):
 	"""
 	Parse a Patsy term into its component parts
 	Returns: factor, column, contrast
 	e.g. "C(x, Treatment(y))[T.z]" -> "C(x, Treatment(y))", "x", "z"
 	e.g. "x[T.z]" -> "x", "x", "z"
 	e.g. "x" -> "x", "x", None
 	Only plain names and C(...) expressions are supported, optionally followed by a treatment contrast suffix "[T.level]"
 	formula and df are accepted but not consulted: parsing is purely textual on term
 	Raises Exception for any other kind of expression or contrast
 	Raises ValueError if the "[T." suffix or the "C(" expression is never closed
 	"""
 	is_expression = '(' in term
 	if is_expression and not term.startswith('C('):
 		raise Exception('Attempted to parse term for unknown expression type "{}"'.format(term))
 	if '[' not in term:
 		if is_expression:
 			# Not a treatment contrast (I think this is impossible?)
 			raise Exception('Attempted to parse unsupported contrast-like term with no contrasts')
 		# Nothing special
 		return term, term, None
 	marker = term.find('[T.')
 	if marker < 0:
 		raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term))
 	# Treatment contrast term
 	# Look for the closing bracket only after the "[T." marker, as the C(...) expression may itself contain brackets, e.g. C(d['x'])[T.z]
 	closing = term.index(']', marker)
 	factor = term[:marker]
 	contrast = term[marker+3:closing]
 	if not is_expression:
 		# Plain column name: the factor is the column
 		return factor, factor, contrast
 	# The column is the first argument to C(...)
 	factor_inner = factor[factor.index('(')+1:factor.rindex(')')]
 	column = factor_inner.partition(',')[0]
 	return factor, column, contrast