Allow cols_for_formula and reference category detection to support explicit contrasts

This commit is contained in:
RunasSudo 2022-10-15 01:09:40 +11:00
parent 0391877296
commit d1249914ff
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
3 changed files with 133 additions and 31 deletions

View File

@ -21,6 +21,7 @@ import pandas as pd
import statsmodels.api as sm
import yli
from yli.regress import CategoricalTerm
def test_regress_ols_ol11_4():
"""Compare yli.regress for Ott & Longnecker (2016) example 11.4/11.7"""
@ -122,6 +123,31 @@ def test_regress_logit_ol12_23():
assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
def test_regress_logit_ol10_18():
"""Compare odds ratios via yli.regress for Ott & Longnecker (2016) example 10.18"""
data = [
(False, False, 250),
(True, False, 750),
(False, True, 400),
(True, True, 1600)
]
df = pd.DataFrame({
'Response': np.repeat([d[0] for d in data], [d[2] for d in data]),
'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
})
result = yli.regress(sm.Logit, df, 'Stress', 'Response')
assert isinstance(result.terms['Response'], CategoricalTerm)
assert result.terms['Response'].ref_category == False
expbeta = np.exp(result.terms['Response'].categories['True'].beta)
assert expbeta.point == approx(1.333, abs=0.001)
assert expbeta.ci_lower == approx(1.113, abs=0.001)
assert expbeta.ci_upper == approx(1.596, abs=0.001)
def test_regress_penalisedlogit_kleinman():
"""Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html"""

View File

@ -27,7 +27,7 @@ import itertools
from .bayes_factors import BayesFactor, bayesfactor_afbf
from .sig_tests import FTestResult
from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category
from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category, parse_patsy_term
def vif(df, formula=None, nan_policy='warn'):
"""
@ -38,7 +38,7 @@ def vif(df, formula=None, nan_policy='warn'):
if formula:
# Only consider columns in the formula
df = df[cols_for_formula(formula)]
df = df[cols_for_formula(formula, df)]
# Check for/clean NaNs
df = check_nan(df, nan_policy)
@ -352,7 +352,7 @@ def regress(
exp = False
# Check for/clean NaNs
df = df[[dep] + cols_for_formula(formula)]
df = df[[dep] + cols_for_formula(formula, df)]
df = check_nan(df, nan_policy)
# Ensure numeric type for dependent variable
@ -386,25 +386,22 @@ def regress(
# Intercept term (single term)
term = '(Intercept)'
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
elif '[T.' in raw_name:
# Categorical term
term = raw_name[:raw_name.index('[T.')]
category = raw_name[raw_name.index('[T.')+3:raw_name.index(']')]
patsy_factor = term
if term.startswith('C('):
term = term[2:-1]
# Add a new categorical term if not exists
if term not in terms:
ref_category = formula_factor_ref_category(formula, df, patsy_factor)
terms[term] = CategoricalTerm({}, ref_category)
terms[term].categories[category] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
else:
# Single term
term = raw_name
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
# Parse if required
factor, column, contrast = parse_patsy_term(formula, df, raw_name)
if contrast is not None:
# Categorical term
# Add a new categorical term if not exists
if column not in terms:
ref_category = formula_factor_ref_category(formula, df, factor)
terms[column] = CategoricalTerm({}, ref_category)
terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
else:
# Single term
term = raw_name
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
# Fit null model (for llnull)
if hasattr(result, 'llnull'):

View File

@ -130,7 +130,7 @@ class Estimate:
# --------------------------
# Patsy formula manipulation
def cols_for_formula(formula):
def cols_for_formula(formula, df):
"""Return the columns corresponding to the Patsy formula"""
# Parse the formula
@ -141,24 +141,103 @@ def cols_for_formula(formula):
for term in model_desc.rhs_termlist:
for factor in term.factors:
name = factor.name()
if '(' in name:
# FIXME: Is there a better way of doing this?
# FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y))
name = name[name.index('(')+1:name.index(')')]
if name.startswith('C('):
# Contrasts expression
# Get the corresponding factor_info
factor_info = formula_get_factor_info(formula, df, name)
# Evaluate the factor
categorical_box = factor_info.factor.eval(factor_info.state, df)
# Get the column name
name = categorical_box.data.name
cols.add(name)
return list(cols)
def formula_factor_ref_category(formula, df, factor):
"""Get the reference category for a term in a Patsy formula referring to a categorical factor"""
def formula_get_factor_info(formula, df, factor):
"""Get the FactorInfo for a factor in a Patsy formula"""
# Parse the formula
design_info = patsy.dmatrix(formula, df).design_info
# Get the corresponding factor_info
factor_info = next(v for k, v in design_info.factor_infos.items() if k.name() == factor)
return factor_info
def formula_factor_ref_category(formula, df, factor):
"""Get the reference category for a term in a Patsy formula referring to a categorical factor"""
# FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y))
categories = factor_info.categories
return categories[0]
if '(' in factor and not factor.startswith('C('):
raise Exception('Attempted to get reference category for unknown expression type "{}"'.format(factor))
# Get the factor_info
factor_info = formula_get_factor_info(formula, df, factor)
if '(' not in factor:
# C(...) is not specified, so must be default
return factor_info.categories[0]
# Evaluate the factor
categorical_box = factor_info.factor.eval(factor_info.state, df)
if categorical_box.contrast is None or categorical_box.contrast is patsy.Treatment:
# Default Treatment contrast with default reference group: first category
return factor_info.categories[0]
if isinstance(categorical_box.contrast, patsy.Treatment):
if categorical_box.contrast.reference is None:
# Default reference group: first category
return factor_info.categories[0]
# Specified reference group
return categorical_box.contrast.reference
raise Exception('Attempted to get reference category for unknown contrast type {}'.format(categorical_box.contrast.__class__.__name__))
def parse_patsy_term(formula, df, term):
"""
Parse a Patsy term into its component parts
Returns: factor, column, contrast
e.g. "C(x, Treatment(y))[T.z]" -> "C(x, Treatment(y))", "x", "z"
"""
if '(' not in term:
if '[' in term:
if '[T.' not in term:
raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term))
# Treatment contrast term
factor = term[:term.index('[T.')]
contrast = term[term.index('[T.')+3:term.index(']')]
return factor, factor, contrast
else:
# Nothing special
return term, term, None
# Term contains '('
if not term.startswith('C('):
raise Exception('Attempted to parse term for unknown expression type "{}"'.format(term))
if '[' in term:
if '[T.' not in term:
raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term))
# Treatment contrast term
factor = term[:term.index('[T.')]
contrast = term[term.index('[T.')+3:term.index(']')]
else:
# Not a treatment contrast (I think this is impossible?)
raise Exception('Attempted to parse unsupported contrast-like term with no contrasts')
factor_inner = factor[factor.index('(')+1:factor.rindex(')')]
if ',' in factor_inner:
column = factor_inner[:factor_inner.index(',')]
else:
column = factor_inner
return factor, column, contrast