Allow cols_for_formula and reference category detection to support explicit contrasts
This commit is contained in:
parent
0391877296
commit
d1249914ff
@ -21,6 +21,7 @@ import pandas as pd
|
||||
import statsmodels.api as sm
|
||||
|
||||
import yli
|
||||
from yli.regress import CategoricalTerm
|
||||
|
||||
def test_regress_ols_ol11_4():
|
||||
"""Compare yli.regress for Ott & Longnecker (2016) example 11.4/11.7"""
|
||||
@ -122,6 +123,31 @@ def test_regress_logit_ol12_23():
|
||||
assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
|
||||
assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
|
||||
|
||||
def test_regress_logit_ol10_18():
|
||||
"""Compare odds ratios via yli.regress for Ott & Longnecker (2016) example 10.18"""
|
||||
|
||||
data = [
|
||||
(False, False, 250),
|
||||
(True, False, 750),
|
||||
(False, True, 400),
|
||||
(True, True, 1600)
|
||||
]
|
||||
|
||||
df = pd.DataFrame({
|
||||
'Response': np.repeat([d[0] for d in data], [d[2] for d in data]),
|
||||
'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
|
||||
})
|
||||
|
||||
result = yli.regress(sm.Logit, df, 'Stress', 'Response')
|
||||
|
||||
assert isinstance(result.terms['Response'], CategoricalTerm)
|
||||
assert result.terms['Response'].ref_category == False
|
||||
|
||||
expbeta = np.exp(result.terms['Response'].categories['True'].beta)
|
||||
assert expbeta.point == approx(1.333, abs=0.001)
|
||||
assert expbeta.ci_lower == approx(1.113, abs=0.001)
|
||||
assert expbeta.ci_upper == approx(1.596, abs=0.001)
|
||||
|
||||
def test_regress_penalisedlogit_kleinman():
|
||||
"""Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html"""
|
||||
|
||||
|
@ -27,7 +27,7 @@ import itertools
|
||||
|
||||
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
||||
from .sig_tests import FTestResult
|
||||
from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category
|
||||
from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category, parse_patsy_term
|
||||
|
||||
def vif(df, formula=None, nan_policy='warn'):
|
||||
"""
|
||||
@ -38,7 +38,7 @@ def vif(df, formula=None, nan_policy='warn'):
|
||||
|
||||
if formula:
|
||||
# Only consider columns in the formula
|
||||
df = df[cols_for_formula(formula)]
|
||||
df = df[cols_for_formula(formula, df)]
|
||||
|
||||
# Check for/clean NaNs
|
||||
df = check_nan(df, nan_policy)
|
||||
@ -352,7 +352,7 @@ def regress(
|
||||
exp = False
|
||||
|
||||
# Check for/clean NaNs
|
||||
df = df[[dep] + cols_for_formula(formula)]
|
||||
df = df[[dep] + cols_for_formula(formula, df)]
|
||||
df = check_nan(df, nan_policy)
|
||||
|
||||
# Ensure numeric type for dependent variable
|
||||
@ -386,25 +386,22 @@ def regress(
|
||||
# Intercept term (single term)
|
||||
term = '(Intercept)'
|
||||
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||
elif '[T.' in raw_name:
|
||||
# Categorical term
|
||||
term = raw_name[:raw_name.index('[T.')]
|
||||
category = raw_name[raw_name.index('[T.')+3:raw_name.index(']')]
|
||||
|
||||
patsy_factor = term
|
||||
if term.startswith('C('):
|
||||
term = term[2:-1]
|
||||
|
||||
# Add a new categorical term if not exists
|
||||
if term not in terms:
|
||||
ref_category = formula_factor_ref_category(formula, df, patsy_factor)
|
||||
terms[term] = CategoricalTerm({}, ref_category)
|
||||
|
||||
terms[term].categories[category] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||
else:
|
||||
# Single term
|
||||
term = raw_name
|
||||
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||
# Parse if required
|
||||
factor, column, contrast = parse_patsy_term(formula, df, raw_name)
|
||||
|
||||
if contrast is not None:
|
||||
# Categorical term
|
||||
# Add a new categorical term if not exists
|
||||
if column not in terms:
|
||||
ref_category = formula_factor_ref_category(formula, df, factor)
|
||||
terms[column] = CategoricalTerm({}, ref_category)
|
||||
|
||||
terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||
else:
|
||||
# Single term
|
||||
term = raw_name
|
||||
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||
|
||||
# Fit null model (for llnull)
|
||||
if hasattr(result, 'llnull'):
|
||||
|
99
yli/utils.py
99
yli/utils.py
@ -130,7 +130,7 @@ class Estimate:
|
||||
# --------------------------
|
||||
# Patsy formula manipulation
|
||||
|
||||
def cols_for_formula(formula):
|
||||
def cols_for_formula(formula, df):
|
||||
"""Return the columns corresponding to the Patsy formula"""
|
||||
|
||||
# Parse the formula
|
||||
@ -141,24 +141,103 @@ def cols_for_formula(formula):
|
||||
for term in model_desc.rhs_termlist:
|
||||
for factor in term.factors:
|
||||
name = factor.name()
|
||||
if '(' in name:
|
||||
# FIXME: Is there a better way of doing this?
|
||||
# FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y))
|
||||
name = name[name.index('(')+1:name.index(')')]
|
||||
if name.startswith('C('):
|
||||
# Contrasts expression
|
||||
# Get the corresponding factor_info
|
||||
factor_info = formula_get_factor_info(formula, df, name)
|
||||
|
||||
# Evaluate the factor
|
||||
categorical_box = factor_info.factor.eval(factor_info.state, df)
|
||||
|
||||
# Get the column name
|
||||
name = categorical_box.data.name
|
||||
|
||||
cols.add(name)
|
||||
|
||||
return list(cols)
|
||||
|
||||
def formula_factor_ref_category(formula, df, factor):
|
||||
"""Get the reference category for a term in a Patsy formula referring to a categorical factor"""
|
||||
def formula_get_factor_info(formula, df, factor):
|
||||
"""Get the FactorInfo for a factor in a Patsy formula"""
|
||||
|
||||
# Parse the formula
|
||||
design_info = patsy.dmatrix(formula, df).design_info
|
||||
|
||||
# Get the corresponding factor_info
|
||||
factor_info = next(v for k, v in design_info.factor_infos.items() if k.name() == factor)
|
||||
return factor_info
|
||||
|
||||
# FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y))
|
||||
categories = factor_info.categories
|
||||
return categories[0]
|
||||
def formula_factor_ref_category(formula, df, factor):
|
||||
"""Get the reference category for a term in a Patsy formula referring to a categorical factor"""
|
||||
|
||||
if '(' in factor and not factor.startswith('C('):
|
||||
raise Exception('Attempted to get reference category for unknown expression type "{}"'.format(factor))
|
||||
|
||||
# Get the factor_info
|
||||
factor_info = formula_get_factor_info(formula, df, factor)
|
||||
|
||||
if '(' not in factor:
|
||||
# C(...) is not specified, so must be default
|
||||
return factor_info.categories[0]
|
||||
|
||||
# Evaluate the factor
|
||||
categorical_box = factor_info.factor.eval(factor_info.state, df)
|
||||
|
||||
if categorical_box.contrast is None or categorical_box.contrast is patsy.Treatment:
|
||||
# Default Treatment contrast with default reference group: first category
|
||||
return factor_info.categories[0]
|
||||
|
||||
if isinstance(categorical_box.contrast, patsy.Treatment):
|
||||
if categorical_box.contrast.reference is None:
|
||||
# Default reference group: first category
|
||||
return factor_info.categories[0]
|
||||
|
||||
# Specified reference group
|
||||
return categorical_box.contrast.reference
|
||||
|
||||
raise Exception('Attempted to get reference category for unknown contrast type {}'.format(categorical_box.contrast.__class__.__name__))
|
||||
|
||||
def parse_patsy_term(formula, df, term):
|
||||
"""
|
||||
Parse a Patsy term into its component parts
|
||||
|
||||
Returns: factor, column, contrast
|
||||
e.g. "C(x, Treatment(y))[T.z]" -> "C(x, Treatment(y))", "x", "z"
|
||||
"""
|
||||
|
||||
if '(' not in term:
|
||||
if '[' in term:
|
||||
if '[T.' not in term:
|
||||
raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term))
|
||||
|
||||
# Treatment contrast term
|
||||
factor = term[:term.index('[T.')]
|
||||
contrast = term[term.index('[T.')+3:term.index(']')]
|
||||
|
||||
return factor, factor, contrast
|
||||
else:
|
||||
# Nothing special
|
||||
return term, term, None
|
||||
|
||||
# Term contains '('
|
||||
|
||||
if not term.startswith('C('):
|
||||
raise Exception('Attempted to parse term for unknown expression type "{}"'.format(term))
|
||||
|
||||
if '[' in term:
|
||||
if '[T.' not in term:
|
||||
raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term))
|
||||
|
||||
# Treatment contrast term
|
||||
factor = term[:term.index('[T.')]
|
||||
contrast = term[term.index('[T.')+3:term.index(']')]
|
||||
else:
|
||||
# Not a treatment contrast (I think this is impossible?)
|
||||
raise Exception('Attempted to parse unsupported contrast-like term with no contrasts')
|
||||
|
||||
factor_inner = factor[factor.index('(')+1:factor.rindex(')')]
|
||||
if ',' in factor_inner:
|
||||
column = factor_inner[:factor_inner.index(',')]
|
||||
else:
|
||||
column = factor_inner
|
||||
|
||||
return factor, column, contrast
|
||||
|
Loading…
Reference in New Issue
Block a user