Allow cols_for_formula and reference category detection to support explicit contrasts
This commit is contained in:
parent
0391877296
commit
d1249914ff
@ -21,6 +21,7 @@ import pandas as pd
|
|||||||
import statsmodels.api as sm
|
import statsmodels.api as sm
|
||||||
|
|
||||||
import yli
|
import yli
|
||||||
|
from yli.regress import CategoricalTerm
|
||||||
|
|
||||||
def test_regress_ols_ol11_4():
|
def test_regress_ols_ol11_4():
|
||||||
"""Compare yli.regress for Ott & Longnecker (2016) example 11.4/11.7"""
|
"""Compare yli.regress for Ott & Longnecker (2016) example 11.4/11.7"""
|
||||||
@ -122,6 +123,31 @@ def test_regress_logit_ol12_23():
|
|||||||
assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
|
assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
|
||||||
assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
|
assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
|
||||||
|
|
||||||
|
def test_regress_logit_ol10_18():
    """Check the odds ratio from yli.regress against Ott & Longnecker (2016) example 10.18"""

    # Each row of the 2x2 table: (Response, Stress, number of observations)
    table = [
        (False, False, 250),
        (True, False, 750),
        (False, True, 400),
        (True, True, 1600)
    ]
    responses, stresses, counts = zip(*table)

    # Expand the table into one row per observation
    df = pd.DataFrame({
        'Response': np.repeat(responses, counts),
        'Stress': np.repeat(stresses, counts)
    })

    # Stress is the dependent variable, Response is the (boolean) predictor
    result = yli.regress(sm.Logit, df, 'Stress', 'Response')

    # The boolean predictor must be reported as a categorical term with False as reference
    response_term = result.terms['Response']
    assert isinstance(response_term, CategoricalTerm)
    assert response_term.ref_category == False

    # Exponentiate the coefficient for the True category to get the odds ratio
    odds_ratio = np.exp(response_term.categories['True'].beta)
    assert odds_ratio.point == approx(1.333, abs=0.001)
    assert odds_ratio.ci_lower == approx(1.113, abs=0.001)
    assert odds_ratio.ci_upper == approx(1.596, abs=0.001)
|
||||||
|
|
||||||
def test_regress_penalisedlogit_kleinman():
|
def test_regress_penalisedlogit_kleinman():
|
||||||
"""Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html"""
|
"""Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html"""
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ import itertools
|
|||||||
|
|
||||||
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
||||||
from .sig_tests import FTestResult
|
from .sig_tests import FTestResult
|
||||||
from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category
|
from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category, parse_patsy_term
|
||||||
|
|
||||||
def vif(df, formula=None, nan_policy='warn'):
|
def vif(df, formula=None, nan_policy='warn'):
|
||||||
"""
|
"""
|
||||||
@ -38,7 +38,7 @@ def vif(df, formula=None, nan_policy='warn'):
|
|||||||
|
|
||||||
if formula:
|
if formula:
|
||||||
# Only consider columns in the formula
|
# Only consider columns in the formula
|
||||||
df = df[cols_for_formula(formula)]
|
df = df[cols_for_formula(formula, df)]
|
||||||
|
|
||||||
# Check for/clean NaNs
|
# Check for/clean NaNs
|
||||||
df = check_nan(df, nan_policy)
|
df = check_nan(df, nan_policy)
|
||||||
@ -352,7 +352,7 @@ def regress(
|
|||||||
exp = False
|
exp = False
|
||||||
|
|
||||||
# Check for/clean NaNs
|
# Check for/clean NaNs
|
||||||
df = df[[dep] + cols_for_formula(formula)]
|
df = df[[dep] + cols_for_formula(formula, df)]
|
||||||
df = check_nan(df, nan_policy)
|
df = check_nan(df, nan_policy)
|
||||||
|
|
||||||
# Ensure numeric type for dependent variable
|
# Ensure numeric type for dependent variable
|
||||||
@ -386,25 +386,22 @@ def regress(
|
|||||||
# Intercept term (single term)
|
# Intercept term (single term)
|
||||||
term = '(Intercept)'
|
term = '(Intercept)'
|
||||||
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||||
elif '[T.' in raw_name:
|
|
||||||
# Categorical term
|
|
||||||
term = raw_name[:raw_name.index('[T.')]
|
|
||||||
category = raw_name[raw_name.index('[T.')+3:raw_name.index(']')]
|
|
||||||
|
|
||||||
patsy_factor = term
|
|
||||||
if term.startswith('C('):
|
|
||||||
term = term[2:-1]
|
|
||||||
|
|
||||||
# Add a new categorical term if not exists
|
|
||||||
if term not in terms:
|
|
||||||
ref_category = formula_factor_ref_category(formula, df, patsy_factor)
|
|
||||||
terms[term] = CategoricalTerm({}, ref_category)
|
|
||||||
|
|
||||||
terms[term].categories[category] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
|
||||||
else:
|
else:
|
||||||
# Single term
|
# Parse if required
|
||||||
term = raw_name
|
factor, column, contrast = parse_patsy_term(formula, df, raw_name)
|
||||||
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
|
||||||
|
if contrast is not None:
|
||||||
|
# Categorical term
|
||||||
|
# Add a new categorical term if not exists
|
||||||
|
if column not in terms:
|
||||||
|
ref_category = formula_factor_ref_category(formula, df, factor)
|
||||||
|
terms[column] = CategoricalTerm({}, ref_category)
|
||||||
|
|
||||||
|
terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||||
|
else:
|
||||||
|
# Single term
|
||||||
|
term = raw_name
|
||||||
|
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||||
|
|
||||||
# Fit null model (for llnull)
|
# Fit null model (for llnull)
|
||||||
if hasattr(result, 'llnull'):
|
if hasattr(result, 'llnull'):
|
||||||
|
99
yli/utils.py
99
yli/utils.py
@ -130,7 +130,7 @@ class Estimate:
|
|||||||
# --------------------------
|
# --------------------------
|
||||||
# Patsy formula manipulation
|
# Patsy formula manipulation
|
||||||
|
|
||||||
def cols_for_formula(formula):
|
def cols_for_formula(formula, df):
|
||||||
"""Return the columns corresponding to the Patsy formula"""
|
"""Return the columns corresponding to the Patsy formula"""
|
||||||
|
|
||||||
# Parse the formula
|
# Parse the formula
|
||||||
@ -141,24 +141,103 @@ def cols_for_formula(formula):
|
|||||||
for term in model_desc.rhs_termlist:
|
for term in model_desc.rhs_termlist:
|
||||||
for factor in term.factors:
|
for factor in term.factors:
|
||||||
name = factor.name()
|
name = factor.name()
|
||||||
if '(' in name:
|
if name.startswith('C('):
|
||||||
# FIXME: Is there a better way of doing this?
|
# Contrasts expression
|
||||||
# FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y))
|
# Get the corresponding factor_info
|
||||||
name = name[name.index('(')+1:name.index(')')]
|
factor_info = formula_get_factor_info(formula, df, name)
|
||||||
|
|
||||||
|
# Evaluate the factor
|
||||||
|
categorical_box = factor_info.factor.eval(factor_info.state, df)
|
||||||
|
|
||||||
|
# Get the column name
|
||||||
|
name = categorical_box.data.name
|
||||||
|
|
||||||
cols.add(name)
|
cols.add(name)
|
||||||
|
|
||||||
return list(cols)
|
return list(cols)
|
||||||
|
|
||||||
def formula_get_factor_info(formula, df, factor):
    """
    Get the FactorInfo for a factor in a Patsy formula

    formula: Patsy formula (right-hand side) to evaluate against df
    df: Data frame the formula refers to
    factor: Name of the factor as Patsy reports it, e.g. "x" or "C(x, Treatment(y))"

    Raises an Exception if the formula contains no factor with that name
    """

    # Parse the formula
    # NOTE(review): this builds the full design matrix on every call just to read design_info
    design_info = patsy.dmatrix(formula, df).design_info

    # Get the corresponding factor_info
    for candidate, factor_info in design_info.factor_infos.items():
        if candidate.name() == factor:
            return factor_info

    # Fail with an explicit error rather than leaking StopIteration from next()
    raise Exception('Factor "{}" not found in formula "{}"'.format(factor, formula))


def formula_factor_ref_category(formula, df, factor):
    """
    Get the reference category for a term in a Patsy formula referring to a categorical factor

    formula: Patsy formula (right-hand side) to evaluate against df
    df: Data frame the formula refers to
    factor: Name of the factor, either a bare column name or a C(...) expression

    Returns the reference category, i.e. the first category unless a Treatment
    contrast names a different reference

    Raises an Exception if factor is an expression other than C(...), or if it
    uses a contrast other than Treatment
    """

    if '(' in factor and not factor.startswith('C('):
        raise Exception('Attempted to get reference category for unknown expression type "{}"'.format(factor))

    # Get the factor_info
    factor_info = formula_get_factor_info(formula, df, factor)
    categories = factor_info.categories

    if '(' not in factor:
        # C(...) is not specified, so must be default
        return categories[0]

    # Evaluate the factor
    categorical_box = factor_info.factor.eval(factor_info.state, df)
    contrast = categorical_box.contrast

    if contrast is None or contrast is patsy.Treatment:
        # Default Treatment contrast with default reference group: first category
        return categories[0]

    if isinstance(contrast, patsy.Treatment):
        reference = contrast.reference

        if reference is None:
            # Default reference group: first category
            return categories[0]

        if reference in categories:
            # Reference group specified by category
            return reference

        if isinstance(reference, int):
            # Patsy also accepts an integer index into the categories (negative counts from the end)
            return categories[reference]

        # Specified reference group
        return reference

    raise Exception('Attempted to get reference category for unknown contrast type {}'.format(contrast.__class__.__name__))
|
||||||
|
|
||||||
|
def parse_patsy_term(formula, df, term):
    """
    Parse a Patsy term into its component parts

    formula: Patsy formula the term comes from (currently unused)
    df: Data frame the formula refers to (currently unused)
    term: Name of the term as it appears in the fitted model, e.g. "x", "x[T.z]" or "C(x)[T.z]"

    Returns: factor, column, contrast
    e.g. "C(x, Treatment(y))[T.z]" -> "C(x, Treatment(y))", "x", "z"
    contrast is None for a term with no treatment contrast

    Raises an Exception for expressions other than C(...), and for contrasts other than treatment contrasts
    """

    marker = '[T.'
    marker_pos = term.find(marker)

    if marker_pos < 0:
        # No treatment contrast in this term
        if '(' in term and not term.startswith('C('):
            raise Exception('Attempted to parse term for unknown expression type "{}"'.format(term))
        if '[' in term:
            raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term))
        if '(' in term:
            # Not a treatment contrast (I think this is impossible?)
            raise Exception('Attempted to parse unsupported contrast-like term with no contrasts')

        # Nothing special
        return term, term, None

    # Treatment contrast term
    factor = term[:marker_pos]

    # Search for the closing bracket only after the marker, as the factor itself may contain "]"
    contrast_start = marker_pos + len(marker)
    contrast = term[contrast_start:term.index(']', contrast_start)]

    # Inspect only the factor for "(", as a category label may legitimately contain parentheses
    if '(' not in factor:
        return factor, factor, contrast

    if not factor.startswith('C('):
        raise Exception('Attempted to parse term for unknown expression type "{}"'.format(term))

    # The column is the first argument to C(...)
    factor_inner = factor[factor.index('(')+1:factor.rindex(')')]
    column = factor_inner.partition(',')[0].strip()

    return factor, column, contrast
|
||||||
|
Loading…
Reference in New Issue
Block a user