diff --git a/tests/test_regress.py b/tests/test_regress.py index 33a395e..7bb6de6 100644 --- a/tests/test_regress.py +++ b/tests/test_regress.py @@ -21,6 +21,7 @@ import pandas as pd import statsmodels.api as sm import yli +from yli.regress import CategoricalTerm def test_regress_ols_ol11_4(): """Compare yli.regress for Ott & Longnecker (2016) example 11.4/11.7""" @@ -122,6 +123,31 @@ def test_regress_logit_ol12_23(): assert expbeta_gam.ci_lower == approx(0.924, abs=0.001) assert expbeta_gam.ci_upper == approx(1.477, abs=0.001) +def test_regress_logit_ol10_18(): + """Compare odds ratios via yli.regress for Ott & Longnecker (2016) example 10.18""" + + data = [ + (False, False, 250), + (True, False, 750), + (False, True, 400), + (True, True, 1600) + ] + + df = pd.DataFrame({ + 'Response': np.repeat([d[0] for d in data], [d[2] for d in data]), + 'Stress': np.repeat([d[1] for d in data], [d[2] for d in data]) + }) + + result = yli.regress(sm.Logit, df, 'Stress', 'Response') + + assert isinstance(result.terms['Response'], CategoricalTerm) + assert result.terms['Response'].ref_category == False + + expbeta = np.exp(result.terms['Response'].categories['True'].beta) + assert expbeta.point == approx(1.333, abs=0.001) + assert expbeta.ci_lower == approx(1.113, abs=0.001) + assert expbeta.ci_upper == approx(1.596, abs=0.001) + def test_regress_penalisedlogit_kleinman(): """Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html""" diff --git a/yli/regress.py b/yli/regress.py index 777e83d..2bb07f5 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -27,7 +27,7 @@ import itertools from .bayes_factors import BayesFactor, bayesfactor_afbf from .sig_tests import FTestResult -from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category +from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category, parse_patsy_term def vif(df, formula=None, nan_policy='warn'): """ @@ -38,7 +38,7 @@ def vif(df, formula=None, nan_policy='warn'): if formula: # Only consider columns in the formula - df = df[cols_for_formula(formula)] + df = df[cols_for_formula(formula, df)] # Check for/clean NaNs df = check_nan(df, nan_policy) @@ -352,7 +352,7 @@ def regress( exp = False # Check for/clean NaNs - df = df[[dep] + cols_for_formula(formula)] + df = df[[dep] + cols_for_formula(formula, df)] df = check_nan(df, nan_policy) # Ensure numeric type for dependent variable @@ -386,25 +386,22 @@ def regress( # Intercept term (single term) term = '(Intercept)' terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) - elif '[T.' in raw_name: - # Categorical term - term = raw_name[:raw_name.index('[T.')] - category = raw_name[raw_name.index('[T.')+3:raw_name.index(']')] - - patsy_factor = term - if term.startswith('C('): - term = term[2:-1] - - # Add a new categorical term if not exists - if term not in terms: - ref_category = formula_factor_ref_category(formula, df, patsy_factor) - terms[term] = CategoricalTerm({}, ref_category) - - terms[term].categories[category] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) else: - # Single term - term = raw_name - terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) + # Parse if required + factor, column, contrast = parse_patsy_term(formula, df, raw_name) + + if contrast is not None: + # Categorical term + # Add a new categorical term if not exists + if column not in terms: + ref_category = formula_factor_ref_category(formula, df, factor) + terms[column] = CategoricalTerm({}, ref_category) + + terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) + else: + # Single term + term = raw_name + terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) # Fit null model (for llnull) if hasattr(result, 'llnull'): diff --git a/yli/utils.py b/yli/utils.py index 1d44e7d..895e84d 100644 --- a/yli/utils.py +++ b/yli/utils.py @@ -130,7 +130,7 @@ class Estimate: # -------------------------- # Patsy formula manipulation -def cols_for_formula(formula): +def cols_for_formula(formula, df): """Return the columns corresponding to the Patsy formula""" # Parse the formula @@ -141,24 +141,103 @@ def cols_for_formula(formula): for term in model_desc.rhs_termlist: for factor in term.factors: name = factor.name() - if '(' in name: - # FIXME: Is there a better way of doing this? - # FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y)) - name = name[name.index('(')+1:name.index(')')] + if name.startswith('C('): + # Contrasts expression + # Get the corresponding factor_info + factor_info = formula_get_factor_info(formula, df, name) + + # Evaluate the factor + categorical_box = factor_info.factor.eval(factor_info.state, df) + + # Get the column name + name = categorical_box.data.name cols.add(name) return list(cols) -def formula_factor_ref_category(formula, df, factor): - """Get the reference category for a term in a Patsy formula referring to a categorical factor""" +def formula_get_factor_info(formula, df, factor): + """Get the FactorInfo for a factor in a Patsy formula""" # Parse the formula design_info = patsy.dmatrix(formula, df).design_info # Get the corresponding factor_info factor_info = next(v for k, v in design_info.factor_infos.items() if k.name() == factor) + return factor_info + +def formula_factor_ref_category(formula, df, factor): + """Get the reference category for a term in a Patsy formula referring to a categorical factor""" - # FIXME: This does not handle complex expressions, e.g. C(x, Treatment(y)) - categories = factor_info.categories - return categories[0] + if '(' in factor and not factor.startswith('C('): + raise Exception('Attempted to get reference category for unknown expression type "{}"'.format(factor)) + + # Get the factor_info + factor_info = formula_get_factor_info(formula, df, factor) + + if '(' not in factor: + # C(...) is not specified, so must be default + return factor_info.categories[0] + + # Evaluate the factor + categorical_box = factor_info.factor.eval(factor_info.state, df) + + if categorical_box.contrast is None or categorical_box.contrast is patsy.Treatment: + # Default Treatment contrast with default reference group: first category + return factor_info.categories[0] + + if isinstance(categorical_box.contrast, patsy.Treatment): + if categorical_box.contrast.reference is None: + # Default reference group: first category + return factor_info.categories[0] + + # Specified reference group + return categorical_box.contrast.reference + + raise Exception('Attempted to get reference category for unknown contrast type {}'.format(categorical_box.contrast.__class__.__name__)) + +def parse_patsy_term(formula, df, term): + """ + Parse a Patsy term into its component parts + + Returns: factor, column, contrast + e.g. "C(x, Treatment(y))[T.z]" -> "C(x, Treatment(y))", "x", "z" + """ + + if '(' not in term: + if '[' in term: + if '[T.' not in term: + raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term)) + + # Treatment contrast term + factor = term[:term.index('[T.')] + contrast = term[term.index('[T.')+3:term.index(']')] + + return factor, factor, contrast + else: + # Nothing special + return term, term, None + + # Term contains '(' + + if not term.startswith('C('): + raise Exception('Attempted to parse term for unknown expression type "{}"'.format(term)) + + if '[' in term: + if '[T.' not in term: + raise Exception('Attempted to parse term for unknown contrast type "{}"'.format(term)) + + # Treatment contrast term + factor = term[:term.index('[T.')] + contrast = term[term.index('[T.')+3:term.index(']')] + else: + # Not a treatment contrast (I think this is impossible?) + raise Exception('Attempted to parse unsupported contrast-like term with no contrasts') + + factor_inner = factor[factor.index('(')+1:factor.rindex(')')] + if ',' in factor_inner: + column = factor_inner[:factor_inner.index(',')] + else: + column = factor_inner + + return factor, column, contrast