diff --git a/tests/test_regress.py b/tests/test_regress.py index 89ba21b..33a395e 100644 --- a/tests/test_regress.py +++ b/tests/test_regress.py @@ -38,13 +38,13 @@ def test_regress_ols_ol11_4(): assert result.ftest().pvalue < 0.0005 assert result.rsquared == approx(0.7429, abs=0.0001) - assert result.beta['Intercept'].point == approx(47.48, abs=0.01) - assert result.pvalues['Intercept'] < 0.0005 - assert result.beta['SoilPh'].point == approx(-7.86, abs=0.01) - assert result.pvalues['SoilPh'] < 0.0005 + assert result.terms['(Intercept)'].beta.point == approx(47.48, abs=0.01) + assert result.terms['(Intercept)'].pvalue < 0.0005 + assert result.terms['SoilPh'].beta.point == approx(-7.86, abs=0.01) + assert result.terms['SoilPh'].pvalue < 0.0005 - assert result.beta['SoilPh'].ci_lower == approx(-10.15, abs=0.01) - assert result.beta['SoilPh'].ci_upper == approx(-5.57, abs=0.01) + assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01) + assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01) def test_regress_ols_ol13_5(): """Compare yli.regress for Ott & Longnecker (2016) chapter 13.5""" @@ -72,28 +72,28 @@ def test_regress_ols_ol13_5(): assert result.ftest().pvalue < 0.00005 assert result.rsquared == approx(0.8635, abs=0.0001) - assert result.beta['Intercept'].point == approx(-10.63398, abs=0.00001) - assert result.pvalues['Intercept'] == approx(0.0766, abs=0.0001) - assert result.beta['D'].point == approx(0.22760, abs=0.00001) - assert result.pvalues['D'] == approx(0.0157, abs=0.0001) - assert result.beta['T1'].point == approx(0.00525, abs=0.00001) - assert result.pvalues['T1'] == approx(0.8161, abs=0.0001) - assert result.beta['T2'].point == approx(0.00561, abs=0.00001) - assert result.pvalues['T2'] == approx(0.2360, abs=0.0001) - assert result.beta['S'].point == approx(0.00088369, abs=0.00000001) - assert result.pvalues['S'] < 0.0001 - assert result.beta['PR'].point == approx(-0.10813, abs=0.00001) - assert result.pvalues['PR'] == approx(0.2094, abs=0.0001) - assert result.beta['NE'].point == approx(0.25949, abs=0.00001) - assert result.pvalues['NE'] == approx(0.0036, abs=0.0001) - assert result.beta['CT'].point == approx(0.11554, abs=0.00001) - assert result.pvalues['CT'] == approx(0.1150, abs=0.0001) - assert result.beta['BW'].point == approx(0.03680, abs=0.00001) - assert result.pvalues['BW'] == approx(0.7326, abs=0.0001) - assert result.beta['N'].point == approx(-0.01203, abs=0.00001) - assert result.pvalues['N'] == approx(0.1394, abs=0.0001) - assert result.beta['PT'].point == approx(-0.22197, abs=0.00001) - assert result.pvalues['PT'] == approx(0.1035, abs=0.0001) + assert result.terms['(Intercept)'].beta.point == approx(-10.63398, abs=0.00001) + assert result.terms['(Intercept)'].pvalue == approx(0.0766, abs=0.0001) + assert result.terms['D'].beta.point == approx(0.22760, abs=0.00001) + assert result.terms['D'].pvalue == approx(0.0157, abs=0.0001) + assert result.terms['T1'].beta.point == approx(0.00525, abs=0.00001) + assert result.terms['T1'].pvalue == approx(0.8161, abs=0.0001) + assert result.terms['T2'].beta.point == approx(0.00561, abs=0.00001) + assert result.terms['T2'].pvalue == approx(0.2360, abs=0.0001) + assert result.terms['S'].beta.point == approx(0.00088369, abs=0.00000001) + assert result.terms['S'].pvalue < 0.0001 + assert result.terms['PR'].beta.point == approx(-0.10813, abs=0.00001) + assert result.terms['PR'].pvalue == approx(0.2094, abs=0.0001) + assert result.terms['NE'].beta.point == approx(0.25949, abs=0.00001) + assert result.terms['NE'].pvalue == approx(0.0036, abs=0.0001) + assert result.terms['CT'].beta.point == approx(0.11554, abs=0.00001) + assert result.terms['CT'].pvalue == approx(0.1150, abs=0.0001) + assert result.terms['BW'].beta.point == approx(0.03680, abs=0.00001) + assert result.terms['BW'].pvalue == approx(0.7326, abs=0.0001) + assert result.terms['N'].beta.point == approx(-0.01203, abs=0.00001) + assert result.terms['N'].pvalue == approx(0.1394, abs=0.0001) + assert result.terms['PT'].beta.point == approx(-0.22197, abs=0.00001) + assert result.terms['PT'].pvalue == approx(0.1035, abs=0.0001) def test_regress_logit_ol12_23(): """Compare yli.regress for Ott & Longnecker (2016) chapter 12.23""" @@ -112,12 +112,12 @@ def test_regress_logit_ol12_23(): assert lrtest_result.dof == 2 assert lrtest_result.pvalue == approx(0.0191, rel=0.02) - expbeta_fib = np.exp(result.beta['Fibrinogen']) + expbeta_fib = np.exp(result.terms['Fibrinogen'].beta) assert expbeta_fib.point == approx(6.756, rel=0.01) assert expbeta_fib.ci_lower == approx(1.007, rel=0.01) assert expbeta_fib.ci_upper == approx(45.308, rel=0.02) - expbeta_gam = np.exp(result.beta['GammaGlobulin']) + expbeta_gam = np.exp(result.terms['GammaGlobulin'].beta) assert expbeta_gam.point == approx(1.169, abs=0.001) assert expbeta_gam.ci_lower == approx(0.924, abs=0.001) assert expbeta_gam.ci_upper == approx(1.477, abs=0.001) @@ -133,14 +133,14 @@ def test_regress_penalisedlogit_kleinman(): result = yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False) assert result.dof_model == 1 - assert result.beta['(Intercept)'].point == approx(-2.280389) - assert result.beta['(Intercept)'].ci_lower == approx(-2.765427) - assert result.beta['(Intercept)'].ci_upper == approx(-1.851695) - assert result.pvalues['(Intercept)'] < 0.0001 - assert result.beta['Pred'].point == approx(5.993961) - assert result.beta['Pred'].ci_lower == approx(3.947048) - assert result.beta['Pred'].ci_upper == approx(10.852893) - assert result.pvalues['Pred'] < 0.0001 + assert result.terms['(Intercept)'].beta.point == approx(-2.280389) + assert result.terms['(Intercept)'].beta.ci_lower == approx(-2.765427) + assert result.terms['(Intercept)'].beta.ci_upper == approx(-1.851695) + assert result.terms['(Intercept)'].pvalue < 0.0001 + assert result.terms['Pred'].beta.point == approx(5.993961) + assert result.terms['Pred'].beta.ci_lower == approx(3.947048) + assert result.terms['Pred'].beta.ci_upper == approx(10.852893) + assert result.terms['Pred'].pvalue < 0.0001 lrtest_result = result.lrtest_null() assert lrtest_result.statistic == approx(78.95473) diff --git a/yli/regress.py b/yli/regress.py index 6d6dea4..34aa729 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -103,7 +103,7 @@ class RegressionResult: raw_result, full_name, model_name, fit_method, dep, nobs, dof_model, fitted_dt, - beta, pvalues, + terms, llf, llnull, dof_resid, rsquared, f_statistic, exp @@ -122,9 +122,8 @@ class RegressionResult: self.dof_model = dof_model self.fitted_dt = fitted_dt - # Regression coefficients - self.beta = beta - self.pvalues = pvalues + # Regression coefficients/p values + self.terms = terms # Model log-likelihood self.llf = llf @@ -165,7 +164,7 @@ class RegressionResult: """ # Get parameters required for AFBF - params = pd.Series({term.replace('[', '_').replace(']', '_'): beta.point for term, beta in self.beta.items()}) + params = pd.Series({term.raw_name.replace('[', '_').replace(']', '_'): term.beta.point for term in self.terms.values()}) cov = self.raw_result.cov_params() # Compute BF matrix @@ -240,12 +239,33 @@ class RegressionResult: # Render coefficients table out += ''.format('exp(β)' if self.exp else 'β') - for term, beta in self.beta.items(): - # Exponentiate if requested - if self.exp: - beta = np.exp(beta) - - out += ''.format(term, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(self.pvalues[term], html=True, nospace=True)) + for term_name, term in self.terms.items(): + if isinstance(term, SingleTerm): + # Single term + + # Exponentiate beta if requested + beta = term.beta + if self.exp: + beta = np.exp(beta) + + out += ''.format(term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(term.pvalue, html=True, nospace=True)) + elif isinstance(term, CategoricalTerm): + # Categorical term + out += ''.format(term_name) + + # Render reference category + out += ''.format(term.ref_category) + + # Loop over terms + for sub_term_name, sub_term in term.categories.items(): + # Exponentiate beta if requested + beta = sub_term.beta + if self.exp: + beta = np.exp(beta) + + out += ''.format(sub_term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(sub_term.pvalue, html=True, nospace=True)) + else: + raise Exception('Attempt to render unknown term type') out += '
{}(95% CI)p
{}{:.2f}({:.2f}{:.2f}){}
{}{:.2f}({:.2f}{:.2f}){}
{}
{}Ref.
{}{:.2f}({:.2f}{:.2f}){}
' @@ -273,13 +293,34 @@ class RegressionResult: # Render coefficients table table_data = [] - for term, beta in self.beta.items(): - # Exponentiate if requested - if self.exp: - beta = np.exp(estimate) - - # Add some extra padding - table_data.append([term + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(self.pvalues[term], html=False, nospace=True)]) + for term_name, term in self.terms.items(): + if isinstance(term, SingleTerm): + # Single term + + # Exponentiate beta if requested + beta = term.beta + if self.exp: + beta = np.exp(beta) + + # Add some extra padding + table_data.append([term_name + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(term.pvalue, html=False, nospace=True)]) + elif isinstance(term, CategoricalTerm): + # Categorical term + table_data.append([term_name + ' ', '', '', '', '', '']) + + # Render reference category + table_data.append(['{} '.format(term.ref_category), 'Ref.', '', '', '', '']) + + # Loop over terms + for sub_term_name, sub_term in term.categories.items(): + # Exponentiate beta if requested + beta = sub_term.beta + if self.exp: + beta = np.exp(beta) + + table_data.append([sub_term_name + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(sub_term.pvalue, html=False, nospace=True)]) + else: + raise Exception('Attempt to render unknown term type') table2 = SimpleTable(data=table_data, headers=['', 'exp(β)' if self.exp else 'β', '', '\ue000', '', ' p']) # U+E000 is in Private Use Area, mark middle of CI column table2_text = table2.as_text().replace(' \ue000 ', '(95% CI)') # Render heading in the right spot @@ -293,6 +334,27 @@ class RegressionResult: return out +class SingleTerm: + """ + A term in a RegressionResult which is a single term + + raw_name: The raw name of the term (e.g. in RegressionResult.raw_result data) + beta: An Estimate of the coefficient + pvalue: The p value + """ + + def __init__(self, raw_name, beta, pvalue): + self.raw_name = raw_name + self.beta = beta + self.pvalue = pvalue + +class CategoricalTerm: + """A group of terms in a RegressionResult corresponding to a categorical variable""" + + def __init__(self, categories, ref_category): + self.categories = categories + self.ref_category = ref_category + def regress( model_class, df, dep, formula, *, nan_policy='warn', exp=None @@ -328,8 +390,40 @@ def regress( result.exp = exp return result + # Process terms + terms = {} + confint = result.conf_int() - beta = {t: Estimate(b, confint[0][t], confint[1][t]) for t, b in result.params.items()} + + for raw_name, raw_beta in result.params.items(): + beta = Estimate(raw_beta, confint[0][raw_name], confint[1][raw_name]) + + # Rename terms + if raw_name == 'Intercept': + # Intercept term (single term) + term = '(Intercept)' + terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) + elif '[T.' in raw_name: + # Categorical term + term = raw_name[:raw_name.index('[T.')] + category = raw_name[raw_name.index('[T.')+3:raw_name.index(']')] + + if term.startswith('C('): + term = term[2:-1] + + # Add a new categorical term if not exists + if term not in terms: + # Try to guess the ref_category + # FIXME: This is a VERY brittle implementation!! + ref_category = sorted(df[term].unique())[0] + + terms[term] = CategoricalTerm({}, ref_category) + + terms[term].categories[category] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) + else: + # Single term + term = raw_name + terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) # Fit null model (for llnull) if hasattr(result, 'llnull'): @@ -346,7 +440,7 @@ def regress( result, 'Logistic Regression' if model_class is sm.Logit else '{} Regression'.format(model_class.__name__), model_class.__name__, header_dict['Method:'], dep, result.nobs, result.df_model, datetime.now(), - beta, result.pvalues, + terms, result.llf, llnull, getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None), exp @@ -388,14 +482,14 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel): # Fit the model model = ro.r('logistf(formula_, data=df)') - beta = {t: Estimate(b, ci0, ci1) for t, b, ci0, ci1 in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'])} - pvalues = {t: p for t, p in zip(model['terms'], model['prob'])} + # TODO: Handle categorical terms? + terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])} return RegressionResult( model, 'Penalised Logistic Regression', 'Logit', 'Penalised ML', self.endog_names, model['n'][0], model['df'][0], datetime.now(), - beta, pvalues, + terms, model['loglik'][0], model['loglik'][1], None, None, None, None # Set exp in regress()