Improve pretty printing of regression results

Pretty print categorical variables and show reference category
This commit is contained in:
RunasSudo 2022-10-14 21:57:22 +11:00
parent e8e97f6073
commit b2aaaabb0e
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
2 changed files with 155 additions and 61 deletions

View File

@ -38,13 +38,13 @@ def test_regress_ols_ol11_4():
assert result.ftest().pvalue < 0.0005
assert result.rsquared == approx(0.7429, abs=0.0001)
assert result.beta['Intercept'].point == approx(47.48, abs=0.01)
assert result.pvalues['Intercept'] < 0.0005
assert result.beta['SoilPh'].point == approx(-7.86, abs=0.01)
assert result.pvalues['SoilPh'] < 0.0005
assert result.terms['(Intercept)'].beta.point == approx(47.48, abs=0.01)
assert result.terms['(Intercept)'].pvalue < 0.0005
assert result.terms['SoilPh'].beta.point == approx(-7.86, abs=0.01)
assert result.terms['SoilPh'].pvalue < 0.0005
assert result.beta['SoilPh'].ci_lower == approx(-10.15, abs=0.01)
assert result.beta['SoilPh'].ci_upper == approx(-5.57, abs=0.01)
assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
def test_regress_ols_ol13_5():
"""Compare yli.regress for Ott & Longnecker (2016) chapter 13.5"""
@ -72,28 +72,28 @@ def test_regress_ols_ol13_5():
assert result.ftest().pvalue < 0.00005
assert result.rsquared == approx(0.8635, abs=0.0001)
assert result.beta['Intercept'].point == approx(-10.63398, abs=0.00001)
assert result.pvalues['Intercept'] == approx(0.0766, abs=0.0001)
assert result.beta['D'].point == approx(0.22760, abs=0.00001)
assert result.pvalues['D'] == approx(0.0157, abs=0.0001)
assert result.beta['T1'].point == approx(0.00525, abs=0.00001)
assert result.pvalues['T1'] == approx(0.8161, abs=0.0001)
assert result.beta['T2'].point == approx(0.00561, abs=0.00001)
assert result.pvalues['T2'] == approx(0.2360, abs=0.0001)
assert result.beta['S'].point == approx(0.00088369, abs=0.00000001)
assert result.pvalues['S'] < 0.0001
assert result.beta['PR'].point == approx(-0.10813, abs=0.00001)
assert result.pvalues['PR'] == approx(0.2094, abs=0.0001)
assert result.beta['NE'].point == approx(0.25949, abs=0.00001)
assert result.pvalues['NE'] == approx(0.0036, abs=0.0001)
assert result.beta['CT'].point == approx(0.11554, abs=0.00001)
assert result.pvalues['CT'] == approx(0.1150, abs=0.0001)
assert result.beta['BW'].point == approx(0.03680, abs=0.00001)
assert result.pvalues['BW'] == approx(0.7326, abs=0.0001)
assert result.beta['N'].point == approx(-0.01203, abs=0.00001)
assert result.pvalues['N'] == approx(0.1394, abs=0.0001)
assert result.beta['PT'].point == approx(-0.22197, abs=0.00001)
assert result.pvalues['PT'] == approx(0.1035, abs=0.0001)
assert result.terms['(Intercept)'].beta.point == approx(-10.63398, abs=0.00001)
assert result.terms['(Intercept)'].pvalue == approx(0.0766, abs=0.0001)
assert result.terms['D'].beta.point == approx(0.22760, abs=0.00001)
assert result.terms['D'].pvalue == approx(0.0157, abs=0.0001)
assert result.terms['T1'].beta.point == approx(0.00525, abs=0.00001)
assert result.terms['T1'].pvalue == approx(0.8161, abs=0.0001)
assert result.terms['T2'].beta.point == approx(0.00561, abs=0.00001)
assert result.terms['T2'].pvalue == approx(0.2360, abs=0.0001)
assert result.terms['S'].beta.point == approx(0.00088369, abs=0.00000001)
assert result.terms['S'].pvalue < 0.0001
assert result.terms['PR'].beta.point == approx(-0.10813, abs=0.00001)
assert result.terms['PR'].pvalue == approx(0.2094, abs=0.0001)
assert result.terms['NE'].beta.point == approx(0.25949, abs=0.00001)
assert result.terms['NE'].pvalue == approx(0.0036, abs=0.0001)
assert result.terms['CT'].beta.point == approx(0.11554, abs=0.00001)
assert result.terms['CT'].pvalue == approx(0.1150, abs=0.0001)
assert result.terms['BW'].beta.point == approx(0.03680, abs=0.00001)
assert result.terms['BW'].pvalue == approx(0.7326, abs=0.0001)
assert result.terms['N'].beta.point == approx(-0.01203, abs=0.00001)
assert result.terms['N'].pvalue == approx(0.1394, abs=0.0001)
assert result.terms['PT'].beta.point == approx(-0.22197, abs=0.00001)
assert result.terms['PT'].pvalue == approx(0.1035, abs=0.0001)
def test_regress_logit_ol12_23():
"""Compare yli.regress for Ott & Longnecker (2016) chapter 12.23"""
@ -112,12 +112,12 @@ def test_regress_logit_ol12_23():
assert lrtest_result.dof == 2
assert lrtest_result.pvalue == approx(0.0191, rel=0.02)
expbeta_fib = np.exp(result.beta['Fibrinogen'])
expbeta_fib = np.exp(result.terms['Fibrinogen'].beta)
assert expbeta_fib.point == approx(6.756, rel=0.01)
assert expbeta_fib.ci_lower == approx(1.007, rel=0.01)
assert expbeta_fib.ci_upper == approx(45.308, rel=0.02)
expbeta_gam = np.exp(result.beta['GammaGlobulin'])
expbeta_gam = np.exp(result.terms['GammaGlobulin'].beta)
assert expbeta_gam.point == approx(1.169, abs=0.001)
assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
@ -133,14 +133,14 @@ def test_regress_penalisedlogit_kleinman():
result = yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
assert result.dof_model == 1
assert result.beta['(Intercept)'].point == approx(-2.280389)
assert result.beta['(Intercept)'].ci_lower == approx(-2.765427)
assert result.beta['(Intercept)'].ci_upper == approx(-1.851695)
assert result.pvalues['(Intercept)'] < 0.0001
assert result.beta['Pred'].point == approx(5.993961)
assert result.beta['Pred'].ci_lower == approx(3.947048)
assert result.beta['Pred'].ci_upper == approx(10.852893)
assert result.pvalues['Pred'] < 0.0001
assert result.terms['(Intercept)'].beta.point == approx(-2.280389)
assert result.terms['(Intercept)'].beta.ci_lower == approx(-2.765427)
assert result.terms['(Intercept)'].beta.ci_upper == approx(-1.851695)
assert result.terms['(Intercept)'].pvalue < 0.0001
assert result.terms['Pred'].beta.point == approx(5.993961)
assert result.terms['Pred'].beta.ci_lower == approx(3.947048)
assert result.terms['Pred'].beta.ci_upper == approx(10.852893)
assert result.terms['Pred'].pvalue < 0.0001
lrtest_result = result.lrtest_null()
assert lrtest_result.statistic == approx(78.95473)

View File

@ -103,7 +103,7 @@ class RegressionResult:
raw_result,
full_name, model_name, fit_method,
dep, nobs, dof_model, fitted_dt,
beta, pvalues,
terms,
llf, llnull,
dof_resid, rsquared, f_statistic,
exp
@ -122,9 +122,8 @@ class RegressionResult:
self.dof_model = dof_model
self.fitted_dt = fitted_dt
# Regression coefficients
self.beta = beta
self.pvalues = pvalues
# Regression coefficients/p values
self.terms = terms
# Model log-likelihood
self.llf = llf
@ -165,7 +164,7 @@ class RegressionResult:
"""
# Get parameters required for AFBF
params = pd.Series({term.replace('[', '_').replace(']', '_'): beta.point for term, beta in self.beta.items()})
params = pd.Series({term.raw_name.replace('[', '_').replace(']', '_'): term.beta.point for term in self.terms.values()})
cov = self.raw_result.cov_params()
# Compute BF matrix
@ -240,12 +239,33 @@ class RegressionResult:
# Render coefficients table
out += '<table><tr><th></th><th style="text-align:center">{}</th><th colspan="3" style="text-align:center">(95% CI)</th><th style="text-align:center"><i>p</i></th></tr>'.format('exp(<i>β</i>)' if self.exp else '<i>β</i>')
for term, beta in self.beta.items():
# Exponentiate if requested
if self.exp:
beta = np.exp(beta)
out += '<tr><th>{}</th><td>{:.2f}</td><td style="padding-right:0">({:.2f}</td><td>–</td><td style="padding-left:0">{:.2f})</td><td style="text-align:left">{}</td></tr>'.format(term, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(self.pvalues[term], html=True, nospace=True))
for term_name, term in self.terms.items():
if isinstance(term, SingleTerm):
# Single term
# Exponentiate beta if requested
beta = term.beta
if self.exp:
beta = np.exp(beta)
out += '<tr><th>{}</th><td>{:.2f}</td><td style="padding-right:0">({:.2f}</td><td>–</td><td style="padding-left:0">{:.2f})</td><td style="text-align:left">{}</td></tr>'.format(term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(term.pvalue, html=True, nospace=True))
elif isinstance(term, CategoricalTerm):
# Categorical term
out += '<tr><th>{}</th><td></td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term_name)
# Render reference category
out += '<tr><td style="text-align:right;font-style:italic">{}</td><td>Ref.</td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term.ref_category)
# Loop over terms
for sub_term_name, sub_term in term.categories.items():
# Exponentiate beta if requested
beta = sub_term.beta
if self.exp:
beta = np.exp(beta)
out += '<tr><td style="text-align:right;font-style:italic">{}</td><td>{:.2f}</td><td style="padding-right:0">({:.2f}</td><td>–</td><td style="padding-left:0">{:.2f})</td><td style="text-align:left">{}</td></tr>'.format(sub_term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(sub_term.pvalue, html=True, nospace=True))
else:
raise Exception('Attempt to render unknown term type')
out += '</table>'
@ -273,13 +293,34 @@ class RegressionResult:
# Render coefficients table
table_data = []
for term, beta in self.beta.items():
# Exponentiate if requested
if self.exp:
beta = np.exp(estimate)
# Add some extra padding
table_data.append([term + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(self.pvalues[term], html=False, nospace=True)])
for term_name, term in self.terms.items():
if isinstance(term, SingleTerm):
# Single term
# Exponentiate beta if requested
beta = term.beta
if self.exp:
beta = np.exp(beta)
# Add some extra padding
table_data.append([term_name + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(term.pvalue, html=False, nospace=True)])
elif isinstance(term, CategoricalTerm):
# Categorical term
table_data.append([term_name + ' ', '', '', '', '', ''])
# Render reference category
table_data.append(['{} '.format(term.ref_category), 'Ref.', '', '', '', ''])
# Loop over terms
for sub_term_name, sub_term in term.categories.items():
# Exponentiate beta if requested
beta = sub_term.beta
if self.exp:
beta = np.exp(beta)
table_data.append([sub_term_name + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(sub_term.pvalue, html=False, nospace=True)])
else:
raise Exception('Attempt to render unknown term type')
table2 = SimpleTable(data=table_data, headers=['', 'exp(β)' if self.exp else 'β', '', '\ue000', '', ' p']) # U+E000 is in Private Use Area, mark middle of CI column
table2_text = table2.as_text().replace(' \ue000 ', '(95% CI)') # Render heading in the right spot
@ -293,6 +334,27 @@ class RegressionResult:
return out
class SingleTerm:
"""
A term in a RegressionResult which is a single term
raw_name: The raw name of the term (e.g. in RegressionResult.raw_result data)
beta: An Estimate of the coefficient
pvalue: The p value
"""
def __init__(self, raw_name, beta, pvalue):
self.raw_name = raw_name
self.beta = beta
self.pvalue = pvalue
class CategoricalTerm:
"""A group of terms in a RegressionResult corresponding to a categorical variable"""
def __init__(self, categories, ref_category):
self.categories = categories
self.ref_category = ref_category
def regress(
model_class, df, dep, formula, *,
nan_policy='warn', exp=None
@ -328,8 +390,40 @@ def regress(
result.exp = exp
return result
# Process terms
terms = {}
confint = result.conf_int()
beta = {t: Estimate(b, confint[0][t], confint[1][t]) for t, b in result.params.items()}
for raw_name, raw_beta in result.params.items():
beta = Estimate(raw_beta, confint[0][raw_name], confint[1][raw_name])
# Rename terms
if raw_name == 'Intercept':
# Intercept term (single term)
term = '(Intercept)'
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
elif '[T.' in raw_name:
# Categorical term
term = raw_name[:raw_name.index('[T.')]
category = raw_name[raw_name.index('[T.')+3:raw_name.index(']')]
if term.startswith('C('):
term = term[2:-1]
# Add a new categorical term if not exists
if term not in terms:
# Try to guess the ref_category
# FIXME: This is a VERY brittle implementation!!
ref_category = sorted(df[term].unique())[0]
terms[term] = CategoricalTerm({}, ref_category)
terms[term].categories[category] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
else:
# Single term
term = raw_name
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
# Fit null model (for llnull)
if hasattr(result, 'llnull'):
@ -346,7 +440,7 @@ def regress(
result,
'Logistic Regression' if model_class is sm.Logit else '{} Regression'.format(model_class.__name__), model_class.__name__, header_dict['Method:'],
dep, result.nobs, result.df_model, datetime.now(),
beta, result.pvalues,
terms,
result.llf, llnull,
getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
exp
@ -388,14 +482,14 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
# Fit the model
model = ro.r('logistf(formula_, data=df)')
beta = {t: Estimate(b, ci0, ci1) for t, b, ci0, ci1 in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'])}
pvalues = {t: p for t, p in zip(model['terms'], model['prob'])}
# TODO: Handle categorical terms?
terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])}
return RegressionResult(
model,
'Penalised Logistic Regression', 'Logit', 'Penalised ML',
self.endog_names, model['n'][0], model['df'][0], datetime.now(),
beta, pvalues,
terms,
model['loglik'][0], model['loglik'][1],
None, None, None,
None # Set exp in regress()