Improve pretty printing of regression results
Pretty print categorical variables and show reference category
This commit is contained in:
parent
e8e97f6073
commit
b2aaaabb0e
@ -38,13 +38,13 @@ def test_regress_ols_ol11_4():
|
|||||||
assert result.ftest().pvalue < 0.0005
|
assert result.ftest().pvalue < 0.0005
|
||||||
assert result.rsquared == approx(0.7429, abs=0.0001)
|
assert result.rsquared == approx(0.7429, abs=0.0001)
|
||||||
|
|
||||||
assert result.beta['Intercept'].point == approx(47.48, abs=0.01)
|
assert result.terms['(Intercept)'].beta.point == approx(47.48, abs=0.01)
|
||||||
assert result.pvalues['Intercept'] < 0.0005
|
assert result.terms['(Intercept)'].pvalue < 0.0005
|
||||||
assert result.beta['SoilPh'].point == approx(-7.86, abs=0.01)
|
assert result.terms['SoilPh'].beta.point == approx(-7.86, abs=0.01)
|
||||||
assert result.pvalues['SoilPh'] < 0.0005
|
assert result.terms['SoilPh'].pvalue < 0.0005
|
||||||
|
|
||||||
assert result.beta['SoilPh'].ci_lower == approx(-10.15, abs=0.01)
|
assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
|
||||||
assert result.beta['SoilPh'].ci_upper == approx(-5.57, abs=0.01)
|
assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
|
||||||
|
|
||||||
def test_regress_ols_ol13_5():
|
def test_regress_ols_ol13_5():
|
||||||
"""Compare yli.regress for Ott & Longnecker (2016) chapter 13.5"""
|
"""Compare yli.regress for Ott & Longnecker (2016) chapter 13.5"""
|
||||||
@ -72,28 +72,28 @@ def test_regress_ols_ol13_5():
|
|||||||
assert result.ftest().pvalue < 0.00005
|
assert result.ftest().pvalue < 0.00005
|
||||||
assert result.rsquared == approx(0.8635, abs=0.0001)
|
assert result.rsquared == approx(0.8635, abs=0.0001)
|
||||||
|
|
||||||
assert result.beta['Intercept'].point == approx(-10.63398, abs=0.00001)
|
assert result.terms['(Intercept)'].beta.point == approx(-10.63398, abs=0.00001)
|
||||||
assert result.pvalues['Intercept'] == approx(0.0766, abs=0.0001)
|
assert result.terms['(Intercept)'].pvalue == approx(0.0766, abs=0.0001)
|
||||||
assert result.beta['D'].point == approx(0.22760, abs=0.00001)
|
assert result.terms['D'].beta.point == approx(0.22760, abs=0.00001)
|
||||||
assert result.pvalues['D'] == approx(0.0157, abs=0.0001)
|
assert result.terms['D'].pvalue == approx(0.0157, abs=0.0001)
|
||||||
assert result.beta['T1'].point == approx(0.00525, abs=0.00001)
|
assert result.terms['T1'].beta.point == approx(0.00525, abs=0.00001)
|
||||||
assert result.pvalues['T1'] == approx(0.8161, abs=0.0001)
|
assert result.terms['T1'].pvalue == approx(0.8161, abs=0.0001)
|
||||||
assert result.beta['T2'].point == approx(0.00561, abs=0.00001)
|
assert result.terms['T2'].beta.point == approx(0.00561, abs=0.00001)
|
||||||
assert result.pvalues['T2'] == approx(0.2360, abs=0.0001)
|
assert result.terms['T2'].pvalue == approx(0.2360, abs=0.0001)
|
||||||
assert result.beta['S'].point == approx(0.00088369, abs=0.00000001)
|
assert result.terms['S'].beta.point == approx(0.00088369, abs=0.00000001)
|
||||||
assert result.pvalues['S'] < 0.0001
|
assert result.terms['S'].pvalue < 0.0001
|
||||||
assert result.beta['PR'].point == approx(-0.10813, abs=0.00001)
|
assert result.terms['PR'].beta.point == approx(-0.10813, abs=0.00001)
|
||||||
assert result.pvalues['PR'] == approx(0.2094, abs=0.0001)
|
assert result.terms['PR'].pvalue == approx(0.2094, abs=0.0001)
|
||||||
assert result.beta['NE'].point == approx(0.25949, abs=0.00001)
|
assert result.terms['NE'].beta.point == approx(0.25949, abs=0.00001)
|
||||||
assert result.pvalues['NE'] == approx(0.0036, abs=0.0001)
|
assert result.terms['NE'].pvalue == approx(0.0036, abs=0.0001)
|
||||||
assert result.beta['CT'].point == approx(0.11554, abs=0.00001)
|
assert result.terms['CT'].beta.point == approx(0.11554, abs=0.00001)
|
||||||
assert result.pvalues['CT'] == approx(0.1150, abs=0.0001)
|
assert result.terms['CT'].pvalue == approx(0.1150, abs=0.0001)
|
||||||
assert result.beta['BW'].point == approx(0.03680, abs=0.00001)
|
assert result.terms['BW'].beta.point == approx(0.03680, abs=0.00001)
|
||||||
assert result.pvalues['BW'] == approx(0.7326, abs=0.0001)
|
assert result.terms['BW'].pvalue == approx(0.7326, abs=0.0001)
|
||||||
assert result.beta['N'].point == approx(-0.01203, abs=0.00001)
|
assert result.terms['N'].beta.point == approx(-0.01203, abs=0.00001)
|
||||||
assert result.pvalues['N'] == approx(0.1394, abs=0.0001)
|
assert result.terms['N'].pvalue == approx(0.1394, abs=0.0001)
|
||||||
assert result.beta['PT'].point == approx(-0.22197, abs=0.00001)
|
assert result.terms['PT'].beta.point == approx(-0.22197, abs=0.00001)
|
||||||
assert result.pvalues['PT'] == approx(0.1035, abs=0.0001)
|
assert result.terms['PT'].pvalue == approx(0.1035, abs=0.0001)
|
||||||
|
|
||||||
def test_regress_logit_ol12_23():
|
def test_regress_logit_ol12_23():
|
||||||
"""Compare yli.regress for Ott & Longnecker (2016) chapter 12.23"""
|
"""Compare yli.regress for Ott & Longnecker (2016) chapter 12.23"""
|
||||||
@ -112,12 +112,12 @@ def test_regress_logit_ol12_23():
|
|||||||
assert lrtest_result.dof == 2
|
assert lrtest_result.dof == 2
|
||||||
assert lrtest_result.pvalue == approx(0.0191, rel=0.02)
|
assert lrtest_result.pvalue == approx(0.0191, rel=0.02)
|
||||||
|
|
||||||
expbeta_fib = np.exp(result.beta['Fibrinogen'])
|
expbeta_fib = np.exp(result.terms['Fibrinogen'].beta)
|
||||||
assert expbeta_fib.point == approx(6.756, rel=0.01)
|
assert expbeta_fib.point == approx(6.756, rel=0.01)
|
||||||
assert expbeta_fib.ci_lower == approx(1.007, rel=0.01)
|
assert expbeta_fib.ci_lower == approx(1.007, rel=0.01)
|
||||||
assert expbeta_fib.ci_upper == approx(45.308, rel=0.02)
|
assert expbeta_fib.ci_upper == approx(45.308, rel=0.02)
|
||||||
|
|
||||||
expbeta_gam = np.exp(result.beta['GammaGlobulin'])
|
expbeta_gam = np.exp(result.terms['GammaGlobulin'].beta)
|
||||||
assert expbeta_gam.point == approx(1.169, abs=0.001)
|
assert expbeta_gam.point == approx(1.169, abs=0.001)
|
||||||
assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
|
assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
|
||||||
assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
|
assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
|
||||||
@ -133,14 +133,14 @@ def test_regress_penalisedlogit_kleinman():
|
|||||||
result = yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
|
result = yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
|
||||||
|
|
||||||
assert result.dof_model == 1
|
assert result.dof_model == 1
|
||||||
assert result.beta['(Intercept)'].point == approx(-2.280389)
|
assert result.terms['(Intercept)'].beta.point == approx(-2.280389)
|
||||||
assert result.beta['(Intercept)'].ci_lower == approx(-2.765427)
|
assert result.terms['(Intercept)'].beta.ci_lower == approx(-2.765427)
|
||||||
assert result.beta['(Intercept)'].ci_upper == approx(-1.851695)
|
assert result.terms['(Intercept)'].beta.ci_upper == approx(-1.851695)
|
||||||
assert result.pvalues['(Intercept)'] < 0.0001
|
assert result.terms['(Intercept)'].pvalue < 0.0001
|
||||||
assert result.beta['Pred'].point == approx(5.993961)
|
assert result.terms['Pred'].beta.point == approx(5.993961)
|
||||||
assert result.beta['Pred'].ci_lower == approx(3.947048)
|
assert result.terms['Pred'].beta.ci_lower == approx(3.947048)
|
||||||
assert result.beta['Pred'].ci_upper == approx(10.852893)
|
assert result.terms['Pred'].beta.ci_upper == approx(10.852893)
|
||||||
assert result.pvalues['Pred'] < 0.0001
|
assert result.terms['Pred'].pvalue < 0.0001
|
||||||
|
|
||||||
lrtest_result = result.lrtest_null()
|
lrtest_result = result.lrtest_null()
|
||||||
assert lrtest_result.statistic == approx(78.95473)
|
assert lrtest_result.statistic == approx(78.95473)
|
||||||
|
136
yli/regress.py
136
yli/regress.py
@ -103,7 +103,7 @@ class RegressionResult:
|
|||||||
raw_result,
|
raw_result,
|
||||||
full_name, model_name, fit_method,
|
full_name, model_name, fit_method,
|
||||||
dep, nobs, dof_model, fitted_dt,
|
dep, nobs, dof_model, fitted_dt,
|
||||||
beta, pvalues,
|
terms,
|
||||||
llf, llnull,
|
llf, llnull,
|
||||||
dof_resid, rsquared, f_statistic,
|
dof_resid, rsquared, f_statistic,
|
||||||
exp
|
exp
|
||||||
@ -122,9 +122,8 @@ class RegressionResult:
|
|||||||
self.dof_model = dof_model
|
self.dof_model = dof_model
|
||||||
self.fitted_dt = fitted_dt
|
self.fitted_dt = fitted_dt
|
||||||
|
|
||||||
# Regression coefficients
|
# Regression coefficients/p values
|
||||||
self.beta = beta
|
self.terms = terms
|
||||||
self.pvalues = pvalues
|
|
||||||
|
|
||||||
# Model log-likelihood
|
# Model log-likelihood
|
||||||
self.llf = llf
|
self.llf = llf
|
||||||
@ -165,7 +164,7 @@ class RegressionResult:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Get parameters required for AFBF
|
# Get parameters required for AFBF
|
||||||
params = pd.Series({term.replace('[', '_').replace(']', '_'): beta.point for term, beta in self.beta.items()})
|
params = pd.Series({term.raw_name.replace('[', '_').replace(']', '_'): term.beta.point for term in self.terms.values()})
|
||||||
cov = self.raw_result.cov_params()
|
cov = self.raw_result.cov_params()
|
||||||
|
|
||||||
# Compute BF matrix
|
# Compute BF matrix
|
||||||
@ -240,12 +239,33 @@ class RegressionResult:
|
|||||||
# Render coefficients table
|
# Render coefficients table
|
||||||
out += '<table><tr><th></th><th style="text-align:center">{}</th><th colspan="3" style="text-align:center">(95% CI)</th><th style="text-align:center"><i>p</i></th></tr>'.format('exp(<i>β</i>)' if self.exp else '<i>β</i>')
|
out += '<table><tr><th></th><th style="text-align:center">{}</th><th colspan="3" style="text-align:center">(95% CI)</th><th style="text-align:center"><i>p</i></th></tr>'.format('exp(<i>β</i>)' if self.exp else '<i>β</i>')
|
||||||
|
|
||||||
for term, beta in self.beta.items():
|
for term_name, term in self.terms.items():
|
||||||
# Exponentiate if requested
|
if isinstance(term, SingleTerm):
|
||||||
if self.exp:
|
# Single term
|
||||||
beta = np.exp(beta)
|
|
||||||
|
|
||||||
out += '<tr><th>{}</th><td>{:.2f}</td><td style="padding-right:0">({:.2f}</td><td>–</td><td style="padding-left:0">{:.2f})</td><td style="text-align:left">{}</td></tr>'.format(term, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(self.pvalues[term], html=True, nospace=True))
|
# Exponentiate beta if requested
|
||||||
|
beta = term.beta
|
||||||
|
if self.exp:
|
||||||
|
beta = np.exp(beta)
|
||||||
|
|
||||||
|
out += '<tr><th>{}</th><td>{:.2f}</td><td style="padding-right:0">({:.2f}</td><td>–</td><td style="padding-left:0">{:.2f})</td><td style="text-align:left">{}</td></tr>'.format(term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(term.pvalue, html=True, nospace=True))
|
||||||
|
elif isinstance(term, CategoricalTerm):
|
||||||
|
# Categorical term
|
||||||
|
out += '<tr><th>{}</th><td></td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term_name)
|
||||||
|
|
||||||
|
# Render reference category
|
||||||
|
out += '<tr><td style="text-align:right;font-style:italic">{}</td><td>Ref.</td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term.ref_category)
|
||||||
|
|
||||||
|
# Loop over terms
|
||||||
|
for sub_term_name, sub_term in term.categories.items():
|
||||||
|
# Exponentiate beta if requested
|
||||||
|
beta = sub_term.beta
|
||||||
|
if self.exp:
|
||||||
|
beta = np.exp(beta)
|
||||||
|
|
||||||
|
out += '<tr><td style="text-align:right;font-style:italic">{}</td><td>{:.2f}</td><td style="padding-right:0">({:.2f}</td><td>–</td><td style="padding-left:0">{:.2f})</td><td style="text-align:left">{}</td></tr>'.format(sub_term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(sub_term.pvalue, html=True, nospace=True))
|
||||||
|
else:
|
||||||
|
raise Exception('Attempt to render unknown term type')
|
||||||
|
|
||||||
out += '</table>'
|
out += '</table>'
|
||||||
|
|
||||||
@ -273,13 +293,34 @@ class RegressionResult:
|
|||||||
# Render coefficients table
|
# Render coefficients table
|
||||||
table_data = []
|
table_data = []
|
||||||
|
|
||||||
for term, beta in self.beta.items():
|
for term_name, term in self.terms.items():
|
||||||
# Exponentiate if requested
|
if isinstance(term, SingleTerm):
|
||||||
if self.exp:
|
# Single term
|
||||||
beta = np.exp(estimate)
|
|
||||||
|
|
||||||
# Add some extra padding
|
# Exponentiate beta if requested
|
||||||
table_data.append([term + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(self.pvalues[term], html=False, nospace=True)])
|
beta = term.beta
|
||||||
|
if self.exp:
|
||||||
|
beta = np.exp(beta)
|
||||||
|
|
||||||
|
# Add some extra padding
|
||||||
|
table_data.append([term_name + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(term.pvalue, html=False, nospace=True)])
|
||||||
|
elif isinstance(term, CategoricalTerm):
|
||||||
|
# Categorical term
|
||||||
|
table_data.append([term_name + ' ', '', '', '', '', ''])
|
||||||
|
|
||||||
|
# Render reference category
|
||||||
|
table_data.append(['{} '.format(term.ref_category), 'Ref.', '', '', '', ''])
|
||||||
|
|
||||||
|
# Loop over terms
|
||||||
|
for sub_term_name, sub_term in term.categories.items():
|
||||||
|
# Exponentiate beta if requested
|
||||||
|
beta = sub_term.beta
|
||||||
|
if self.exp:
|
||||||
|
beta = np.exp(beta)
|
||||||
|
|
||||||
|
table_data.append([sub_term_name + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(sub_term.pvalue, html=False, nospace=True)])
|
||||||
|
else:
|
||||||
|
raise Exception('Attempt to render unknown term type')
|
||||||
|
|
||||||
table2 = SimpleTable(data=table_data, headers=['', 'exp(β)' if self.exp else 'β', '', '\ue000', '', ' p']) # U+E000 is in Private Use Area, mark middle of CI column
|
table2 = SimpleTable(data=table_data, headers=['', 'exp(β)' if self.exp else 'β', '', '\ue000', '', ' p']) # U+E000 is in Private Use Area, mark middle of CI column
|
||||||
table2_text = table2.as_text().replace(' \ue000 ', '(95% CI)') # Render heading in the right spot
|
table2_text = table2.as_text().replace(' \ue000 ', '(95% CI)') # Render heading in the right spot
|
||||||
@ -293,6 +334,27 @@ class RegressionResult:
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
class SingleTerm:
|
||||||
|
"""
|
||||||
|
A term in a RegressionResult which is a single term
|
||||||
|
|
||||||
|
raw_name: The raw name of the term (e.g. in RegressionResult.raw_result data)
|
||||||
|
beta: An Estimate of the coefficient
|
||||||
|
pvalue: The p value
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, raw_name, beta, pvalue):
|
||||||
|
self.raw_name = raw_name
|
||||||
|
self.beta = beta
|
||||||
|
self.pvalue = pvalue
|
||||||
|
|
||||||
|
class CategoricalTerm:
|
||||||
|
"""A group of terms in a RegressionResult corresponding to a categorical variable"""
|
||||||
|
|
||||||
|
def __init__(self, categories, ref_category):
|
||||||
|
self.categories = categories
|
||||||
|
self.ref_category = ref_category
|
||||||
|
|
||||||
def regress(
|
def regress(
|
||||||
model_class, df, dep, formula, *,
|
model_class, df, dep, formula, *,
|
||||||
nan_policy='warn', exp=None
|
nan_policy='warn', exp=None
|
||||||
@ -328,8 +390,40 @@ def regress(
|
|||||||
result.exp = exp
|
result.exp = exp
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# Process terms
|
||||||
|
terms = {}
|
||||||
|
|
||||||
confint = result.conf_int()
|
confint = result.conf_int()
|
||||||
beta = {t: Estimate(b, confint[0][t], confint[1][t]) for t, b in result.params.items()}
|
|
||||||
|
for raw_name, raw_beta in result.params.items():
|
||||||
|
beta = Estimate(raw_beta, confint[0][raw_name], confint[1][raw_name])
|
||||||
|
|
||||||
|
# Rename terms
|
||||||
|
if raw_name == 'Intercept':
|
||||||
|
# Intercept term (single term)
|
||||||
|
term = '(Intercept)'
|
||||||
|
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||||
|
elif '[T.' in raw_name:
|
||||||
|
# Categorical term
|
||||||
|
term = raw_name[:raw_name.index('[T.')]
|
||||||
|
category = raw_name[raw_name.index('[T.')+3:raw_name.index(']')]
|
||||||
|
|
||||||
|
if term.startswith('C('):
|
||||||
|
term = term[2:-1]
|
||||||
|
|
||||||
|
# Add a new categorical term if not exists
|
||||||
|
if term not in terms:
|
||||||
|
# Try to guess the ref_category
|
||||||
|
# FIXME: This is a VERY brittle implementation!!
|
||||||
|
ref_category = sorted(df[term].unique())[0]
|
||||||
|
|
||||||
|
terms[term] = CategoricalTerm({}, ref_category)
|
||||||
|
|
||||||
|
terms[term].categories[category] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||||
|
else:
|
||||||
|
# Single term
|
||||||
|
term = raw_name
|
||||||
|
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||||
|
|
||||||
# Fit null model (for llnull)
|
# Fit null model (for llnull)
|
||||||
if hasattr(result, 'llnull'):
|
if hasattr(result, 'llnull'):
|
||||||
@ -346,7 +440,7 @@ def regress(
|
|||||||
result,
|
result,
|
||||||
'Logistic Regression' if model_class is sm.Logit else '{} Regression'.format(model_class.__name__), model_class.__name__, header_dict['Method:'],
|
'Logistic Regression' if model_class is sm.Logit else '{} Regression'.format(model_class.__name__), model_class.__name__, header_dict['Method:'],
|
||||||
dep, result.nobs, result.df_model, datetime.now(),
|
dep, result.nobs, result.df_model, datetime.now(),
|
||||||
beta, result.pvalues,
|
terms,
|
||||||
result.llf, llnull,
|
result.llf, llnull,
|
||||||
getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
|
getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
|
||||||
exp
|
exp
|
||||||
@ -388,14 +482,14 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
|
|||||||
# Fit the model
|
# Fit the model
|
||||||
model = ro.r('logistf(formula_, data=df)')
|
model = ro.r('logistf(formula_, data=df)')
|
||||||
|
|
||||||
beta = {t: Estimate(b, ci0, ci1) for t, b, ci0, ci1 in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'])}
|
# TODO: Handle categorical terms?
|
||||||
pvalues = {t: p for t, p in zip(model['terms'], model['prob'])}
|
terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])}
|
||||||
|
|
||||||
return RegressionResult(
|
return RegressionResult(
|
||||||
model,
|
model,
|
||||||
'Penalised Logistic Regression', 'Logit', 'Penalised ML',
|
'Penalised Logistic Regression', 'Logit', 'Penalised ML',
|
||||||
self.endog_names, model['n'][0], model['df'][0], datetime.now(),
|
self.endog_names, model['n'][0], model['df'][0], datetime.now(),
|
||||||
beta, pvalues,
|
terms,
|
||||||
model['loglik'][0], model['loglik'][1],
|
model['loglik'][0], model['loglik'][1],
|
||||||
None, None, None,
|
None, None, None,
|
||||||
None # Set exp in regress()
|
None # Set exp in regress()
|
||||||
|
Loading…
Reference in New Issue
Block a user