Implement regression helpers

RunasSudo 2022-10-13 15:57:56 +11:00
parent e1051d8aff
commit 987520e700
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
3 changed files with 434 additions and 0 deletions
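
This adds yli.regress, a thin wrapper around statsmodels model classes that fits a model from a Patsy-style formula and returns a RegressionResult with HTML and plain-text summaries. A minimal usage sketch (the data frame below is invented for illustration; the tests added in tests/test_regress.py are the authoritative worked examples):

import pandas as pd
import statsmodels.api as sm
import yli

# Hypothetical data: predict y from x1 and x2
df = pd.DataFrame({
    'y':  [2.1, 3.9, 6.2, 7.8, 10.1, 12.2],
    'x1': [1, 2, 3, 4, 5, 6],
    'x2': [0, 1, 0, 1, 0, 1],
})

result = yli.regress(sm.OLS, df, 'y', 'x1 + x2')  # dependent variable, then RHS formula
print(result.summary())               # header table plus coefficient/CI/p table
print(result.beta['x1'].point)        # point estimate for the x1 slope
print(result.ftest().summary())       # overall F test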

tests/test_regress.py (new file, 123 lines)

@@ -0,0 +1,123 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from pytest import approx

import numpy as np
import pandas as pd
import statsmodels.api as sm

import yli


def test_regress_ols_ol11_4():
    """Compare yli.regress for Ott & Longnecker (2016) example 11.4/11.7"""

    df = pd.DataFrame({
        'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
        'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
    })

    result = yli.regress(sm.OLS, df, 'GrowthRet', 'SoilPh')

    assert result.dof_model == 1
    assert result.dof_resid == 18
    assert result.f_statistic == approx(52.01, abs=0.01)
    assert result.ftest().pvalue < 0.0005
    assert result.rsquared == approx(0.7429, abs=0.0001)
    assert result.beta['Intercept'].point == approx(47.48, abs=0.01)
    assert result.pvalues['Intercept'] < 0.0005
    assert result.beta['SoilPh'].point == approx(-7.86, abs=0.01)
    assert result.pvalues['SoilPh'] < 0.0005
    assert result.beta['SoilPh'].ci_lower == approx(-10.15, abs=0.01)
    assert result.beta['SoilPh'].ci_upper == approx(-5.57, abs=0.01)


def test_regress_ols_ol13_5():
    """Compare yli.regress for Ott & Longnecker (2016) chapter 13.5"""

    df = pd.DataFrame({
        'C': [460.05, 452.99, 443.22, 652.32, 642.23, 345.39, 272.37, 317.21, 457.12, 690.19, 350.63, 402.59, 412.18, 495.58, 394.36, 423.32, 712.27, 289.66, 881.24, 490.88, 567.79, 665.99, 621.45, 608.8, 473.64, 697.14, 207.51, 288.48, 284.88, 280.36, 217.38, 270.71],
        'D': [68.58, 67.33, 67.33, 68, 68, 67.92, 68.17, 68.42, 68.42, 68.33, 68.58, 68.75, 68.42, 68.92, 68.92, 68.42, 69.5, 68.42, 69.17, 68.92, 68.75, 70.92, 69.67, 70.08, 70.42, 71.08, 67.25, 67.17, 67.83, 67.83, 67.25, 67.83],
        'T1': [14, 10, 10, 11, 11, 13, 12, 14, 15, 12, 12, 13, 15, 17, 13, 11, 18, 15, 15, 16, 11, 22, 16, 19, 19, 20, 13, 9, 12, 12, 13, 7],
        'T2': [46, 73, 85, 67, 78, 51, 50, 59, 55, 71, 64, 47, 62, 52, 65, 67, 60, 76, 67, 59, 70, 57, 59, 58, 44, 57, 63, 48, 63, 71, 72, 80],
        'S': [687, 1065, 1065, 1065, 1065, 514, 822, 457, 822, 792, 560, 790, 530, 1050, 850, 778, 845, 530, 1090, 1050, 913, 828, 786, 821, 538, 1130, 745, 821, 886, 886, 745, 886],
        'PR': [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1],
        'NE': [1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'CT': [0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0],
        'BW': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1],
        'N': [14, 1, 1, 12, 12, 3, 5, 1, 5, 2, 3, 6, 2, 7, 16, 3, 17, 2, 1, 8, 15, 20, 18, 3, 19, 21, 8, 7, 11, 11, 8, 11],
        'PT': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    })
    df['LNC'] = np.log(df['C'])

    result = yli.regress(sm.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')

    assert result.dof_model == 10
    assert result.dof_resid == 21
    assert result.f_statistic == approx(13.28, abs=0.01)
    assert result.ftest().pvalue < 0.00005
    assert result.rsquared == approx(0.8635, abs=0.0001)
    assert result.beta['Intercept'].point == approx(-10.63398, abs=0.00001)
    assert result.pvalues['Intercept'] == approx(0.0766, abs=0.0001)
    assert result.beta['D'].point == approx(0.22760, abs=0.00001)
    assert result.pvalues['D'] == approx(0.0157, abs=0.0001)
    assert result.beta['T1'].point == approx(0.00525, abs=0.00001)
    assert result.pvalues['T1'] == approx(0.8161, abs=0.0001)
    assert result.beta['T2'].point == approx(0.00561, abs=0.00001)
    assert result.pvalues['T2'] == approx(0.2360, abs=0.0001)
    assert result.beta['S'].point == approx(0.00088369, abs=0.00000001)
    assert result.pvalues['S'] < 0.0001
    assert result.beta['PR'].point == approx(-0.10813, abs=0.00001)
    assert result.pvalues['PR'] == approx(0.2094, abs=0.0001)
    assert result.beta['NE'].point == approx(0.25949, abs=0.00001)
    assert result.pvalues['NE'] == approx(0.0036, abs=0.0001)
    assert result.beta['CT'].point == approx(0.11554, abs=0.00001)
    assert result.pvalues['CT'] == approx(0.1150, abs=0.0001)
    assert result.beta['BW'].point == approx(0.03680, abs=0.00001)
    assert result.pvalues['BW'] == approx(0.7326, abs=0.0001)
    assert result.beta['N'].point == approx(-0.01203, abs=0.00001)
    assert result.pvalues['N'] == approx(0.1394, abs=0.0001)
    assert result.beta['PT'].point == approx(-0.22197, abs=0.00001)
    assert result.pvalues['PT'] == approx(0.1035, abs=0.0001)


def test_regress_logit_ol12_23():
    """Compare yli.regress for Ott & Longnecker (2016) chapter 12.23"""

    df = pd.DataFrame({
        'Unhealthy': [False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, True, False, False],
        'Fibrinogen': [2.52, 2.46, 2.29, 3.15, 2.88, 2.29, 2.99, 2.38, 2.56, 3.22, 2.35, 3.53, 2.65, 2.15, 3.32, 2.23, 2.19, 2.21, 5.06, 2.68, 2.09, 2.54, 2.18, 2.68, 3.41, 3.15, 3.35, 2.60, 2.28, 3.93, 2.60, 3.34],
        'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
    })

    result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')

    # Some numerical differences as intercept term is very negative
    lrtest_result = result.lrtest_null()
    assert lrtest_result.statistic == approx(7.9138, rel=0.01)
    assert lrtest_result.dof == 2
    assert lrtest_result.pvalue == approx(0.0191, rel=0.02)

    expbeta_fib = np.exp(result.beta['Fibrinogen'])
    assert expbeta_fib.point == approx(6.756, rel=0.01)
    assert expbeta_fib.ci_lower == approx(1.007, rel=0.01)
    assert expbeta_fib.ci_upper == approx(45.308, rel=0.02)

    expbeta_gam = np.exp(result.beta['GammaGlobulin'])
    assert expbeta_gam.point == approx(1.169, abs=0.001)
    assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
    assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)

yli/__init__.py

@@ -16,6 +16,7 @@
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .fs import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
from .regress import regress
from .sig_tests import chi2, mannwhitney, ttest_ind
def reload_me():

yli/regress.py (new file, 310 lines)

@@ -0,0 +1,310 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import numpy as np
import patsy
from scipy import stats
import statsmodels.api as sm
from statsmodels.iolib.table import SimpleTable

from datetime import datetime
import itertools

from .utils import Estimate, check_nan, fmt_p_html, fmt_p_text


def cols_for_formula(formula):
    """Return the columns corresponding to the Patsy formula"""

    model_desc = patsy.ModelDesc.from_formula(formula)

    cols = set()
    for term in model_desc.rhs_termlist:
        for factor in term.factors:
            name = factor.name()
            if '(' in name:
                # FIXME: Is there a better way of doing this?
                name = name[name.index('(')+1:name.index(')')]
            cols.add(name)

    return list(cols)
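
# For example (illustrative only, not part of the library):
#   cols_for_formula('D + T1 + np.log(S)')
# would return ['D', 'T1', 'S'], in arbitrary order since a set is used internally.
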
# ----------
# Regression


class LikelihoodRatioTestResult:
    """Result of a likelihood ratio test for regression"""

    def __init__(self, statistic, dof, pvalue):
        self.statistic = statistic
        self.dof = dof
        self.pvalue = pvalue

    def _repr_html_(self):
        return 'LR({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue))

    def summary(self):
        return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue))


class FTestResult:
    """Result of an F test for regression"""

    def __init__(self, statistic, dof_model, dof_resid, pvalue):
        self.statistic = statistic
        self.dof_model = dof_model
        self.dof_resid = dof_resid
        self.pvalue = pvalue

    def _repr_html_(self):
        return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof_model, self.dof_resid, self.statistic, fmt_p_html(self.pvalue))

    def summary(self):
        return 'F({}, {}) = {:.2f}; p {}'.format(self.dof_model, self.dof_resid, self.statistic, fmt_p_text(self.pvalue))
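
# For example, an FTestResult with statistic 52.01 and 1 and 18 degrees of freedom renders its
# summary() as 'F(1, 18) = 52.01; p ...', where the trailing p-value text is delegated to
# fmt_p_text in yli.utils.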


class RegressionResult:
    """
    Result of a regression

    llf/llnull: Log-likelihood of model/null model
    """

    def __init__(self,
        raw_result,
        full_name, model_name, fit_method,
        dep, nobs, dof_model, fitted_dt,
        beta, pvalues,
        llf, llnull,
        dof_resid, rsquared, f_statistic,
        exp
    ):
        # A copy of the raw result so we can access it
        self.raw_result = raw_result

        # Information for display
        self.full_name = full_name
        self.model_name = model_name
        self.fit_method = fit_method

        # Basic fitted model information
        self.dep = dep
        self.nobs = nobs
        self.dof_model = dof_model
        self.fitted_dt = fitted_dt

        # Regression coefficients
        self.beta = beta
        self.pvalues = pvalues

        # Model log-likelihood
        self.llf = llf
        self.llnull = llnull

        # Extra statistics (not all regression models have these)
        self.dof_resid = dof_resid
        self.rsquared = rsquared
        self.f_statistic = f_statistic

        # Config for display style
        self.exp = exp

    @property
    def pseudo_rsquared(self):
        """McFadden's pseudo R-squared"""
        return 1 - self.llf/self.llnull

    def lrtest_null(self):
        """Compute the likelihood ratio test comparing the model to the null model"""
        statistic = -2 * (self.llnull - self.llf)
        pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model)
        return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue)

    def ftest(self):
        """Perform the F test that all slopes are 0"""
        pvalue = 1 - stats.f(self.dof_model, self.dof_resid).cdf(self.f_statistic)
        return FTestResult(self.f_statistic, self.dof_model, self.dof_resid, pvalue)
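
    # The F test is reported as the overall significance test when an F statistic is
    # available (e.g. OLS); otherwise the likelihood ratio test against the null model
    # is reported instead.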

    def _header_table(self, html):
        """Return the entries for the header table"""

        # Left column
        left_col = []
        left_col.append(('Dep. Variable:', self.dep))
        left_col.append(('Model:', self.model_name))
        left_col.append(('Method:', self.fit_method))
        left_col.append(('Date:', self.fitted_dt.strftime('%Y-%m-%d')))
        left_col.append(('Time:', self.fitted_dt.strftime('%H:%M:%S')))
        left_col.append(('No. Observations:', format(self.nobs, '.0f')))

        # Right column
        right_col = []
        if self.dof_resid:
            right_col.append(('Df. Residuals:', format(self.dof_resid, '.0f')))
        right_col.append(('Df. Model:', format(self.dof_model, '.0f')))
        if self.rsquared:
            right_col.append(('<i>R</i><sup>2</sup>:' if html else 'R²:', format(self.rsquared, '.2f')))
        else:
            right_col.append(('Pseudo <i>R</i><sup>2</sup>:' if html else 'Pseudo R²:', format(self.pseudo_rsquared, '.2f')))
        if self.f_statistic:
            # Report the F test if available
            f_result = self.ftest()
            if html:
                right_col.append(('<i>F</i>:', format(f_result.statistic, '.2f')))
                right_col.append(('<i>p</i> (<i>F</i>):', fmt_p_html(f_result.pvalue, True)))
            else:
                right_col.append(('F:', format(f_result.statistic, '.2f')))
                right_col.append(('p (F):', fmt_p_text(f_result.pvalue, True)))
        else:
            # Otherwise report likelihood ratio test as overall test
            lrtest_result = self.lrtest_null()

            right_col.append(('LL-Model:', format(self.llf, '.2f')))
            right_col.append(('LL-Null:', format(self.llnull, '.2f')))
            if html:
                right_col.append(('<i>p</i> (LR):', fmt_p_html(lrtest_result.pvalue, True)))
            else:
                right_col.append(('p (LR):', fmt_p_text(lrtest_result.pvalue, True)))

        return left_col, right_col

    def _repr_html_(self):
        # Render header table
        left_col, right_col = self._header_table(html=True)

        out = '<table><caption>{} Results</caption>'.format(self.full_name)
        for left_cell, right_cell in itertools.zip_longest(left_col, right_col):
            out += '<tr><th>{}</th><td>{}</td><th>{}</th><td>{}</td></tr>'.format(
                left_cell[0] if left_cell else '',
                left_cell[1] if left_cell else '',
                right_cell[0] if right_cell else '',
                right_cell[1] if right_cell else ''
            )
        out += '</table>'

        # Render coefficients table
        out += '<table><tr><th></th><th style="text-align:center">{}</th><th colspan="3" style="text-align:center">(95% CI)</th><th style="text-align:center"><i>p</i></th></tr>'.format('exp(<i>β</i>)' if self.exp else '<i>β</i>')

        for term, beta in self.beta.items():
            # Exponentiate if requested
            if self.exp:
                beta = np.exp(beta)

            out += '<tr><th>{}</th><td>{:.2f}</td><td style="padding-right:0">({:.2f}</td><td>–</td><td style="padding-left:0">{:.2f})</td><td style="text-align:left">{}</td></tr>'.format(term, beta.point, beta.ci_lower, beta.ci_upper, fmt_p_html(self.pvalues[term], True))

        out += '</table>'

        # TODO: Have a detailed view which shows SE, t/z, etc.

        return out

    def summary(self):
        # Render header table
        left_col, right_col = self._header_table(html=False)

        # Ensure equal sizes for SimpleTable
        if len(right_col) > len(left_col):
            left_col.extend([('', '')] * (len(right_col) - len(left_col)))
        elif len(left_col) > len(right_col):
            right_col.extend([('', '')] * (len(left_col) - len(right_col)))

        table1 = SimpleTable(np.concatenate([left_col, right_col], axis=1), title='{} Results'.format(self.full_name))
        table1.insert_stubs(2, [' | '] * len(left_col))

        # Get rid of last line (merge with next table)
        table1_lines = table1.as_text().splitlines(keepends=False)
        out = '\n'.join(table1_lines[:-1]) + '\n'

        # Render coefficients table
        table_data = []
        for term, beta in self.beta.items():
            # Exponentiate if requested
            if self.exp:
                beta = np.exp(beta)

            # Add some extra padding
            table_data.append([term + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p_text(self.pvalues[term], True)])

        table2 = SimpleTable(data=table_data, headers=['', 'exp(β)' if self.exp else 'β', '', '\ue000', '', ' p']) # U+E000 is in Private Use Area, mark middle of CI column
        table2_text = table2.as_text().replace(' \ue000 ', '(95% CI)') # Render heading in the right spot
        table2_lines = table2_text.splitlines(keepends=False)

        # Render divider line between 2 tables
        max_table_len = max(len(table1_lines[-1]), len(table2_lines[-1]))
        out += '=' * max_table_len + '\n'

        out += '\n'.join(table2_lines[1:])

        return out


def regress(
    model_class, df, dep, formula, *,
    nan_policy='warn', exp=None
):
    """Fit a statsmodels regression model"""

    # Autodetect whether to exponentiate
    if exp is None:
        if model_class is sm.Logit:
            exp = True
        else:
            exp = False

    # Check for/clean NaNs
    df = df[[dep] + cols_for_formula(formula)]
    df = check_nan(df, nan_policy)

    # Ensure numeric type for dependent variable
    if df[dep].dtype != 'float64':
        df[dep] = df[dep].astype('float64')

    # Convert pandas nullable types for independent variables
    for col in df.columns:
        if df[col].dtype == 'Int64':
            df[col] = df[col].astype('float64')

    # Fit model
    model = model_class.from_formula(formula=dep + ' ~ ' + formula, data=df)
    result = model.fit()

    confint = result.conf_int()
    beta = {t: Estimate(b, confint[0][t], confint[1][t]) for t, b in result.params.items()}

    # Fit null model (for llnull)
    if hasattr(result, 'llnull'):
        llnull = result.llnull
    else:
        result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
        llnull = result_null.llf

    # Parse raw regression results (to get fit method)
    header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1)))
    header_dict = {x[0]: x[1] for x in header_entries}

    return RegressionResult(
        result,
        'Logistic Regression' if model_class is sm.Logit else '{} Regression'.format(model_class.__name__), model_class.__name__, header_dict['Method:'],
        dep, result.nobs, result.df_model, datetime.now(),
        beta, result.pvalues,
        result.llf, llnull,
        getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
        exp
    )
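
# Illustrative usage of the exp auto-detection, mirroring tests/test_regress.py:
#   result = regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
#   result.summary()   # coefficients rendered as exp(β), i.e. odds ratios, since exp defaults to True for sm.Logit
#   regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin', exp=False).summary()   # force raw β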