scipy-yli/yli/regress.py

#   scipy-yli: Helpful SciPy utilities and recipes
#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels
import statsmodels.api as sm
from statsmodels.iolib.table import SimpleTable
from statsmodels.stats.outliers_influence import variance_inflation_factor

from datetime import datetime
import itertools

from .bayes_factors import BayesFactor, bayesfactor_afbf
from .config import config
from .sig_tests import FTestResult
from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category, parse_patsy_term

def vif(df, formula=None, nan_policy='warn'):
	"""
	Calculate the variance inflation factor for each variable in df
	
	formula: If specified, calculate the VIF only for the variables in the formula
	"""
	
	if formula:
		# Only consider columns in the formula
		df = df[cols_for_formula(formula, df)]
	
	# Check for/clean NaNs
	df = check_nan(df, nan_policy)
	
	# Convert all to float64 otherwise statsmodels chokes with "ufunc 'isfinite' not supported for the input types ..."
	df = pd.get_dummies(df, drop_first=True)  # Convert categorical dtypes
	df = df.astype('float64')  # Convert all other dtypes
	
	# Add intercept column
	orig_columns = list(df.columns)
	df['Intercept'] = [1] * len(df)
	
	vifs = {}
	
	for i, col in enumerate(orig_columns):
		vifs[col] = variance_inflation_factor(df, i)
	
	return pd.Series(vifs)

# ----------
# Regression

class LikelihoodRatioTestResult:
	"""Result of a likelihood ratio test for regression"""
	
	def __init__(self, statistic, dof, pvalue):
		self.statistic = statistic
		self.dof = dof
		self.pvalue = pvalue
	
	def _repr_html_(self):
		return 'LR({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=True))
	
	def summary(self):
		return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False))

class RegressionResult:
	"""
	Result of a regression
	
	llf/llnull: Log-likelihood of model/null model
	"""
	
	def __init__(self,
		raw_result,
		full_name, model_name, fit_method,
		dep, nobs, dof_model, fitted_dt,
		terms,
		llf, llnull,
		dof_resid, rsquared, f_statistic,
		exp
	):
		# A copy of the raw result so we can access it
		self.raw_result = raw_result
		
		# Information for display
		self.full_name = full_name
		self.model_name = model_name
		self.fit_method = fit_method
		
		# Basic fitted model information
		self.dep = dep
		self.nobs = nobs
		self.dof_model = dof_model
		self.fitted_dt = fitted_dt
		
		# Regression coefficients/p values
		self.terms = terms
		
		# Model log-likelihood
		self.llf = llf
		self.llnull = llnull
		
		# Extra statistics (not all regression models have these)
		self.dof_resid = dof_resid
		self.rsquared = rsquared
		self.f_statistic = f_statistic
		
		# Config for display style
		self.exp = exp
	
	@property
	def pseudo_rsquared(self):
		"""McFadden's pseudo R-squared"""
		
		return 1 - self.llf/self.llnull
	
	def lrtest_null(self):
		"""Compute the likelihood ratio test comparing the model to the null model"""
		
		statistic = -2 * (self.llnull - self.llf)
		pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model)
		
		return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue)
	
	def ftest(self):
		"""Perform the F test that all slopes are 0"""
		
		pvalue = 1 - stats.f(self.dof_model, self.dof_resid).cdf(self.f_statistic)
		return FTestResult(self.f_statistic, self.dof_model, self.dof_resid, pvalue)
	
	def bayesfactor_beta_zero(self, term):
		"""
		Compute Bayes factor testing the hypothesis that the given beta is 0
		Requires statsmodels regression
		"""
		
		# Get parameters required for AFBF
		params = pd.Series({term.raw_name.replace('[', '_').replace(']', '_'): term.beta.point for term in self.terms.values()})
		cov = self.raw_result.cov_params()
		
		# Compute BF matrix
		bf01 = bayesfactor_afbf(params, cov, self.nobs, '{} = 0'.format(term.replace('[', '_').replace(']', '_')))
		bf01 = BayesFactor(bf01.factor, '0', '{} = 0'.format(term), '1', '{} ≠ 0'.format(term))
		
		if bf01.factor >= 1:
			return bf01
		else:
			return bf01.invert()
	
	def _header_table(self, html):
		"""Return the entries for the header table"""
		
		# Left column
		left_col = []
		
		left_col.append(('Dep. Variable:', self.dep))
		left_col.append(('Model:', self.model_name))
		left_col.append(('Method:', self.fit_method))
		left_col.append(('Date:', self.fitted_dt.strftime('%Y-%m-%d')))
		left_col.append(('Time:', self.fitted_dt.strftime('%H:%M:%S')))
		left_col.append(('No. Observations:', format(self.nobs, '.0f')))
		
		# Right column
		right_col = []
		
		right_col.append(('Df. Model:', format(self.dof_model, '.0f')))
		if self.dof_resid:
			right_col.append(('Df. Residuals:', format(self.dof_resid, '.0f')))
		if self.rsquared:
			right_col.append(('<i>R</i><sup>2</sup>:' if html else 'R²:', format(self.rsquared, '.2f')))
		else:
			right_col.append(('Pseudo <i>R</i><sup>2</sup>:' if html else 'Pseudo R²:', format(self.pseudo_rsquared, '.2f')))
		if self.f_statistic:
			# Report the F test if available
			f_result = self.ftest()
			
			if html:
				right_col.append(('<i>F</i>:', format(f_result.statistic, '.2f')))
				right_col.append(('<i>p</i> (<i>F</i>):', fmt_p(f_result.pvalue, html=True, tabular=True)))
			else:
				right_col.append(('F:', format(f_result.statistic, '.2f')))
				right_col.append(('p (F):', fmt_p(f_result.pvalue, html=False, tabular=True)))
		else:
			# Otherwise report likelihood ratio test as overall test
			lrtest_result = self.lrtest_null()
			
			right_col.append(('LL-Model:', format(self.llf, '.2f')))
			right_col.append(('LL-Null:', format(self.llnull, '.2f')))
			if html:
				right_col.append(('<i>p</i> (LR):', fmt_p(lrtest_result.pvalue, html=True, tabular=True)))
			else:
				right_col.append(('p (LR):', fmt_p(lrtest_result.pvalue, html=False, tabular=True)))
		
		return left_col, right_col
	
	def _repr_html_(self):
		# Render header table
		left_col, right_col = self._header_table(html=True)
		
		out = '<table><caption>{} Results</caption>'.format(self.full_name)
		for left_cell, right_cell in itertools.zip_longest(left_col, right_col):
			out += '<tr><th>{}</th><td>{}</td><th>{}</th><td>{}</td></tr>'.format(
				left_cell[0] if left_cell else '',
				left_cell[1] if left_cell else '',
				right_cell[0] if right_cell else '',
				right_cell[1] if right_cell else ''
			)
		out += '</table>'
		
		# Render coefficients table
		out += '<table><tr><th></th><th style="text-align:center">{}</th><th colspan="3" style="text-align:center">({:g}% CI)</th><th style="text-align:center"><i>p</i></th></tr>'.format('exp(<i>β</i>)' if self.exp else '<i>β</i>', (1-config.alpha)*100)
		
		for term_name, term in self.terms.items():
			if isinstance(term, SingleTerm):
				# Single term
				
				# Exponentiate beta if requested
				beta = term.beta
				if self.exp:
					beta = np.exp(beta)
				
				out += '<tr><th>{}</th><td>{:.2f}</td><td style="padding-right:0">({:.2f}</td><td>–</td><td style="padding-left:0">{:.2f})</td><td style="text-align:left">{}</td></tr>'.format(term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(term.pvalue, html=True, tabular=True))
			elif isinstance(term, CategoricalTerm):
				# Categorical term
				out += '<tr><th>{}</th><td></td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term_name)
				
				# Render reference category
				out += '<tr><td style="text-align:right;font-style:italic">{}</td><td>Ref.</td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term.ref_category)
				
				# Loop over terms
				for sub_term_name, sub_term in term.categories.items():
					# Exponentiate beta if requested
					beta = sub_term.beta
					if self.exp:
						beta = np.exp(beta)
					
					out += '<tr><td style="text-align:right;font-style:italic">{}</td><td>{:.2f}</td><td style="padding-right:0">({:.2f}</td><td>–</td><td style="padding-left:0">{:.2f})</td><td style="text-align:left">{}</td></tr>'.format(sub_term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(sub_term.pvalue, html=True, tabular=True))
			else:
				raise Exception('Attempt to render unknown term type')
		
		out += '</table>'
		
		# TODO: Have a detailed view which shows SE, t/z, etc.
		
		return out
	
	def summary(self):
		# Render header table
		left_col, right_col = self._header_table(html=False)
		
		# Ensure equal sizes for SimpleTable
		if len(right_col) > len(left_col):
			left_col.extend([('', '')] * (len(right_col) - len(left_col)))
		elif len(left_col) > len(right_col):
			right_col.extend([('', '')] * (len(left_col) - len(right_col)))
		
		table1 = SimpleTable(np.concatenate([left_col, right_col], axis=1), title='{} Results'.format(self.full_name))
		table1.insert_stubs(2, [' | '] * len(left_col))
		
		# Get rid of last line (merge with next table)
		table1_lines = table1.as_text().splitlines(keepends=False)
		out = '\n'.join(table1_lines[:-1]) + '\n'
		
		# Render coefficients table
		table_data = []
		
		for term_name, term in self.terms.items():
			if isinstance(term, SingleTerm):
				# Single term
				
				# Exponentiate beta if requested
				beta = term.beta
				if self.exp:
					beta = np.exp(beta)
				
				# Add some extra padding
				table_data.append([term_name + '  ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), '  ' + fmt_p(term.pvalue, html=False, tabular=True)])
			elif isinstance(term, CategoricalTerm):
				# Categorical term
				table_data.append([term_name + '  ', '', '', '', '', ''])
				
				# Render reference category
				table_data.append(['{}  '.format(term.ref_category), 'Ref.', '', '', '', ''])
				
				# Loop over terms
				for sub_term_name, sub_term in term.categories.items():
					# Exponentiate beta if requested
					beta = sub_term.beta
					if self.exp:
						beta = np.exp(beta)
					
					table_data.append([sub_term_name + '  ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), '  ' + fmt_p(sub_term.pvalue, html=False, tabular=True)])
			else:
				raise Exception('Attempt to render unknown term type')
		
		table2 = SimpleTable(data=table_data, headers=['', 'exp(β)' if self.exp else 'β', '', '\ue000', '', ' p'])  # U+E000 is in Private Use Area, mark middle of CI column
		table2_text = table2.as_text().replace('    \ue000   ', '({:g}% CI)'.format((1-config.alpha)*100))  # Render heading in the right spot
		table2_lines = table2_text.splitlines(keepends=False)
		
		# Render divider line between 2 tables
		max_table_len = max(len(table1_lines[-1]), len(table2_lines[-1]))
		out += '=' * max_table_len + '\n'
		
		out += '\n'.join(table2_lines[1:])
		
		return out

class SingleTerm:
	"""
	A term in a RegressionResult which is a single term
	
	raw_name: The raw name of the term (e.g. in RegressionResult.raw_result data)
	beta: An Estimate of the coefficient
	pvalue: The p value
	"""
	
	def __init__(self, raw_name, beta, pvalue):
		self.raw_name = raw_name
		self.beta = beta
		self.pvalue = pvalue

class CategoricalTerm:
	"""A group of terms in a RegressionResult corresponding to a categorical variable"""
	
	def __init__(self, categories, ref_category):
		self.categories = categories
		self.ref_category = ref_category

def regress(
	model_class, df, dep, formula, *,
	nan_policy='warn',
	model_kwargs=None, fit_kwargs=None,
	family=None,  # common model_kwargs
	cov_type=None, maxiter=None, start_params=None,  # common fit_kwargs
	bool_baselevels=False, exp=None
):
	"""
	Fit a statsmodels regression model
	
	bool_baselevels: Show reference categories for boolean independent variables even if reference category is False
	exp: Report exponentiated parameters rather than raw parameters
	"""
	
	# Populate model_kwargs
	if model_kwargs is None:
		model_kwargs = {}
	if family is not None:
		model_kwargs['family'] = family
	
	# Populate fit_kwargs
	if fit_kwargs is None:
		fit_kwargs = {}
	if cov_type is not None:
		fit_kwargs['cov_type'] = cov_type
	if maxiter is not None:
		fit_kwargs['maxiter'] = maxiter
	if start_params is not None:
		fit_kwargs['start_params'] = start_params
	
	# Autodetect whether to exponentiate
	if exp is None:
		if model_class in (sm.Logit, sm.Poisson, PenalisedLogit):
			exp = True
		else:
			exp = False
	
	# Check for/clean NaNs
	df = df[[dep] + cols_for_formula(formula, df)]
	df = check_nan(df, nan_policy)
	
	# Ensure numeric type for dependent variable
	if df[dep].dtype != 'float64':
		df[dep] = df[dep].astype('float64')
	
	# Convert pandas nullable types for independent variables
	for col in df.columns:
		if df[col].dtype == 'Int64':
			df[col] = df[col].astype('float64')
	
	# Fit model
	model = model_class.from_formula(formula=dep + ' ~ ' + formula, data=df, **model_kwargs)
	result = model.fit(**fit_kwargs)
	
	if isinstance(result, RegressionResult):
		# Already processed!
		result.exp = exp
		return result
	
	# Process terms
	terms = {}
	
	confint = result.conf_int(config.alpha)
	
	for raw_name, raw_beta in result.params.items():
		beta = Estimate(raw_beta, confint[0][raw_name], confint[1][raw_name])
		
		# Rename terms
		if raw_name == 'Intercept':
			# Intercept term (single term)
			term = '(Intercept)'
			terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
		else:
			# Parse if required
			factor, column, contrast = parse_patsy_term(formula, df, raw_name)
			
			if contrast is not None:
				# Categorical term
				
				if bool_baselevels is False and contrast == 'True' and set(df[column].unique()) == set([True, False]):
					# Treat as single term
					terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
				else:
					# Add a new categorical term if not exists
					if column not in terms:
						ref_category = formula_factor_ref_category(formula, df, factor)
						terms[column] = CategoricalTerm({}, ref_category)
					
					terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
			else:
				# Single term
				terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
	
	# Fit null model (for llnull)
	if hasattr(result, 'llnull'):
		llnull = result.llnull
	else:
		result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
		llnull = result_null.llf
	
	# Parse raw regression results (to get fit method)
	header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1)))
	header_dict = {x[0]: x[1] for x in header_entries}
	
	# Get full name to display
	if model_class is sm.Logit:
		full_name = 'Logistic Regression'
	else:
		full_name = '{} Regression'.format(model_class.__name__)
	if fit_kwargs.get('cov_type', 'nonrobust') != 'nonrobust':
		full_name = 'Robust {}'.format(full_name)
	
	return RegressionResult(
		result,
		full_name, model_class.__name__, header_dict['Method:'],
		dep, result.nobs, result.df_model, datetime.now(),
		terms,
		result.llf, llnull,
		getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
		exp
	)

def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs):
	"""Perform logistic regression, then use parameters as start parameters for desired regression"""
	
	# Check for/clean NaNs
	# Do this once here so we only get 1 warning
	df = df[[dep] + cols_for_formula(formula, df)]
	df = check_nan(df, nan_policy)
	
	# Perform logistic regression
	logit_result = regress(sm.Logit, df, dep, formula, **kwargs)
	logit_params = logit_result.raw_result.params
	
	# Perform desired regression
	return regress(model_class, df, dep, formula, start_params=logit_params, **kwargs)

# -----------------------------
# Penalised logistic regression

class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
	"""
	Statsmodel-compatible model for computing Firth penalised logistic regression
	Uses R "logistf" library
	
	NB: This class expects to be used in the context of yli.regress()
	"""
	
	def fit(self):
		import rpy2.robjects as ro
		import rpy2.robjects.packages
		import rpy2.robjects.pandas2ri
		
		# Assume data is already cleaned from regress()
		df = self.data.frame.copy()
		
		# Convert bool to int otherwise rpy2 chokes
		df = df.replace({False: 0, True: 1})
		
		# Import logistf
		ro.packages.importr('logistf')
		
		with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
			with ro.local_context() as lc:
				# Convert DataFrame to R
				lc['df'] = df
				
				# Transfer other parameters to R
				lc['formula_'] = self.formula
				lc['alpha'] = config.alpha
				
				# Fit the model
				model = ro.r('logistf(formula_, data=df, alpha=alpha)')
				
				# TODO: Handle categorical terms?
				terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])}
				
				return RegressionResult(
					model,
					'Penalised Logistic Regression', 'Logit', 'Penalised ML',
					self.endog_names, model['n'][0], model['df'][0], datetime.now(),
					terms,
					model['loglik'][0], model['loglik'][1],
					None, None, None,
					None  # Set exp in regress()
				)