Implement ttest_ind_multiple

2023-04-20 15:41:03 +10:00 · 2023-04-20 15:41:03 +10:00 · 844e6bdec9
commit 844e6bdec9
parent 2852d3dd19
3 changed files with 98 additions and 1 deletions
--- a/docs/sig_tests.rst
+++ b/docs/sig_tests.rst
@@ -18,6 +18,8 @@ Functions

 .. autofunction:: yli.ttest_ind

+.. autofunction:: yli.ttest_ind_multiple
+
 Result classes
 --------------

@@ -36,6 +38,9 @@ Result classes
 .. autoclass:: yli.sig_tests.MannWhitneyResult
 	:members:

+.. autoclass:: yli.sig_tests.MultipleTTestResult
+	:members:
+
 .. autoclass:: yli.sig_tests.PearsonChiSquaredResult
 	:members:
 	:inherited-members:
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -21,7 +21,7 @@ from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .graphs import init_fonts
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
 from .regress import IntervalCensoredCox, Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
-from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
+from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind, ttest_ind_multiple
 from .survival import kaplanmeier, logrank, turnbull
 from .utils import as_ordinal

--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@@ -180,6 +180,98 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'):
 		delta=Estimate(delta, ci0, ci1),
 		delta_direction='{} > {}'.format(group1, group2))

+class MultipleTTestResult:
+	"""
+	Result of multiple Student's *t* tests, adjusted for multiplicity
+	
+	See :func:`yli.ttest_ind_multiple`.
+	"""
+	
+	def __init__(self, *, dep, results):
+		#: Name of the dependent variable (*str*)
+		self.dep = dep
+		#: Results of the *t* tests (*List[*\ :class:`TTestResult`\ *]*)
+		self.results = results
+	
+	def _comparison_table(self, html):
+		"""Return a table of mean (SD) per group and the p value for each test, as HTML if *html* is true, otherwise as plain text"""
+		
+		group1 = self.results[0].group1
+		group2 = self.results[0].group2
+		
+		# TODO: Render HTML directly so can have proper HTML p values
+		table_data = []
+		for row in self.results:
+			cell1 = '{:.2f} ({:.2f})'.format(row.mu1, row.sd1)
+			cell2 = '{:.2f} ({:.2f})'.format(row.mu2, row.sd2)
+			cell_pvalue = fmt_p(row.pvalue, PValueStyle.TABULAR)
+			
+			# Order the cells to match the column order, which is taken from the first result
+			if row.group1 == group1 and row.group2 == group2:
+				table_data.append([cell1, cell2, cell_pvalue])
+			elif row.group1 == group2 and row.group2 == group1:
+				table_data.append([cell2, cell1, cell_pvalue])
+			else:
+				raise Exception('t tests have different groups')
+		
+		if html:
+			table = pd.DataFrame(table_data, index=pd.Index([row.ind for row in self.results], name='\ue000 (SD)'), columns=pd.Index([self.results[0].group1, self.results[0].group2, '\ue001'], name=self.dep)) # U+E000 and U+E001 are Private Use Area placeholders, swapped for italic μ and p markup after rendering
+			table_str = table._repr_html_()
+			return table_str.replace('\ue000', '<i>μ</i>').replace('\ue001', '<i>p</i>')
+		else:
+			table = pd.DataFrame(table_data, index=pd.Index([row.ind for row in self.results], name='μ (SD)'), columns=pd.Index([self.results[0].group1, self.results[0].group2, 'p'], name=self.dep))
+			return str(table)
+	
+	def __repr__(self):
+		if config.repr_is_summary:
+			return self.summary()
+		return super().__repr__()
+	
+	def _repr_html_(self):
+		return self._comparison_table(True)
+	
+	def summary(self):
+		"""
+		Return a stringified summary of the *t* tests
+		
+		:rtype: str
+		"""
+		return str(self._comparison_table(False))
+
+def ttest_ind_multiple(df, dep, inds, *, nan_policy='warn', method='hs'):
+	"""
+	Perform independent 2-sample Student's *t* tests with multiple independent variables, adjusting for multiplicity
+	
+	:param df: Data to perform the test on
+	:type df: DataFrame
+	:param dep: Column in *df* for the dependent variable (numeric)
+	:type dep: str
+	:param inds: Columns in *df* for the independent variables (dichotomous)
+	:type inds: List[str]
+	:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
+	:type nan_policy: str
+	:param method: Method to apply for multiplicity adjustment (see `statsmodels multipletests <https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`_)
+	:type method: str
+	
+	:rtype: :class:`yli.sig_tests.MultipleTTestResult`
+	"""
+	
+	# TODO: Unit testing
+	# FIXME: Assert groups of independent variables have same levels
+	
+	# Perform one t test per independent variable
+	results = []
+	for ind in inds:
+		results.append(ttest_ind(df, dep, ind, nan_policy=nan_policy))
+	
+	# Adjust for multiplicity, overwriting each result's p value with the corrected one
+	_, pvalues_corrected, _, _ = sm.stats.multipletests([result.pvalue for result in results], alpha=config.alpha, method=method)
+	
+	for result, pvalue_corrected in zip(results, pvalues_corrected):
+		result.pvalue = pvalue_corrected
+	
+	return MultipleTTestResult(dep=dep, results=results)
+
 # -------------
 # One-way ANOVA