2022-10-11 22:52:42 +11:00
# scipy-yli: Helpful SciPy utilities and recipes
2023-02-26 00:05:10 +11:00
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
2022-10-11 22:52:42 +11:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
2022-10-13 13:25:24 +11:00
import numpy as np
2022-10-11 22:52:42 +11:00
import pandas as pd
from scipy import stats
import statsmodels . api as sm
import functools
import warnings
2022-10-15 23:24:29 +11:00
from . config import config
2022-12-03 20:00:29 +11:00
from . utils import Estimate , Interval , PValueStyle , as_2groups , as_numeric , check_nan , convert_pandas_nullable , fmt_p
2022-10-11 22:52:42 +11:00
2022-10-13 12:53:18 +11:00
# ----------------
# Student's t test
2022-10-11 22:52:42 +11:00
class TTestResult :
"""
2022-10-17 21:41:19 +11:00
Result of a Student ' s *t* test
2022-10-11 22:52:42 +11:00
2022-10-17 21:41:19 +11:00
See : func : ` yli . ttest_ind ` .
2022-10-11 22:52:42 +11:00
"""
2022-11-09 22:25:17 +11:00
def __init__ ( self , * , statistic , dof , pvalue , dep , ind , group1 , group2 , mu1 , mu2 , sd1 , sd2 , delta , delta_direction ) :
2022-10-17 21:41:19 +11:00
#: *t* statistic (*float*)
2022-10-11 22:52:42 +11:00
self . statistic = statistic
2022-10-17 21:41:19 +11:00
#: Degrees of freedom of the *t* distribution (*int*)
2022-10-11 22:52:42 +11:00
self . dof = dof
2022-10-17 21:41:19 +11:00
#: *p* value for the *t* statistic (*float*)
2022-10-11 22:52:42 +11:00
self . pvalue = pvalue
2022-11-09 22:08:27 +11:00
2022-11-09 22:25:17 +11:00
#: Name of the dependent variable (*str*)
self . dep = dep
#: Name of the independent variable (*str*)
self . ind = ind
2022-11-09 17:01:45 +11:00
#: Name of the first group (*str*)
self . group1 = group1
#: Name of the second group (*str*)
self . group2 = group2
#: Mean of the first group (*float*)
self . mu1 = mu1
#: Mean of the second group (*float*)
self . mu2 = mu2
#: Standard deviation of the first group (*float*)
self . sd1 = sd1
#: Standard deviation of the second group (*float*)
self . sd2 = sd2
2022-11-09 22:08:27 +11:00
2022-10-17 21:41:19 +11:00
#: Absolute value of the mean difference (:class:`yli.utils.Estimate`)
2022-10-11 22:52:42 +11:00
self . delta = delta
2022-10-17 21:41:19 +11:00
#: Description of the direction of the effect (*str*)
2022-10-13 12:53:18 +11:00
self . delta_direction = delta_direction
2022-10-11 22:52:42 +11:00
2022-11-09 18:18:47 +11:00
def _comparison_table ( self , html ) :
""" Return a table showing the means/SDs for each group """
2022-11-09 22:25:17 +11:00
table_data = [ [
' {:.2f} ( {:.2f} ) ' . format ( self . mu1 , self . sd1 ) ,
' {:.2f} ( {:.2f} ) ' . format ( self . mu2 , self . sd2 ) ,
] ]
2022-11-09 18:18:47 +11:00
if html :
2022-11-09 22:25:17 +11:00
table = pd . DataFrame ( table_data , index = pd . Index ( [ ' \ue000 (SD) ' ] , name = self . dep ) , columns = pd . Index ( [ self . group1 , self . group2 ] , name = self . ind ) ) # U+E000 is in Private Use Area, mark μ symbol
2022-11-09 18:18:47 +11:00
table_str = table . _repr_html_ ( )
return table_str . replace ( ' \ue000 ' , ' <i>μ</i> ' )
else :
2022-11-09 22:25:17 +11:00
table = pd . DataFrame ( table_data , index = pd . Index ( [ ' μ (SD) ' ] , name = self . dep ) , columns = pd . Index ( [ self . group1 , self . group2 ] , name = self . ind ) )
2022-11-09 18:18:47 +11:00
return str ( table )
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-11 22:52:42 +11:00
def _repr_html_ ( self ) :
2022-11-09 18:18:47 +11:00
return ' {} <br><i>t</i>( {:.0f} ) = {:.2f} ; <i>p</i> {} <br>Δ<i>μ</i> ( {:g} % CI) = {} , {} ' . format ( self . _comparison_table ( True ) , self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) , ( 1 - config . alpha ) * 100 , self . delta . summary ( ) , self . delta_direction )
2022-10-11 22:52:42 +11:00
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the * t * test
: rtype : str
"""
2022-11-09 18:18:47 +11:00
return ' {} \n \n t( {:.0f} ) = {:.2f} ; p {} \n Δμ ( {:g} % CI) = {} , {} ' . format ( self . _comparison_table ( False ) , self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION ) , ( 1 - config . alpha ) * 100 , self . delta . summary ( ) , self . delta_direction )
2022-11-09 23:31:27 +11:00
def summary_short ( self , html ) :
"""
Return a stringified summary of the * t * test ( * t * statistic only )
: rtype : str
"""
if html :
return ' <i>t</i>( {:.0f} ) = {:.2f} ' . format ( self . dof , self . statistic )
else :
return ' t( {:.0f} ) = {:.2f} ' . format ( self . dof , self . statistic )
2022-10-11 22:52:42 +11:00
def ttest_ind ( df , dep , ind , * , nan_policy = ' warn ' ) :
2022-10-17 21:41:19 +11:00
"""
Perform an independent 2 - sample Student ' s *t* test
: param df : Data to perform the test on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param ind : Column in * df * for the independent variable ( dichotomous )
: type ind : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: rtype : : class : ` yli . sig_tests . TTestResult `
2022-10-18 19:25:12 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' Type ' : [ ' Fresh ' ] * 10 + [ ' Stored ' ] * 10 ,
' Potency ' : [ 10.2 , 10.5 , 10.3 , . . . ]
} )
yli . ttest_ind ( df , ' Potency ' , ' Type ' )
. . code - block : : text
2022-11-09 22:25:17 +11:00
Type Fresh Stored
Potency
μ ( SD ) 10.37 ( 0.32 ) 9.83 ( 0.24 )
2022-10-18 19:25:12 +11:00
t ( 18 ) = 4.24 ; p < 0.001 *
Δμ ( 95 % CI ) = 0.54 ( 0.27 – 0.81 ) , Fresh > Stored
The output states that the value of the * t * statistic is 4.24 , the * t * distribution has 18 degrees of freedom , and the test is significant with * p * value < 0.001 .
The mean difference is 0.54 in favour of the * Fresh * group , with 95 % confidence interval 0.27 – 0.81 .
2022-10-17 21:41:19 +11:00
"""
2022-10-11 22:52:42 +11:00
2022-10-13 12:53:18 +11:00
# Check for/clean NaNs
2022-10-11 22:52:42 +11:00
df = check_nan ( df [ [ ind , dep ] ] , nan_policy )
2022-10-13 12:53:18 +11:00
# Ensure 2 groups for ind
group1 , data1 , group2 , data2 = as_2groups ( df , dep , ind )
2022-10-11 22:52:42 +11:00
# Do t test
# Use statsmodels rather than SciPy because this provides the mean difference automatically
2022-11-09 17:01:45 +11:00
d1 = sm . stats . DescrStatsW ( data1 , ddof = 1 )
d2 = sm . stats . DescrStatsW ( data2 , ddof = 1 )
2022-10-11 22:52:42 +11:00
2022-10-13 12:53:18 +11:00
cm = sm . stats . CompareMeans ( d1 , d2 )
2022-10-11 22:52:42 +11:00
statistic , pvalue , dof = cm . ttest_ind ( )
2022-10-13 12:53:18 +11:00
delta = d1 . mean - d2 . mean
2022-10-15 23:24:29 +11:00
ci0 , ci1 = cm . tconfint_diff ( config . alpha )
2022-10-11 22:52:42 +11:00
2022-11-09 17:01:45 +11:00
# t test is symmetric so take absolute value
if d2 . mean > d1 . mean :
delta , ci0 , ci1 = - delta , - ci1 , - ci0
d1 , d2 = d2 , d1
group1 , group2 = group2 , group1
# Now group1 > group2
2022-10-13 12:53:18 +11:00
return TTestResult (
statistic = abs ( statistic ) , dof = dof , pvalue = pvalue ,
2022-11-09 22:25:17 +11:00
dep = dep , ind = ind ,
2022-11-09 17:01:45 +11:00
group1 = group1 , group2 = group2 , mu1 = d1 . mean , mu2 = d2 . mean , sd1 = d1 . std , sd2 = d2 . std ,
delta = Estimate ( delta , ci0 , ci1 ) ,
delta_direction = ' {} > {} ' . format ( group1 , group2 ) )
2022-10-13 13:14:51 +11:00
2023-04-20 15:41:03 +10:00
class MultipleTTestResult :
"""
Result of multiple Student ' s *t* tests, adjusted for multiplicity
See : func : ` yli . ttest_ind_multiple ` .
"""
def __init__ ( self , * , dep , results ) :
#: Name of the dependent variable (*str*)
self . dep = dep
#: Results of the *t* tests (*List[*\ :class:`TTestResult`\ *]*)
self . results = results
def _comparison_table ( self , html ) :
""" Return a table showing the means/SDs for each group """
group1 = self . results [ 0 ] . group1
group2 = self . results [ 0 ] . group2
# TODO: Render HTML directly so can have proper HTML p values
table_data = [ ]
for row in self . results :
cell1 = ' {:.2f} ( {:.2f} ) ' . format ( row . mu1 , row . sd1 )
cell2 = ' {:.2f} ( {:.2f} ) ' . format ( row . mu2 , row . sd2 )
cell_pvalue = fmt_p ( row . pvalue , PValueStyle . TABULAR )
# Display the cells the right way around
if row . group1 == group1 and row . group2 == group2 :
table_data . append ( [ cell1 , cell2 , cell_pvalue ] )
elif row . group1 == group2 and row . group2 == group1 :
table_data . append ( [ cell2 , cell1 , cell_pvalue ] )
else :
raise Exception ( ' t tests have different groups ' )
if html :
table = pd . DataFrame ( table_data , index = pd . Index ( [ row . ind for row in self . results ] , name = ' \ue000 (SD) ' ) , columns = pd . Index ( [ self . results [ 0 ] . group1 , self . results [ 0 ] . group2 , ' \ue001 ' ] , name = self . dep ) ) # U+E000 is in Private Use Area, mark μ symbol
table_str = table . _repr_html_ ( )
return table_str . replace ( ' \ue000 ' , ' <i>μ</i> ' ) . replace ( ' \ue001 ' , ' <i>p</i> ' )
else :
table = pd . DataFrame ( table_data , index = pd . Index ( [ row . ind for row in self . results ] , name = ' μ (SD) ' ) , columns = pd . Index ( [ self . results [ 0 ] . group1 , self . results [ 0 ] . group2 , ' p ' ] , name = self . dep ) )
return str ( table )
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
def _repr_html_ ( self ) :
return self . _comparison_table ( True )
def summary ( self ) :
"""
Return a stringified summary of the * t * tests
: rtype : str
"""
return str ( self . _comparison_table ( False ) )
def ttest_ind_multiple ( df , dep , inds , * , nan_policy = ' warn ' , method = ' hs ' ) :
"""
Perform independent 2 - sample Student ' s *t* tests with multiple independent variables, adjusting for multiplicity
: param df : Data to perform the test on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param ind : Columns in * df * for the independent variables ( dichotomous )
: type ind : List [ str ]
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: param method : Method to apply for multiplicity adjustment ( see ` statsmodels multipletests < https : / / www . statsmodels . org / dev / generated / statsmodels . stats . multitest . multipletests . html > ` _ )
: type method : str
: rtype : : class : ` yli . sig_tests . MultipleTTestResult `
"""
# TODO: Unit testing
# FIXME: Assert groups of independent variables have same levels
# Perform t tests
results = [ ]
for ind in inds :
results . append ( ttest_ind ( df , dep , ind , nan_policy = nan_policy ) )
# Adjust for multiplicity
_ , pvalues_corrected , _ , _ = sm . stats . multipletests ( [ result . pvalue for result in results ] , alpha = config . alpha , method = method )
for result , pvalue_corrected in zip ( results , pvalues_corrected ) :
result . pvalue = pvalue_corrected
return MultipleTTestResult ( dep = dep , results = results )
2022-10-14 20:18:25 +11:00
# -------------
# One-way ANOVA
class FTestResult :
2022-10-17 21:41:19 +11:00
"""
Result of an * F * test for ANOVA / regression
2023-04-16 23:52:12 +10:00
See : func : ` yli . anova_oneway ` and : meth : ` yli . regress . RegressionModel . ftest ` .
2022-10-17 21:41:19 +11:00
"""
2022-10-14 20:18:25 +11:00
def __init__ ( self , statistic , dof1 , dof2 , pvalue ) :
2022-10-17 21:41:19 +11:00
#: *F* statistic (*float*)
2022-10-14 20:18:25 +11:00
self . statistic = statistic
2022-10-17 21:41:19 +11:00
#: Degrees of freedom in the *F* distribution numerator (*int*)
2022-10-14 20:18:25 +11:00
self . dof1 = dof1
2022-10-17 21:41:19 +11:00
#: Degrees of freedom in the *F* distribution denominator (*int*)
2022-10-14 20:18:25 +11:00
self . dof2 = dof2
2022-10-17 21:41:19 +11:00
#: *p* value for the *F* statistic (*float*)
2022-10-14 20:18:25 +11:00
self . pvalue = pvalue
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-14 20:18:25 +11:00
def _repr_html_ ( self ) :
2022-11-09 17:39:33 +11:00
return ' <i>F</i>( {:.0f} , {:.0f} ) = {:.2f} ; <i>p</i> {} ' . format ( self . dof1 , self . dof2 , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) )
2022-10-14 20:18:25 +11:00
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the * F * test
: rtype : str
"""
2022-11-09 17:39:33 +11:00
return ' F( {:.0f} , {:.0f} ) = {:.2f} ; p {} ' . format ( self . dof1 , self . dof2 , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION ) )
2022-10-14 20:18:25 +11:00
2022-11-10 18:28:27 +11:00
def anova_oneway ( df , dep , ind , * , nan_policy = ' warn ' ) :
2022-10-17 21:41:19 +11:00
"""
Perform one - way ANOVA
: param df : Data to perform the test on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param ind : Column in * df * for the independent variable ( categorical )
: type ind : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: rtype : : class : ` yli . sig_tests . FTestResult `
2022-10-18 19:25:12 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' Method ' : [ 1 ] * 8 + [ 2 ] * 7 + [ 3 ] * 9 ,
' Score ' : [ 96 , 79 , 91 , . . . ]
} )
yli . anova_oneway ( df , ' Score ' , ' Method ' )
. . code - block : : text
F ( 2 , 21 ) = 29.57 ; p < 0.001 *
The output states that the value of the * F * statistic is 29.57 , the * F * distribution has 2 degrees of freedom in the numerator and 21 in the denominator , and the test is significant with * p * value < 0.001 .
2022-10-17 21:41:19 +11:00
"""
2022-10-14 20:18:25 +11:00
# Check for/clean NaNs
df = check_nan ( df [ [ ind , dep ] ] , nan_policy )
# Group by independent variable
groups = df . groupby ( ind ) [ dep ]
# Perform one-way ANOVA
result = stats . f_oneway ( * [ groups . get_group ( k ) for k in groups . groups ] )
# See stats.f_oneway implementation
dfbn = len ( groups . groups ) - 1
dfwn = len ( df ) - len ( groups . groups )
return FTestResult ( result . statistic , dfbn , dfwn , result . pvalue )
2022-10-13 13:14:51 +11:00
# -----------------
# Mann-Whitney test
class MannWhitneyResult :
"""
2022-11-10 18:28:27 +11:00
Result of a Mann – Whitney * U * test
2022-10-13 13:14:51 +11:00
2022-10-17 21:41:19 +11:00
See : func : ` yli . mannwhitney ` .
2022-10-13 13:14:51 +11:00
"""
2022-11-09 22:25:17 +11:00
def __init__ ( self , * , statistic , pvalue , dep , ind , group1 , group2 , med1 , med2 , iqr1 , iqr2 , range1 , range2 , rank_biserial , direction , brunnermunzel = None ) :
2022-10-18 19:25:12 +11:00
#: Lesser of the two Mann–Whitney *U* statistics (*float*)
2022-10-13 13:14:51 +11:00
self . statistic = statistic
2022-10-17 21:41:19 +11:00
#: *p* value for the *U* statistic (*float*)
2022-10-13 13:14:51 +11:00
self . pvalue = pvalue
2022-11-09 22:08:27 +11:00
2022-11-09 22:25:17 +11:00
#: Name of the dependent variable (*str*)
self . dep = dep
#: Name of the independent variable (*str*)
self . ind = ind
2022-11-09 22:08:27 +11:00
#: Name of the first group (*str*)
self . group1 = group1
#: Name of the second group (*str*)
self . group2 = group2
#: Median of the first group (*float*)
self . med1 = med1
#: Median of the second group (*float*)
self . med2 = med2
2022-11-09 22:10:15 +11:00
#: Interquartile range of the first group (:class:`yli.utils.Interval`)
2022-11-09 22:08:27 +11:00
self . iqr1 = iqr1
2022-11-09 22:10:15 +11:00
#: Interquartile range of the second group (:class:`yli.utils.Interval`)
2022-11-09 22:08:27 +11:00
self . iqr2 = iqr2
2022-11-09 22:10:15 +11:00
#: Range of the first group (:class:`yli.utils.Interval`)
2022-11-09 22:08:27 +11:00
self . range1 = range2
2022-11-09 22:10:15 +11:00
#: Range of the second group (:class:`yli.utils.Interval`)
2022-11-09 22:08:27 +11:00
self . range2 = range2
2022-10-17 21:41:19 +11:00
#: Absolute value of the rank-biserial correlation (*float*)
2022-10-13 13:14:51 +11:00
self . rank_biserial = rank_biserial
2022-10-17 21:41:19 +11:00
#: Description of the direction of the effect (*str*)
2022-10-13 13:14:51 +11:00
self . direction = direction
2022-11-09 22:08:27 +11:00
2022-10-17 21:41:19 +11:00
#: :class:`BrunnerMunzelResult` on the same data, or *None* if N/A
2022-10-13 13:14:51 +11:00
self . brunnermunzel = brunnermunzel
2022-11-09 22:08:27 +11:00
def _comparison_table ( self , html ) :
""" Return a table showing the medians/IQRs/ranges for each group """
2022-11-09 22:25:17 +11:00
table_data = [
[ ' {:.2f} ( {} ) ' . format ( self . med1 , self . iqr1 . summary ( ) ) , ' {:.2f} ( {} ) ' . format ( self . med2 , self . iqr2 . summary ( ) ) ] ,
[ ' {:.2f} ( {} ) ' . format ( self . med1 , self . range1 . summary ( ) ) , ' {:.2f} ( {} ) ' . format ( self . med2 , self . range2 . summary ( ) ) ] ,
]
table = pd . DataFrame ( table_data , index = pd . Index ( [ ' Median (IQR) ' , ' Median (range) ' ] , name = self . dep ) , columns = pd . Index ( [ self . group1 , self . group2 ] , name = self . ind ) )
2022-11-09 22:08:27 +11:00
if html :
return table . _repr_html_ ( )
else :
return str ( table )
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-13 13:14:51 +11:00
def _repr_html_ ( self ) :
2022-11-09 22:08:27 +11:00
line1 = ' {} <br><i>U</i> = {:.1f} ; <i>p</i> {} <br><i>r</i> = {:.2f} , {} ' . format ( self . _comparison_table ( True ) , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) , self . rank_biserial , self . direction )
2022-10-13 13:14:51 +11:00
if self . brunnermunzel :
return line1 + ' <br> ' + self . brunnermunzel . _repr_html_ ( )
else :
return line1
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the Mann – Whitney test
: rtype : str
"""
2022-11-09 22:08:27 +11:00
line1 = ' {} \n \n U = {:.1f} ; p {} \n r = {:.2f} , {} ' . format ( self . _comparison_table ( False ) , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION ) , self . rank_biserial , self . direction )
2022-10-13 13:14:51 +11:00
if self . brunnermunzel :
return line1 + ' \n ' + self . brunnermunzel . summary ( )
else :
return line1
2022-11-09 23:31:27 +11:00
def summary_short ( self , html ) :
"""
Return a stringified summary of the Mann – Whitney test ( * U * statistic only )
: rtype : str
"""
if html :
return ' <i>U</i> = {:.1f} ' . format ( self . statistic )
else :
return ' U = {:.1f} ' . format ( self . statistic )
2022-10-13 13:14:51 +11:00
class BrunnerMunzelResult :
2022-10-17 21:41:19 +11:00
"""
Result of a Brunner – Munzel test
See : func : ` yli . mannwhitney ` . This library calls the Brunner – Munzel test statistic * W * .
"""
2022-10-13 13:14:51 +11:00
""" Result of a Brunner-Munzel test """
def __init__ ( self , statistic , pvalue ) :
2022-10-17 21:41:19 +11:00
#: *W* statistic (*float*)
2022-10-13 13:14:51 +11:00
self . statistic = statistic
2022-10-17 21:41:19 +11:00
#: *p* value for the *W* statistic (*float*)
2022-10-13 13:14:51 +11:00
self . pvalue = pvalue
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-13 13:14:51 +11:00
def _repr_html_ ( self ) :
2022-11-09 17:39:33 +11:00
return ' <i>W</i> = {:.1f} ; <i>p</i> {} ' . format ( self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) )
2022-10-13 13:14:51 +11:00
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the Brunner – Munzel test
: rtype : str
"""
2022-11-09 17:39:33 +11:00
return ' W = {:.1f} ; p {} ' . format ( self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION ) )
2022-10-13 13:14:51 +11:00
def mannwhitney ( df , dep , ind , * , nan_policy = ' warn ' , brunnermunzel = True , use_continuity = False , alternative = ' two-sided ' , method = ' auto ' ) :
"""
2022-11-10 18:28:27 +11:00
Perform a Mann – Whitney * U * test
2022-10-17 21:41:19 +11:00
By default , this function performs a Brunner – Munzel test if the Mann – Whitney test is significant .
If the Mann – Whitney test is significant but the Brunner – Munzel test is not , a warning is raised .
The Brunner – Munzel test is returned only if non - significant .
: param df : Data to perform the test on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param ind : Column in * df * for the independent variable ( dichotomous )
: type ind : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: param brunnermunzel : Whether to compute the Brunner – Munzel test if the Mann – Whitney test is significant
: type brunnermunzel : bool
: param use_continuity : See * scipy . stats . mannwhitneyu *
: param alternative : See * scipy . stats . mannwhitneyu *
: param method : See * scipy . stats . mannwhitneyu *
2022-11-09 17:05:04 +11:00
: return : The result of the Mann – Whitney test . The result of a Brunner – Munzel test is included in the result object if and only if * brunnermunzel * is * True * , * and * the Mann – Whitney test is significant , * and * the Brunner – Munzel test is non - significant .
2022-10-20 20:57:24 +11:00
2022-10-17 21:41:19 +11:00
: rtype : : class : ` yli . sig_tests . MannWhitneyResult `
2022-10-18 19:25:12 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' Sample ' : [ ' Before ' ] * 12 + [ ' After ' ] * 12 ,
' Oxygen ' : [ 11.0 , 11.2 , 11.2 , . . . ]
} )
yli . mannwhitney ( df , ' Oxygen ' , ' Sample ' , method = ' asymptotic ' , alternative = ' less ' )
. . code - block : : text
2022-11-09 22:25:17 +11:00
Sample After Before
Oxygen
2022-11-09 22:08:27 +11:00
Median ( IQR ) 10.75 ( 10.55 – 10.95 ) 11.55 ( 11.20 – 11.83 )
Median ( range ) 10.75 ( 11.00 – 12.10 ) 11.55 ( 11.00 – 12.10 )
2022-10-18 19:25:12 +11:00
U = 6.0 ; p < 0.001 *
r = 0.92 , Before > After
The output states that the value of the Mann – Whitney * U * statistic is 6.0 , and the one - sided test is significant with asymptotic * p * value < 0.001 .
The rank - biserial correlation is 0.92 in favour of the * Before * group .
2022-10-13 13:14:51 +11:00
"""
# Check for/clean NaNs
df = check_nan ( df [ [ ind , dep ] ] , nan_policy )
2022-11-09 23:31:27 +11:00
# Convert pandas nullable types for independent variables as this breaks mannwhitneyu
2022-10-20 20:58:42 +11:00
df = convert_pandas_nullable ( df )
2022-10-13 13:14:51 +11:00
# Ensure 2 groups for ind
group1 , data1 , group2 , data2 = as_2groups ( df , dep , ind )
2023-02-07 18:49:57 +11:00
# Ensure numeric, factorising if required
data1 , _ = as_numeric ( data1 )
data2 , _ = as_numeric ( data2 )
2022-10-13 13:14:51 +11:00
# Do Mann-Whitney test
# Stata does not perform continuity correction
result = stats . mannwhitneyu ( data1 , data2 , use_continuity = use_continuity , alternative = alternative , method = method )
u1 = result . statistic
u2 = len ( data1 ) * len ( data2 ) - u1
r = abs ( 2 * u1 / ( len ( data1 ) * len ( data2 ) ) - 1 ) # rank-biserial
# If significant, perform a Brunner-Munzel test for our interest
if result . pvalue < 0.05 and brunnermunzel :
result_bm = stats . brunnermunzel ( data1 , data2 )
if result_bm . pvalue > = 0.05 :
warnings . warn ( ' Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location. ' )
return MannWhitneyResult (
statistic = min ( u1 , u2 ) , pvalue = result . pvalue ,
#med1=data1.median(), med2=data2.median(),
rank_biserial = r , direction = ( ' {1} > {0} ' if u1 < u2 else ' {0} > {1} ' ) . format ( group1 , group2 ) ,
brunnermunzel = BrunnerMunzelResult ( statistic = result_bm . statistic , pvalue = result_bm . pvalue ) )
return MannWhitneyResult (
statistic = min ( u1 , u2 ) , pvalue = result . pvalue ,
2022-11-09 22:25:17 +11:00
dep = dep , ind = ind ,
2022-11-09 22:08:27 +11:00
group1 = group1 , group2 = group2 ,
2022-11-09 22:10:15 +11:00
med1 = data1 . median ( ) , med2 = data2 . median ( ) , iqr1 = Interval ( data1 . quantile ( 0.25 ) , data1 . quantile ( 0.75 ) ) , iqr2 = Interval ( data2 . quantile ( 0.25 ) , data2 . quantile ( 0.75 ) ) , range1 = Interval ( data1 . min ( ) , data1 . max ( ) ) , range2 = Interval ( data2 . min ( ) , data2 . max ( ) ) ,
2022-10-13 13:14:51 +11:00
rank_biserial = r , direction = ( ' {1} > {0} ' if u1 < u2 else ' {0} > {1} ' ) . format ( group1 , group2 ) )
2022-10-13 13:25:24 +11:00
# ------------------------
# Pearson chi-squared test
2022-12-02 20:20:25 +11:00
class ChiSquaredResult :
2023-02-26 00:05:10 +11:00
"""
Result of a generic test with * χ * : sup : ` 2 ` - distributed test statistic
2023-04-21 15:27:42 +10:00
See : meth : ` yli . logrank ` , : meth : ` yli . regress . RegressionModel . deviance_chi2 ` .
2023-02-26 00:05:10 +11:00
See also : class : ` yli . regress . BrantResult ` , : class : ` yli . regress . LikelihoodRatioTestResult ` , : class : ` PearsonChiSquaredResult ` .
"""
2022-12-02 20:20:25 +11:00
def __init__ ( self , statistic , dof , pvalue ) :
#: *χ*:sup:`2` statistic (*float*)
self . statistic = statistic
#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
self . dof = dof
#: *p* value for the *χ*:sup:`2` test (*float*)
self . pvalue = pvalue
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
def _repr_html_ ( self ) :
return ' <i>χ</i><sup>2</sup>( {} ) = {:.2f} ; <i>p</i> {} ' . format ( self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) )
def summary ( self ) :
"""
Return a stringified summary of the * χ * : sup : ` 2 ` test
: rtype : str
"""
2023-02-25 17:23:20 +11:00
return ' χ²( {} ) = {:.2f} ; p {} ' . format ( self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION ) )
2022-12-02 20:20:25 +11:00
class PearsonChiSquaredResult ( ChiSquaredResult ) :
2022-10-17 21:41:19 +11:00
"""
Result of a Pearson * χ * : sup : ` 2 ` test
See : func : ` yli . chi2 ` .
"""
2022-10-13 13:25:24 +11:00
def __init__ ( self , ct , statistic , dof , pvalue , oddsratio = None , riskratio = None ) :
2022-12-02 20:20:25 +11:00
super ( ) . __init__ ( statistic , dof , pvalue )
2022-10-17 21:41:19 +11:00
#: Contingency table for the observations (*DataFrame*)
2022-10-13 13:25:24 +11:00
self . ct = ct
2022-10-17 21:41:19 +11:00
#: Odds ratio (*float*; *None* if not a 2×2 table)
2022-10-13 13:25:24 +11:00
self . oddsratio = oddsratio
2022-10-17 21:41:19 +11:00
#: Risk ratio (*float*; *None* if not a 2×2 table)
2022-10-13 13:25:24 +11:00
self . riskratio = riskratio
def _repr_html_ ( self ) :
if self . oddsratio is not None :
2022-10-15 23:24:29 +11:00
return ' {0} <br><i>χ</i><sup>2</sup>( {1} ) = {2:.2f} ; <i>p</i> {3} <br>OR ( {4:g} % CI) = {5} <br>RR ( {4:g} % CI) = {6} ' . format (
2022-11-09 17:39:33 +11:00
self . ct . _repr_html_ ( ) , self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) , ( 1 - config . alpha ) * 100 , self . oddsratio . summary ( ) , self . riskratio . summary ( ) )
2022-10-13 13:25:24 +11:00
else :
return ' {} <br><i>χ</i><sup>2</sup>( {} ) = {:.2f} ; <i>p</i> {} ' . format (
2022-11-09 17:39:33 +11:00
self . ct . _repr_html_ ( ) , self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) )
2022-10-13 13:25:24 +11:00
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the * χ * : sup : ` 2 ` test
: rtype : str
"""
2022-10-13 13:25:24 +11:00
if self . oddsratio is not None :
2022-10-18 19:25:12 +11:00
return ' {0} \n \n χ²( {1} ) = {2:.2f} ; p {3} \n OR ( {4:g} % CI) = {5} \n RR ( {4:g} % CI) = {6} ' . format (
2022-11-09 17:39:33 +11:00
self . ct , self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION ) , ( 1 - config . alpha ) * 100 , self . oddsratio . summary ( ) , self . riskratio . summary ( ) )
2022-10-13 13:25:24 +11:00
else :
2022-10-18 19:25:12 +11:00
return ' {} \n \n χ²( {} ) = {:.2f} ; p {} ' . format (
2022-11-09 17:39:33 +11:00
self . ct , self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION ) )
2022-11-09 23:31:27 +11:00
def summary_short ( self , html ) :
"""
Return a stringified summary of the * χ * : sup : ` 2 ` test ( * χ * : sup : ` 2 ` statistic only )
: rtype : str
"""
if html :
return ' <i>χ</i><sup>2</sup>( {} ) = {:.2f} ' . format ( self . dof , self . statistic )
else :
return ' χ²( {} ) = {:.2f} ' . format ( self . dof , self . statistic )
2022-10-13 13:25:24 +11:00
def chi2 ( df , dep , ind , * , nan_policy = ' warn ' ) :
2022-10-17 21:41:19 +11:00
"""
Perform a Pearson * χ * : sup : ` 2 ` test
If a 2 × 2 contingency table is obtained ( i . e . if both variables are dichotomous ) , an odds ratio and risk ratio are calculated .
2022-10-18 19:25:12 +11:00
The ratios are calculated for the higher - valued value in each variable ( i . e . * True * compared with * False * for a boolean ) .
The risk ratio is calculated relative to the independent variable ( rows of the contingency table ) .
2022-10-17 21:41:19 +11:00
: param df : Data to perform the test on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( categorical )
: type dep : str
: param ind : Column in * df * for the independent variable ( categorical )
: type ind : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: rtype : : class : ` yli . sig_tests . PearsonChiSquaredResult `
2022-10-18 19:25:12 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' Response ' : np . repeat ( [ False , True , False , True ] , [ 250 , 750 , 400 , 1600 ] ) ,
' Stress ' : np . repeat ( [ False , False , True , True ] , [ 250 , 750 , 400 , 1600 ] )
} )
yli . chi2 ( df , ' Stress ' , ' Response ' )
. . code - block : : text
Stress False True
Response
False 250 400
True 750 1600
χ² ( 1 ) = 9.82 ; p = 0.002 *
OR ( 95 % CI ) = 1.33 ( 1.11 – 1.60 )
RR ( 95 % CI ) = 1.11 ( 1.03 – 1.18 )
The output shows the contingency table , and states that the value of the Pearson * χ * : sup : ` 2 ` statistic is 9.82 , the * χ * : sup : ` 2 ` distribution has 1 degree of freedom , and the test is significant with * p * value 0.002 .
The odds of * Stress * in the * Response * = * True * group are 1.33 times that in the * Response * = * False * group , with 95 % confidence interval 1.11 – 1.60 .
2022-12-03 20:01:05 +11:00
The risk of * Stress * in the * Response * = * True * group is 1.11 times that in the * Response * = * False * group , with 95 % confidence interval 1.03 – 1.18 .
2022-10-17 21:41:19 +11:00
"""
2022-10-13 13:25:24 +11:00
# Check for/clean NaNs
df = check_nan ( df [ [ ind , dep ] ] , nan_policy )
# Compute contingency table
ct = pd . crosstab ( df [ ind ] , df [ dep ] )
# Get expected counts
expected = stats . contingency . expected_freq ( ct )
# Warn on low expected counts
if ( expected < 5 ) . sum ( ) / expected . size > 0.2 :
warnings . warn ( ' {} of {} cells ( {:.0f} % ) have expected count < 5 ' . format ( ( expected < 5 ) . sum ( ) , expected . size , ( expected < 5 ) . sum ( ) / expected . size * 100 ) )
if ( expected < 1 ) . any ( ) :
warnings . warn ( ' {} cells have expected count < 1 ' . format ( ( expected < 1 ) . sum ( ) ) )
if ct . shape == ( 2 , 2 ) :
# 2x2 table
2022-10-14 20:08:01 +11:00
# Use statsmodels to get OR and RR
2022-10-13 13:25:24 +11:00
smct = sm . stats . Table2x2 ( np . flip ( ct . to_numpy ( ) ) , shift_zeros = False )
result = smct . test_nominal_association ( )
2022-10-15 23:24:29 +11:00
ORci = smct . oddsratio_confint ( config . alpha )
RRci = smct . riskratio_confint ( config . alpha )
2022-10-13 13:25:24 +11:00
return PearsonChiSquaredResult (
ct = ct , statistic = result . statistic , dof = result . df , pvalue = result . pvalue ,
oddsratio = Estimate ( smct . oddsratio , ORci [ 0 ] , ORci [ 1 ] ) , riskratio = Estimate ( smct . riskratio , RRci [ 0 ] , RRci [ 1 ] ) )
else :
# rxc table
# Just use SciPy
result = stats . chi2_contingency ( ct , correction = False )
return PearsonChiSquaredResult ( ct = ct , statistic = result [ 0 ] , dof = result [ 2 ] , pvalue = result [ 1 ] )
2022-10-14 20:18:25 +11:00
# -------------------
# Pearson correlation
class PearsonRResult :
2022-10-17 21:41:19 +11:00
"""
2022-12-03 20:01:05 +11:00
Result of Pearson product - moment correlation
2022-10-17 21:41:19 +11:00
See : func : ` yli . pearsonr ` .
"""
2022-10-14 20:18:25 +11:00
def __init__ ( self , statistic , pvalue ) :
2022-12-03 20:01:05 +11:00
#: Pearson *r* correlation statistic (:class:`yli.utils.Estimate`)
2022-10-14 20:18:25 +11:00
self . statistic = statistic
2022-10-17 21:41:19 +11:00
#: *p* value for the *r* statistic (*float*)
2022-10-14 20:18:25 +11:00
self . pvalue = pvalue
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-14 20:18:25 +11:00
def _repr_html_ ( self ) :
2022-11-09 17:39:33 +11:00
return ' <i>r</i> ( {:g} % CI) = {} ; <i>p</i> {} ' . format ( ( 1 - config . alpha ) * 100 , self . statistic . summary ( ) , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) )
2022-10-14 20:18:25 +11:00
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the Pearson correlation
: rtype : str
"""
2022-11-09 17:39:33 +11:00
return ' r ( {:g} % CI) = {} ; p {} ' . format ( ( 1 - config . alpha ) * 100 , self . statistic . summary ( ) , fmt_p ( self . pvalue , PValueStyle . RELATION ) )
2022-10-14 20:18:25 +11:00
def pearsonr ( df , dep , ind , * , nan_policy = ' warn ' ) :
2022-10-17 21:41:19 +11:00
"""
2022-12-03 20:01:05 +11:00
Compute the Pearson product - moment correlation coefficient ( Pearson ' s *r*)
2022-10-17 21:41:19 +11:00
: param df : Data to perform the test on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numerical )
: type dep : str
: param ind : Column in * df * for the independent variable ( numerical )
: type ind : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: rtype : : class : ` yli . sig_tests . PearsonRResult `
2022-10-20 20:57:57 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' y ' : [ 41 , 39 , 47 , 51 , 43 , 40 , 57 , 46 , 50 , 59 , 61 , 52 ] ,
' x ' : [ 24 , 30 , 33 , 35 , 36 , 36 , 37 , 37 , 38 , 40 , 43 , 49 ]
} )
yli . pearsonr ( df , ' y ' , ' x ' )
. . code - block : : text
r ( 95 % CI ) = 0.65 ( 0.11 – 0.89 ) ; p = 0.02 *
The output states that the value of the Pearson correlation coefficient is 0.65 , with 95 % confidence interval 0.11 – 0.89 , and the test is significant with * p * value 0.02 .
2022-10-17 21:41:19 +11:00
"""
2022-10-14 20:18:25 +11:00
# Check for/clean NaNs
df = check_nan ( df [ [ ind , dep ] ] , nan_policy )
# Compute Pearson's r
result = stats . pearsonr ( df [ ind ] , df [ dep ] )
ci = result . confidence_interval ( )
return PearsonRResult ( statistic = Estimate ( result . statistic , ci . low , ci . high ) , pvalue = result . pvalue )
2022-12-03 20:00:29 +11:00
# --------------------
# Spearman correlation
class SpearmanResult :
2022-12-03 20:01:05 +11:00
"""
Result of Spearman rank correlation
See : func : ` yli . spearman ` .
"""
2022-12-03 20:00:29 +11:00
def __init__ ( self , statistic , pvalue ) :
2022-12-03 20:01:05 +11:00
#: Spearman *ρ* correlation statistic (:class:`yli.utils.Estimate`)
2022-12-03 20:00:29 +11:00
self . statistic = statistic
2022-12-03 20:01:05 +11:00
#: *p* value for the *ρ* statistic (*float*)
2022-12-03 20:00:29 +11:00
self . pvalue = pvalue
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
def _repr_html_ ( self ) :
return ' <i>ρ</i> ( {:g} % CI) = {} ; <i>p</i> {} ' . format ( ( 1 - config . alpha ) * 100 , self . statistic . summary ( ) , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) )
def summary ( self ) :
"""
Return a stringified summary of the Spearman correlation
: rtype : str
"""
return ' ρ ( {:g} % CI) = {} ; p {} ' . format ( ( 1 - config . alpha ) * 100 , self . statistic . summary ( ) , fmt_p ( self . pvalue , PValueStyle . RELATION ) )
def spearman ( df , dep , ind , * , nan_policy = ' warn ' ) :
2022-12-03 20:01:05 +11:00
"""
Compute the Spearman rank correlation coefficient ( Spearman ' s *ρ*)
The confidence interval for * ρ * is computed analogously to SciPy ' s *pearsonr*, using the Fisher transformation and normal approximation, without adjustment to variance.
: param df : Data to perform the test on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numerical )
: type dep : str
: param ind : Column in * df * for the independent variable ( numerical )
: type ind : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: rtype : : class : ` yli . sig_tests . SpearmanResult `
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' Profit ' : [ 2.5 , 6.2 , 3.1 , . . . ] ,
' Quality ' : [ 50 , 57 , 61 , . . . ]
} )
yli . spearman ( df , ' Profit ' , ' Quality ' )
. . code - block : : text
ρ ( 95 % CI ) = 0.87 ( 0.60 – 0.96 ) ; p < 0.001 *
The output states that the value of the Spearman correlation coefficient is 0.87 , with 95 % confidence interval 0.60 – 0.96 , and the test is significant with * p * value < 0.001 .
"""
2022-12-03 20:00:29 +11:00
# Check for/clean NaNs
df = check_nan ( df [ [ ind , dep ] ] , nan_policy )
# Ensure numeric, factorising categorical variables as required
ind , _ = as_numeric ( df [ ind ] )
dep , _ = as_numeric ( df [ dep ] )
# Compute Spearman's rho
result = stats . spearmanr ( ind , dep )
# Compute confidence interval
ci = stats . _stats_py . _pearsonr_fisher_ci ( result . correlation , len ( dep ) , 1 - config . alpha , ' two-sided ' )
return SpearmanResult ( statistic = Estimate ( result . correlation , ci . low , ci . high ) , pvalue = result . pvalue )
2022-11-09 23:31:27 +11:00
# ----------------------------
# Automatic selection of tests
class AutoBinaryResult :
"""
Result of automatically computed univariable tests of association for a dichotomous dependent variable
See : func : ` yli . auto_univariable ` .
Results data stored within instances of this class is not intended to be directly accessed .
"""
def __init__ ( self , * , dep , group1 , group2 , result_data , result_labels ) :
#: Name of the dependent variable (*str*)
self . dep = dep
#: Name of the first group (*str*)
self . group1 = group1
#: Name of the second group (*str*)
self . group2 = group2
# List of tuples (first group summary, second group summary, test result)
self . _result_data = result_data
2022-11-10 21:20:06 +11:00
# List of tuples (plaintext label, HTML label)
2022-11-09 23:31:27 +11:00
self . _result_labels = result_labels
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
def _repr_html_ ( self ) :
result = ' <table><thead><tr><th> {} </th><th> {} </th><th> {} </th><th></th><th style= " text-align:center " ><i>p</i></th></tr></thead><tbody> ' . format ( self . dep , self . group1 , self . group2 )
for data , label in zip ( self . _result_data , self . _result_labels ) :
result + = ' <tr><th> {} </th><td> {} </td><td> {} </td><td style= " text-align:left " > {} </td><td style= " text-align:left " > {} </td></tr> ' . format ( label [ 1 ] , data [ 0 ] , data [ 1 ] , data [ 2 ] . summary_short ( True ) , fmt_p ( data [ 2 ] . pvalue , PValueStyle . TABULAR | PValueStyle . HTML ) )
result + = ' </tbody></table> '
return result
def summary ( self ) :
"""
Return a stringified summary of the tests of association
: rtype : str
"""
# Format data for output
result_data_fmt = [
[ r [ 0 ] , r [ 1 ] , r [ 2 ] . summary_short ( False ) , fmt_p ( r [ 2 ] . pvalue , PValueStyle . TABULAR ) ]
for r in self . _result_data
]
result_labels_fmt = [ r [ 0 ] for r in self . _result_labels ]
table = pd . DataFrame ( result_data_fmt , index = result_labels_fmt , columns = pd . Index ( [ self . group1 , self . group2 , ' ' , ' p ' ] , name = self . dep ) )
return str ( table )
2023-02-07 18:49:57 +11:00
def auto_univariable ( df , dep , inds , * , nan_policy = ' warn ' ) :
2022-11-09 23:31:27 +11:00
"""
Automatically compute univariable tests of association for a dichotomous dependent variable
The tests performed are :
* For a dichotomous independent variable – : func : ` yli . chi2 `
* For a continuous independent variable – : func : ` yli . ttest_ind `
* For an ordinal independent variable – : func : ` yli . mannwhitney `
2022-11-10 18:47:30 +11:00
If * nan_policy * is * warn * or * omit * , rows with * nan * values are omitted only from the individual tests of association for the missing variables .
2022-11-09 23:31:27 +11:00
: param df : Data to perform the test on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( dichotomous )
: type dep : str
: param inds : Columns in * df * for the independent variables
: type inds : List [ str ]
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: rtype : : class : ` yli . sig_tests . AutoBinaryResult `
"""
2022-11-10 18:47:30 +11:00
# Check for/clean NaNs in dependent variable
df = check_nan ( df [ inds + [ dep ] ] , nan_policy , cols = [ dep ] )
2022-11-09 23:31:27 +11:00
# Ensure 2 groups for dep
# TODO: Work for non-binary dependent variables?
group1 , data1 , group2 , data2 = as_2groups ( df , inds , dep )
result_data = [ ]
result_labels = [ ]
for ind in inds :
2022-11-10 18:47:30 +11:00
# Check for/clean NaNs in independent variable
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df_cleaned = check_nan ( df , nan_policy , cols = [ ind ] )
2023-02-07 18:49:57 +11:00
if df_cleaned [ ind ] . dtype == ' category ' and df_cleaned [ ind ] . cat . ordered and df_cleaned [ ind ] . cat . categories . dtype in ( ' float64 ' , ' int64 ' , ' Float64 ' , ' Int64 ' ) :
# Ordinal numeric data
# Mann-Whitney test
result = mannwhitney ( df_cleaned , ind , dep , nan_policy = ' raise ' )
result_labels . append ( (
' {} , median (IQR) ' . format ( ind ) ,
' {} , median (IQR) ' . format ( ind ) ,
) )
result_data . append ( (
' {:.2f} ( {} ) ' . format ( result . med1 , result . iqr1 . summary ( ) ) ,
' {:.2f} ( {} ) ' . format ( result . med2 , result . iqr2 . summary ( ) ) ,
result
) )
elif df_cleaned [ ind ] . dtype in ( ' bool ' , ' boolean ' , ' category ' , ' object ' ) :
# Categorical data
2022-11-09 23:31:27 +11:00
# Pearson chi-squared test
2022-11-10 18:47:30 +11:00
result = chi2 ( df_cleaned , dep , ind , nan_policy = ' raise ' )
2022-11-09 23:31:27 +11:00
2022-11-10 18:47:30 +11:00
values = sorted ( df_cleaned [ ind ] . unique ( ) )
2022-11-09 23:31:27 +11:00
# Value counts
result_labels . append ( (
' {} , {} ' . format ( ind , ' : ' . join ( str ( v ) for v in values ) ) ,
' {} , {} ' . format ( ind , ' : ' . join ( str ( v ) for v in values ) ) ,
) )
result_data . append ( (
' : ' . join ( str ( ( data1 [ ind ] == v ) . sum ( ) ) for v in values ) ,
' : ' . join ( str ( ( data2 [ ind ] == v ) . sum ( ) ) for v in values ) ,
result
) )
2022-11-10 18:47:30 +11:00
elif df_cleaned [ ind ] . dtype in ( ' float64 ' , ' int64 ' , ' Float64 ' , ' Int64 ' ) :
2023-02-07 18:49:57 +11:00
# Continuous data
# t test
result = ttest_ind ( df_cleaned , ind , dep , nan_policy = ' raise ' )
result_labels . append ( (
' {} , μ (SD) ' . format ( ind ) ,
' {} , <i>μ</i> (SD) ' . format ( ind ) ,
) )
result_data . append ( (
' {:.2f} ( {:.2f} ) ' . format ( result . mu1 , result . sd1 ) ,
' {:.2f} ( {:.2f} ) ' . format ( result . mu2 , result . sd2 ) ,
result
) )
2022-11-09 23:31:27 +11:00
else :
raise Exception ( ' Unsupported independent dtype for auto_univariable, {} ' . format ( df [ ind ] . dtype ) )
return AutoBinaryResult ( dep = dep , group1 = group1 , group2 = group2 , result_data = result_data , result_labels = result_labels )