Add example code/output to documentation
This commit is contained in:
parent
62c23efebc
commit
d5ce46f6d0
@ -24,6 +24,7 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import itertools
|
import itertools
|
||||||
|
import warnings
|
||||||
|
|
||||||
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
||||||
from .config import config
|
from .config import config
|
||||||
@ -39,8 +40,30 @@ def vif(df, formula=None, *, nan_policy='warn'):
|
|||||||
:param formula: If specified, calculate the VIF only for the variables in the formula
|
:param formula: If specified, calculate the VIF only for the variables in the formula
|
||||||
:type formula: str
|
:type formula: str
|
||||||
|
|
||||||
:return: The variance inflation factor
|
:return: The variance inflation factors
|
||||||
:rtype: float
|
:rtype: Series
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
df = pd.DataFrame({
|
||||||
|
'D': [68.58, 67.33, 67.33, ...],
|
||||||
|
'T1': [14, 10, 10, ...],
|
||||||
|
'T2': [46, 73, 85, ...],
|
||||||
|
...
|
||||||
|
})
|
||||||
|
yli.vif(df[['D', 'T1', 'T2', ...]])
|
||||||
|
|
||||||
|
.. code-block:: text
|
||||||
|
|
||||||
|
D 8.318301
|
||||||
|
T1 6.081590
|
||||||
|
T2 2.457122
|
||||||
|
...
|
||||||
|
dtype: float64
|
||||||
|
|
||||||
|
The output shows the variance inflation factor for each variable in *df*.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if formula:
|
if formula:
|
||||||
@ -444,6 +467,40 @@ def regress(
|
|||||||
:type exp: bool
|
:type exp: bool
|
||||||
|
|
||||||
:rtype: :class:`yli.regress.RegressionResult`
|
:rtype: :class:`yli.regress.RegressionResult`
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
df = pd.DataFrame({
|
||||||
|
'Unhealthy': [False, False, False, ...],
|
||||||
|
'Fibrinogen': [2.52, 2.46, 2.29, ...],
|
||||||
|
'GammaGlobulin': [38, 36, 36, ...]
|
||||||
|
})
|
||||||
|
yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
|
||||||
|
|
||||||
|
.. code-block:: text
|
||||||
|
|
||||||
|
Logistic Regression Results
|
||||||
|
======================================================
|
||||||
|
Dep. Variable: Unhealthy | No. Observations: 32
|
||||||
|
Model: Logit | Df. Model: 2
|
||||||
|
Method: MLE | Df. Residuals: 29
|
||||||
|
Date: 2022-10-18 | Pseudo R²: 0.26
|
||||||
|
Time: 19:00:34 | LL-Model: -11.47
|
||||||
|
Std. Errors: Non-Robust | LL-Null: -15.44
|
||||||
|
| p (LR): 0.02*
|
||||||
|
======================================================
|
||||||
|
exp(β) (95% CI) p
|
||||||
|
-----------------------------------------------
|
||||||
|
(Intercept) 0.00 (0.00 - 0.24) 0.03*
|
||||||
|
Fibrinogen 6.80 (1.01 - 45.79) 0.049*
|
||||||
|
GammaGlobulin 1.17 (0.92 - 1.48) 0.19
|
||||||
|
-----------------------------------------------
|
||||||
|
|
||||||
|
The output summarises the results of the regression.
|
||||||
|
Note that the parameter estimates are automatically exponentiated.
|
||||||
|
For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Populate model_kwargs
|
# Populate model_kwargs
|
||||||
@ -603,6 +660,37 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
|
|||||||
Uses the R *logistf* library.
|
Uses the R *logistf* library.
|
||||||
|
|
||||||
This class should be used in conjunction with :func:`yli.regress`.
|
This class should be used in conjunction with :func:`yli.regress`.
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
df = pd.DataFrame({
|
||||||
|
'Pred': [1] * 20 + [0] * 220,
|
||||||
|
'Outcome': [1] * 40 + [0] * 200
|
||||||
|
})
|
||||||
|
yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
|
||||||
|
|
||||||
|
.. code-block:: text
|
||||||
|
|
||||||
|
Penalised Logistic Regression Results
|
||||||
|
=========================================================
|
||||||
|
Dep. Variable: Outcome | No. Observations: 240
|
||||||
|
Model: Logit | Df. Model: 1
|
||||||
|
Method: Penalised ML | Pseudo R²: 0.37
|
||||||
|
Date: 2022-10-19 | LL-Model: -66.43
|
||||||
|
Time: 07:50:40 | LL-Null: -105.91
|
||||||
|
Std. Errors: Non-Robust | p (LR): <0.001*
|
||||||
|
=========================================================
|
||||||
|
β (95% CI) p
|
||||||
|
---------------------------------------------
|
||||||
|
(Intercept) -2.28 (-2.77 - -1.85) <0.001*
|
||||||
|
Pred 5.99 (3.95 - 10.85) <0.001*
|
||||||
|
---------------------------------------------
|
||||||
|
|
||||||
|
The output summarises the result of the regression.
|
||||||
|
The summary table shows that a penalised method was applied.
|
||||||
|
Note that because `exp=False` was passed, the parameter estimates are not automatically exponentiated.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def fit(self, disp=False):
|
def fit(self, disp=False):
|
||||||
|
@ -78,6 +78,24 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'):
|
|||||||
:type nan_policy: str
|
:type nan_policy: str
|
||||||
|
|
||||||
:rtype: :class:`yli.sig_tests.TTestResult`
|
:rtype: :class:`yli.sig_tests.TTestResult`
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
df = pd.DataFrame({
|
||||||
|
'Type': ['Fresh'] * 10 + ['Stored'] * 10,
|
||||||
|
'Potency': [10.2, 10.5, 10.3, ...]
|
||||||
|
})
|
||||||
|
yli.ttest_ind(df, 'Potency', 'Type')
|
||||||
|
|
||||||
|
.. code-block:: text
|
||||||
|
|
||||||
|
t(18) = 4.24; p < 0.001*
|
||||||
|
Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored
|
||||||
|
|
||||||
|
The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001.
|
||||||
|
The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Check for/clean NaNs
|
# Check for/clean NaNs
|
||||||
@ -129,7 +147,7 @@ class FTestResult:
|
|||||||
return super().__repr__()
|
return super().__repr__()
|
||||||
|
|
||||||
def _repr_html_(self):
|
def _repr_html_(self):
|
||||||
return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
|
return '<i>F</i>({:.0f}, {:.0f}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
|
||||||
|
|
||||||
def summary(self):
|
def summary(self):
|
||||||
"""
|
"""
|
||||||
@ -138,7 +156,7 @@ class FTestResult:
|
|||||||
:rtype: str
|
:rtype: str
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
|
return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
|
||||||
|
|
||||||
def anova_oneway(df, dep, ind, *, nan_policy='omit'):
|
def anova_oneway(df, dep, ind, *, nan_policy='omit'):
|
||||||
"""
|
"""
|
||||||
@ -154,6 +172,22 @@ def anova_oneway(df, dep, ind, *, nan_policy='omit'):
|
|||||||
:type nan_policy: str
|
:type nan_policy: str
|
||||||
|
|
||||||
:rtype: :class:`yli.sig_tests.FTestResult`
|
:rtype: :class:`yli.sig_tests.FTestResult`
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
df = pd.DataFrame({
|
||||||
|
'Method': [1]*8 + [2]*7 + [3]*9,
|
||||||
|
'Score': [96, 79, 91, ...]
|
||||||
|
})
|
||||||
|
yli.anova_oneway(df, 'Score', 'Method')
|
||||||
|
|
||||||
|
.. code-block:: text
|
||||||
|
|
||||||
|
F(2, 21) = 29.57; p < 0.001*
|
||||||
|
|
||||||
|
The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Check for/clean NaNs
|
# Check for/clean NaNs
|
||||||
@ -182,7 +216,7 @@ class MannWhitneyResult:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):
|
def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):
|
||||||
#: Mann–Whitney *U* statistic (*float*)
|
#: Lesser of the two Mann–Whitney *U* statistics (*float*)
|
||||||
self.statistic = statistic
|
self.statistic = statistic
|
||||||
#: *p* value for the *U* statistic (*float*)
|
#: *p* value for the *U* statistic (*float*)
|
||||||
self.pvalue = pvalue
|
self.pvalue = pvalue
|
||||||
@ -212,7 +246,7 @@ class MannWhitneyResult:
|
|||||||
:rtype: str
|
:rtype: str
|
||||||
"""
|
"""
|
||||||
|
|
||||||
line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
|
line1 = 'U = {:.1f}; p {}\nr = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
|
||||||
if self.brunnermunzel:
|
if self.brunnermunzel:
|
||||||
return line1 + '\n' + self.brunnermunzel.summary()
|
return line1 + '\n' + self.brunnermunzel.summary()
|
||||||
else:
|
else:
|
||||||
@ -273,6 +307,24 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
|
|||||||
:param method: See *scipy.stats.mannwhitneyu*
|
:param method: See *scipy.stats.mannwhitneyu*
|
||||||
|
|
||||||
:rtype: :class:`yli.sig_tests.MannWhitneyResult`
|
:rtype: :class:`yli.sig_tests.MannWhitneyResult`
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
df = pd.DataFrame({
|
||||||
|
'Sample': ['Before'] * 12 + ['After'] * 12,
|
||||||
|
'Oxygen': [11.0, 11.2, 11.2, ...]
|
||||||
|
})
|
||||||
|
yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less')
|
||||||
|
|
||||||
|
.. code-block:: text
|
||||||
|
|
||||||
|
U = 6.0; p < 0.001*
|
||||||
|
r = 0.92, Before > After
|
||||||
|
|
||||||
|
The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001.
|
||||||
|
The rank-biserial correlation is 0.92 in favour of the *Before* group.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Check for/clean NaNs
|
# Check for/clean NaNs
|
||||||
@ -352,10 +404,10 @@ class PearsonChiSquaredResult:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
if self.oddsratio is not None:
|
if self.oddsratio is not None:
|
||||||
return '{0}\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
|
return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
|
||||||
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
|
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
|
||||||
else:
|
else:
|
||||||
return '{}\nχ²({}) = {:.2f}; p {}'.format(
|
return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
|
||||||
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False))
|
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False))
|
||||||
|
|
||||||
def chi2(df, dep, ind, *, nan_policy='warn'):
|
def chi2(df, dep, ind, *, nan_policy='warn'):
|
||||||
@ -363,8 +415,8 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
|
|||||||
Perform a Pearson *χ*:sup:`2` test
|
Perform a Pearson *χ*:sup:`2` test
|
||||||
|
|
||||||
If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
|
If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
|
||||||
The ratios are calculated for the higher-valued value in each variable (i.e. ``True`` compared with ``False`` for a boolean).
|
The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean).
|
||||||
The risk ratio is calculated relative to the independent variable.
|
The risk ratio is calculated relative to the independent variable (rows of the contingency table).
|
||||||
|
|
||||||
:param df: Data to perform the test on
|
:param df: Data to perform the test on
|
||||||
:type df: DataFrame
|
:type df: DataFrame
|
||||||
@ -376,6 +428,33 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
|
|||||||
:type nan_policy: str
|
:type nan_policy: str
|
||||||
|
|
||||||
:rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
|
:rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
df = pd.DataFrame({
|
||||||
|
'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]),
|
||||||
|
'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600])
|
||||||
|
})
|
||||||
|
yli.chi2(df, 'Stress', 'Response')
|
||||||
|
|
||||||
|
.. code-block:: text
|
||||||
|
|
||||||
|
Stress False True
|
||||||
|
Response
|
||||||
|
False 250 400
|
||||||
|
True 750 1600
|
||||||
|
|
||||||
|
χ²(1) = 9.82; p = 0.002*
|
||||||
|
OR (95% CI) = 1.33 (1.11–1.60)
|
||||||
|
RR (95% CI) = 1.11 (1.03–1.18)
|
||||||
|
|
||||||
|
The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002.
|
||||||
|
|
||||||
|
The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60.
|
||||||
|
|
||||||
|
The risk of *Stress* in the *Response* = *True* group is 1.11 that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Check for/clean NaNs
|
# Check for/clean NaNs
|
||||||
@ -448,7 +527,7 @@ class PearsonRResult:
|
|||||||
|
|
||||||
def pearsonr(df, dep, ind, *, nan_policy='warn'):
|
def pearsonr(df, dep, ind, *, nan_policy='warn'):
|
||||||
"""
|
"""
|
||||||
Compute the Pearson correlation coefficient (Pearson's r)
|
Compute the Pearson correlation coefficient (Pearson's *r*)
|
||||||
|
|
||||||
:param df: Data to perform the test on
|
:param df: Data to perform the test on
|
||||||
:type df: DataFrame
|
:type df: DataFrame
|
||||||
|
Loading…
Reference in New Issue
Block a user