Add example code/output to documentation

RunasSudo 2022-10-18 19:25:12 +11:00
parent 62c23efebc
commit d5ce46f6d0
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
2 changed files with 178 additions and 11 deletions

View File

@@ -24,6 +24,7 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
from datetime import datetime
import itertools
import warnings
from .bayes_factors import BayesFactor, bayesfactor_afbf
from .config import config
@@ -39,8 +40,30 @@ def vif(df, formula=None, *, nan_policy='warn'):
:param formula: If specified, calculate the VIF only for the variables in the formula
:type formula: str
:return: The variance inflation factors
:rtype: Series
**Example:**
.. code-block::
df = pd.DataFrame({
'D': [68.58, 67.33, 67.33, ...],
'T1': [14, 10, 10, ...],
'T2': [46, 73, 85, ...],
...
})
yli.vif(df[['D', 'T1', 'T2', ...]])
.. code-block:: text
D 8.318301
T1 6.081590
T2 2.457122
...
dtype: float64
The output shows the variance inflation factor for each variable in *df*.
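As a rough cross-check (not necessarily the exact computation used internally), similar figures could be obtained from the *variance_inflation_factor* helper that this module imports from statsmodels; this sketch assumes *df* is fully populated with numeric columns, and the handling of the intercept here is an assumption:
.. code-block::
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Add an intercept column, then compute the VIF of each remaining column
X = sm.add_constant(df[['D', 'T1', 'T2']])
pd.Series([variance_inflation_factor(X.values, i) for i in range(1, X.shape[1])], index=X.columns[1:])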
""" """
if formula: if formula:
@@ -444,6 +467,40 @@ def regress(
:type exp: bool
:rtype: :class:`yli.regress.RegressionResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Unhealthy': [False, False, False, ...],
'Fibrinogen': [2.52, 2.46, 2.29, ...],
'GammaGlobulin': [38, 36, 36, ...]
})
yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
.. code-block:: text
Logistic Regression Results
======================================================
Dep. Variable: Unhealthy | No. Observations: 32
Model: Logit | Df. Model: 2
Method: MLE | Df. Residuals: 29
Date: 2022-10-18 | Pseudo R²: 0.26
Time: 19:00:34 | LL-Model: -11.47
Std. Errors: Non-Robust | LL-Null: -15.44
| p (LR): 0.02*
======================================================
exp(β) (95% CI) p
-----------------------------------------------
(Intercept) 0.00 (0.00 - 0.24) 0.03*
Fibrinogen 6.80 (1.01 - 45.79) 0.049*
GammaGlobulin 1.17 (0.92 - 1.48) 0.19
-----------------------------------------------
The output summarises the results of the regression.
Note that the parameter estimates are automatically exponentiated.
For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049.
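The exponentiation itself is simply *exp* applied to the estimates and confidence limits; a minimal sketch of the equivalent manual calculation with plain statsmodels (not the code path used by :func:`yli.regress`, and assuming *df* is fully populated) is:
.. code-block::
import numpy as np
import statsmodels.api as sm
# Fit the same logistic model, then move the estimates to the odds ratio scale
result = sm.Logit(df['Unhealthy'].astype(int), sm.add_constant(df[['Fibrinogen', 'GammaGlobulin']])).fit()
np.exp(result.params)      # point estimates, cf. the exp(β) column above
np.exp(result.conf_int())  # 95% confidence intervals on the odds ratio scale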
""" """
# Populate model_kwargs # Populate model_kwargs
@@ -603,6 +660,37 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
Uses the R *logistf* library.
This class should be used in conjunction with :func:`yli.regress`.
**Example:**
.. code-block::
df = pd.DataFrame({
'Pred': [1] * 20 + [0] * 220,
'Outcome': [1] * 40 + [0] * 200
})
yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
.. code-block:: text
Penalised Logistic Regression Results
=========================================================
Dep. Variable: Outcome | No. Observations: 240
Model: Logit | Df. Model: 1
Method: Penalised ML | Pseudo R²: 0.37
Date: 2022-10-19 | LL-Model: -66.43
Time: 07:50:40 | LL-Null: -105.91
Std. Errors: Non-Robust | p (LR): <0.001*
=========================================================
β (95% CI) p
---------------------------------------------
(Intercept) -2.28 (-2.77 - -1.85) <0.001*
Pred 5.99 (3.95 - 10.85) <0.001*
---------------------------------------------
The output summarises the result of the regression.
The summary table shows that a penalised method was applied.
Note that because ``exp=False`` was passed, the parameter estimates are not automatically exponentiated.
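For context, this data set is quasi-separated: every observation with *Pred* = 1 also has *Outcome* = 1, which is the situation penalised (Firth) regression is intended for. As a sketch of why the penalty matters, an ordinary maximum-likelihood fit of the same model with plain statsmodels has no finite estimate for *Pred* to converge to, so depending on the statsmodels version the fit below either fails to converge or reports a separation problem:
.. code-block::
import pandas as pd
import statsmodels.api as sm
df = pd.DataFrame({
'Pred': [1] * 20 + [0] * 220,
'Outcome': [1] * 40 + [0] * 200
})
# Every row with Pred == 1 has Outcome == 1 (quasi-complete separation),
# so ordinary ML logistic regression cannot settle on a finite coefficient for Pred
sm.Logit(df['Outcome'], sm.add_constant(df['Pred'])).fit()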
""" """
def fit(self, disp=False): def fit(self, disp=False):

View File

@@ -78,6 +78,24 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'):
:type nan_policy: str
:rtype: :class:`yli.sig_tests.TTestResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Type': ['Fresh'] * 10 + ['Stored'] * 10,
'Potency': [10.2, 10.5, 10.3, ...]
})
yli.ttest_ind(df, 'Potency', 'Type')
.. code-block:: text
t(18) = 4.24; p < 0.001*
Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored
The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001.
The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81.
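As a cross-check, the same statistic could be reproduced with *scipy.stats.ttest_ind* (a sketch only, assuming *df* is fully populated; equal variances are assumed, consistent with the 18 degrees of freedom shown above):
.. code-block::
from scipy import stats
fresh = df.loc[df['Type'] == 'Fresh', 'Potency']
stored = df.loc[df['Type'] == 'Stored', 'Potency']
# Student's t test with pooled (equal) variances
stats.ttest_ind(fresh, stored)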
""" """
# Check for/clean NaNs # Check for/clean NaNs
@@ -129,7 +147,7 @@ class FTestResult:
return super().__repr__()
def _repr_html_(self):
return '<i>F</i>({:.0f}, {:.0f}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
def summary(self):
"""
@@ -138,7 +156,7 @@ class FTestResult:
:rtype: str
"""
return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
def anova_oneway(df, dep, ind, *, nan_policy='omit'):
"""
@@ -154,6 +172,22 @@ def anova_oneway(df, dep, ind, *, nan_policy='omit'):
:type nan_policy: str
:rtype: :class:`yli.sig_tests.FTestResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Method': [1]*8 + [2]*7 + [3]*9,
'Score': [96, 79, 91, ...]
})
yli.anova_oneway(df, 'Score', 'Method')
.. code-block:: text
F(2, 21) = 29.57; p < 0.001*
The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001.
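As a cross-check, the same *F* statistic could be reproduced with *scipy.stats.f_oneway* (a sketch only, assuming *df* is fully populated):
.. code-block::
from scipy import stats
# One sample of scores per level of Method
groups = [group['Score'] for _, group in df.groupby('Method')]
stats.f_oneway(*groups)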
""" """
# Check for/clean NaNs # Check for/clean NaNs
@@ -182,7 +216,7 @@ class MannWhitneyResult:
"""
def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):
#: Lesser of the two Mann–Whitney *U* statistics (*float*)
self.statistic = statistic
#: *p* value for the *U* statistic (*float*)
self.pvalue = pvalue
@@ -212,7 +246,7 @@ class MannWhitneyResult:
:rtype: str
"""
line1 = 'U = {:.1f}; p {}\nr = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
if self.brunnermunzel:
return line1 + '\n' + self.brunnermunzel.summary()
else:
@@ -273,6 +307,24 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
:param method: See *scipy.stats.mannwhitneyu*
:rtype: :class:`yli.sig_tests.MannWhitneyResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Sample': ['Before'] * 12 + ['After'] * 12,
'Oxygen': [11.0, 11.2, 11.2, ...]
})
yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less')
.. code-block:: text
U = 6.0; p < 0.001*
r = 0.92, Before > After
The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001.
The rank-biserial correlation is 0.92 in favour of the *Before* group.
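For reference, the lesser *U* statistic and the rank-biserial correlation relate to the *scipy.stats.mannwhitneyu* output as follows; this is a sketch of one common formulation, not necessarily the exact internal computation, and assumes *df* is fully populated:
.. code-block::
from scipy import stats
before = df.loc[df['Sample'] == 'Before', 'Oxygen']
after = df.loc[df['Sample'] == 'After', 'Oxygen']
u1 = stats.mannwhitneyu(before, after).statistic  # U for the first sample
u = min(u1, len(before) * len(after) - u1)        # lesser of the two U statistics
r = 1 - 2 * u / (len(before) * len(after))        # rank-biserial correlation; 1 - 12/144 ≈ 0.92 here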
""" """
# Check for/clean NaNs # Check for/clean NaNs
@@ -352,10 +404,10 @@ class PearsonChiSquaredResult:
"""
if self.oddsratio is not None:
return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
else:
return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False))
def chi2(df, dep, ind, *, nan_policy='warn'):
@@ -363,8 +415,8 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
Perform a Pearson *χ*:sup:`2` test
If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
The ratios are calculated for the higher-valued level of each variable (i.e. *True* compared with *False* for a boolean).
The risk ratio is calculated relative to the independent variable (rows of the contingency table).
:param df: Data to perform the test on
:type df: DataFrame
@@ -376,6 +428,33 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
:type nan_policy: str
:rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]),
'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600])
})
yli.chi2(df, 'Stress', 'Response')
.. code-block:: text
Stress    False  True
Response
False       250   400
True        750  1600
χ²(1) = 9.82; p = 0.002*
OR (95% CI) = 1.33 (1.11–1.60)
RR (95% CI) = 1.11 (1.03–1.18)
The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002.
The odds of *Stress* in the *Response* = *True* group are 1.33 times those in the *Response* = *False* group, with 95% confidence interval 1.11–1.60.
The risk of *Stress* in the *Response* = *True* group is 1.11 times that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18.
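To make the direction of the ratios concrete, both can be reproduced by hand from the contingency table, taking the odds and risk of the higher-valued level *Stress* = *True* and comparing across the rows (the levels of *Response*). This is an illustrative sketch using the *df* constructed in this example:
.. code-block::
import pandas as pd
ct = pd.crosstab(df['Response'], df['Stress'])    # independent variable on the rows
a, b = ct.loc[True, True], ct.loc[True, False]    # Response = True row
c, d = ct.loc[False, True], ct.loc[False, False]  # Response = False row
odds_ratio = (a / b) / (c / d)              # (1600/750) / (400/250) ≈ 1.33
risk_ratio = (a / (a + b)) / (c / (c + d))  # (1600/2350) / (400/650) ≈ 1.11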
""" """
# Check for/clean NaNs # Check for/clean NaNs
@@ -448,7 +527,7 @@ class PearsonRResult:
def pearsonr(df, dep, ind, *, nan_policy='warn'):
"""
Compute the Pearson correlation coefficient (Pearson's *r*)
:param df: Data to perform the test on
:type df: DataFrame