diff --git a/yli/regress.py b/yli/regress.py
index 1e89ccd..b7af386 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -24,6 +24,7 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
from datetime import datetime
import itertools
+import warnings
from .bayes_factors import BayesFactor, bayesfactor_afbf
from .config import config
@@ -39,8 +40,30 @@ def vif(df, formula=None, *, nan_policy='warn'):
:param formula: If specified, calculate the VIF only for the variables in the formula
:type formula: str
- :return: The variance inflation factor
- :rtype: float
+ :return: The variance inflation factors
+ :rtype: Series
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame({
+ 'D': [68.58, 67.33, 67.33, ...],
+ 'T1': [14, 10, 10, ...],
+ 'T2': [46, 73, 85, ...],
+ ...
+ })
+ yli.vif(df[['D', 'T1', 'T2', ...]])
+
+ .. code-block:: text
+
+ D 8.318301
+ T1 6.081590
+ T2 2.457122
+ ...
+ dtype: float64
+
+ The output shows the variance inflation factor for each variable in *df*.
"""
if formula:
@@ -444,6 +467,40 @@ def regress(
:type exp: bool
:rtype: :class:`yli.regress.RegressionResult`
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame({
+ 'Unhealthy': [False, False, False, ...],
+ 'Fibrinogen': [2.52, 2.46, 2.29, ...],
+ 'GammaGlobulin': [38, 36, 36, ...]
+ })
+ yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
+
+ .. code-block:: text
+
+ Logistic Regression Results
+ ======================================================
+ Dep. Variable: Unhealthy | No. Observations: 32
+ Model: Logit | Df. Model: 2
+ Method: MLE | Df. Residuals: 29
+ Date: 2022-10-18 | Pseudo R²: 0.26
+ Time: 19:00:34 | LL-Model: -11.47
+ Std. Errors: Non-Robust | LL-Null: -15.44
+ | p (LR): 0.02*
+ ======================================================
+ exp(β) (95% CI) p
+ -----------------------------------------------
+ (Intercept) 0.00 (0.00 - 0.24) 0.03*
+ Fibrinogen 6.80 (1.01 - 45.79) 0.049*
+ GammaGlobulin 1.17 (0.92 - 1.48) 0.19
+ -----------------------------------------------
+
+ The output summarises the results of the regression.
+ Note that the parameter estimates are automatically exponentiated.
+ For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049.
"""
# Populate model_kwargs
@@ -603,6 +660,37 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
Uses the R *logistf* library.
This class should be used in conjunction with :func:`yli.regress`.
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame({
+ 'Pred': [1] * 20 + [0] * 220,
+ 'Outcome': [1] * 40 + [0] * 200
+ })
+ yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
+
+ .. code-block:: text
+
+ Penalised Logistic Regression Results
+ =========================================================
+ Dep. Variable: Outcome | No. Observations: 240
+ Model: Logit | Df. Model: 1
+ Method: Penalised ML | Pseudo R²: 0.37
+ Date: 2022-10-19 | LL-Model: -66.43
+ Time: 07:50:40 | LL-Null: -105.91
+ Std. Errors: Non-Robust | p (LR): <0.001*
+ =========================================================
+ β (95% CI) p
+ ---------------------------------------------
+ (Intercept) -2.28 (-2.77 - -1.85) <0.001*
+ Pred 5.99 (3.95 - 10.85) <0.001*
+ ---------------------------------------------
+
+ The output summarises the result of the regression.
+ The summary table shows that a penalised method was applied.
+	Note that because ``exp=False`` was passed, the parameter estimates are not automatically exponentiated.
"""
def fit(self, disp=False):
diff --git a/yli/sig_tests.py b/yli/sig_tests.py
index f49e473..203ca27 100644
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@@ -78,6 +78,24 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'):
:type nan_policy: str
:rtype: :class:`yli.sig_tests.TTestResult`
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame({
+ 'Type': ['Fresh'] * 10 + ['Stored'] * 10,
+ 'Potency': [10.2, 10.5, 10.3, ...]
+ })
+ yli.ttest_ind(df, 'Potency', 'Type')
+
+ .. code-block:: text
+
+ t(18) = 4.24; p < 0.001*
+ Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored
+
+ The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001.
+ The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81.
"""
# Check for/clean NaNs
@@ -129,7 +147,7 @@ class FTestResult:
return super().__repr__()
def _repr_html_(self):
- return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
+ return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
def summary(self):
"""
@@ -138,7 +156,7 @@ class FTestResult:
:rtype: str
"""
- return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
+ return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
def anova_oneway(df, dep, ind, *, nan_policy='omit'):
"""
@@ -154,6 +172,22 @@ def anova_oneway(df, dep, ind, *, nan_policy='omit'):
:type nan_policy: str
:rtype: :class:`yli.sig_tests.FTestResult`
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame({
+ 'Method': [1]*8 + [2]*7 + [3]*9,
+ 'Score': [96, 79, 91, ...]
+ })
+ yli.anova_oneway(df, 'Score', 'Method')
+
+ .. code-block:: text
+
+ F(2, 21) = 29.57; p < 0.001*
+
+ The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001.
"""
# Check for/clean NaNs
@@ -182,7 +216,7 @@ class MannWhitneyResult:
"""
def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):
- #: Mann–Whitney *U* statistic (*float*)
+ #: Lesser of the two Mann–Whitney *U* statistics (*float*)
self.statistic = statistic
#: *p* value for the *U* statistic (*float*)
self.pvalue = pvalue
@@ -212,7 +246,7 @@ class MannWhitneyResult:
:rtype: str
"""
- line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
+ line1 = 'U = {:.1f}; p {}\nr = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
if self.brunnermunzel:
return line1 + '\n' + self.brunnermunzel.summary()
else:
@@ -273,6 +307,24 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
:param method: See *scipy.stats.mannwhitneyu*
:rtype: :class:`yli.sig_tests.MannWhitneyResult`
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame({
+ 'Sample': ['Before'] * 12 + ['After'] * 12,
+ 'Oxygen': [11.0, 11.2, 11.2, ...]
+ })
+ yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less')
+
+ .. code-block:: text
+
+ U = 6.0; p < 0.001*
+ r = 0.92, Before > After
+
+ The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001.
+ The rank-biserial correlation is 0.92 in favour of the *Before* group.
"""
# Check for/clean NaNs
@@ -352,10 +404,10 @@ class PearsonChiSquaredResult:
"""
if self.oddsratio is not None:
- return '{0}\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
+ return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
else:
- return '{}\nχ²({}) = {:.2f}; p {}'.format(
+ return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False))
def chi2(df, dep, ind, *, nan_policy='warn'):
@@ -363,8 +415,8 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
Perform a Pearson *χ*:sup:`2` test
If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
- The ratios are calculated for the higher-valued value in each variable (i.e. ``True`` compared with ``False`` for a boolean).
- The risk ratio is calculated relative to the independent variable.
+ The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean).
+ The risk ratio is calculated relative to the independent variable (rows of the contingency table).
:param df: Data to perform the test on
:type df: DataFrame
@@ -376,6 +428,33 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
:type nan_policy: str
:rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame({
+ 'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]),
+ 'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600])
+ })
+ yli.chi2(df, 'Stress', 'Response')
+
+ .. code-block:: text
+
+ Stress False True
+ Response
+ False 250 400
+ True 750 1600
+
+ χ²(1) = 9.82; p = 0.002*
+ OR (95% CI) = 1.33 (1.11–1.60)
+ RR (95% CI) = 1.11 (1.03–1.18)
+
+ The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002.
+
+ The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60.
+
+	The risk of *Stress* in the *Response* = *True* group is 1.11 times that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18.
"""
# Check for/clean NaNs
@@ -448,7 +527,7 @@ class PearsonRResult:
def pearsonr(df, dep, ind, *, nan_policy='warn'):
"""
- Compute the Pearson correlation coefficient (Pearson's r)
+ Compute the Pearson correlation coefficient (Pearson's *r*)
:param df: Data to perform the test on
:type df: DataFrame