From d5ce46f6d09c8e36337768a55f02a004e1bc934f Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Tue, 18 Oct 2022 19:25:12 +1100 Subject: [PATCH] Add example code/output to documentation --- yli/regress.py | 92 ++++++++++++++++++++++++++++++++++++++++++++- yli/sig_tests.py | 97 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 178 insertions(+), 11 deletions(-) diff --git a/yli/regress.py b/yli/regress.py index 1e89ccd..b7af386 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -24,6 +24,7 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor from datetime import datetime import itertools +import warnings from .bayes_factors import BayesFactor, bayesfactor_afbf from .config import config @@ -39,8 +40,30 @@ def vif(df, formula=None, *, nan_policy='warn'): :param formula: If specified, calculate the VIF only for the variables in the formula :type formula: str - :return: The variance inflation factor - :rtype: float + :return: The variance inflation factors + :rtype: Series + + **Example:** + + .. code-block:: + + df = pd.DataFrame({ + 'D': [68.58, 67.33, 67.33, ...], + 'T1': [14, 10, 10, ...], + 'T2': [46, 73, 85, ...], + ... + }) + yli.vif(df[['D', 'T1', 'T2', ...]]) + + .. code-block:: text + + D 8.318301 + T1 6.081590 + T2 2.457122 + ... + dtype: float64 + + The output shows the variance inflation factor for each variable in *df*. """ if formula: @@ -444,6 +467,40 @@ def regress( :type exp: bool :rtype: :class:`yli.regress.RegressionResult` + + **Example:** + + .. code-block:: + + df = pd.DataFrame({ + 'Unhealthy': [False, False, False, ...], + 'Fibrinogen': [2.52, 2.46, 2.29, ...], + 'GammaGlobulin': [38, 36, 36, ...] + }) + yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') + + .. code-block:: text + + Logistic Regression Results + ====================================================== + Dep. Variable: Unhealthy | No. Observations: 32 + Model: Logit | Df. Model: 2 + Method: MLE | Df. 
Residuals: 29 + Date: 2022-10-18 | Pseudo R²: 0.26 + Time: 19:00:34 | LL-Model: -11.47 + Std. Errors: Non-Robust | LL-Null: -15.44 + | p (LR): 0.02* + ====================================================== + exp(β) (95% CI) p + ----------------------------------------------- + (Intercept) 0.00 (0.00 - 0.24) 0.03* + Fibrinogen 6.80 (1.01 - 45.79) 0.049* + GammaGlobulin 1.17 (0.92 - 1.48) 0.19 + ----------------------------------------------- + + The output summarises the results of the regression. + Note that the parameter estimates are automatically exponentiated. + For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049. """ # Populate model_kwargs @@ -603,6 +660,37 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel): Uses the R *logistf* library. This class should be used in conjunction with :func:`yli.regress`. + + **Example:** + + .. code-block:: + + df = pd.DataFrame({ + 'Pred': [1] * 20 + [0] * 220, + 'Outcome': [1] * 40 + [0] * 200 + }) + yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False) + + .. code-block:: text + + Penalised Logistic Regression Results + ========================================================= + Dep. Variable: Outcome | No. Observations: 240 + Model: Logit | Df. Model: 1 + Method: Penalised ML | Pseudo R²: 0.37 + Date: 2022-10-19 | LL-Model: -66.43 + Time: 07:50:40 | LL-Null: -105.91 + Std. Errors: Non-Robust | p (LR): <0.001* + ========================================================= + β (95% CI) p + --------------------------------------------- + (Intercept) -2.28 (-2.77 - -1.85) <0.001* + Pred 5.99 (3.95 - 10.85) <0.001* + --------------------------------------------- + + The output summarises the result of the regression. + The summary table shows that a penalised method was applied. + Note that because `exp=False` was passed, the parameter estimates are not automatically exponentiated. 
""" def fit(self, disp=False): diff --git a/yli/sig_tests.py b/yli/sig_tests.py index f49e473..203ca27 100644 --- a/yli/sig_tests.py +++ b/yli/sig_tests.py @@ -78,6 +78,24 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'): :type nan_policy: str :rtype: :class:`yli.sig_tests.TTestResult` + + **Example:** + + .. code-block:: + + df = pd.DataFrame({ + 'Type': ['Fresh'] * 10 + ['Stored'] * 10, + 'Potency': [10.2, 10.5, 10.3, ...] + }) + yli.ttest_ind(df, 'Potency', 'Type') + + .. code-block:: text + + t(18) = 4.24; p < 0.001* + Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored + + The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001. + The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81. """ # Check for/clean NaNs @@ -129,7 +147,7 @@ class FTestResult: return super().__repr__() def _repr_html_(self): - return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True)) + return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True)) def summary(self): """ @@ -138,7 +156,7 @@ class FTestResult: :rtype: str """ - return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False)) + return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False)) def anova_oneway(df, dep, ind, *, nan_policy='omit'): """ @@ -154,6 +172,22 @@ def anova_oneway(df, dep, ind, *, nan_policy='omit'): :type nan_policy: str :rtype: :class:`yli.sig_tests.FTestResult` + + **Example:** + + .. code-block:: + + df = pd.DataFrame({ + 'Method': [1]*8 + [2]*7 + [3]*9, + 'Score': [96, 79, 91, ...] + }) + yli.anova_oneway(df, 'Score', 'Method') + + .. 
code-block:: text + + F(2, 21) = 29.57; p < 0.001* + + The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001. """ # Check for/clean NaNs @@ -182,7 +216,7 @@ class MannWhitneyResult: """ def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None): - #: Mann–Whitney *U* statistic (*float*) + #: Lesser of the two Mann–Whitney *U* statistics (*float*) self.statistic = statistic #: *p* value for the *U* statistic (*float*) self.pvalue = pvalue @@ -212,7 +246,7 @@ class MannWhitneyResult: :rtype: str """ - line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction) + line1 = 'U = {:.1f}; p {}\nr = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '\n' + self.brunnermunzel.summary() else: @@ -273,6 +307,24 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont :param method: See *scipy.stats.mannwhitneyu* :rtype: :class:`yli.sig_tests.MannWhitneyResult` + + **Example:** + + .. code-block:: + + df = pd.DataFrame({ + 'Sample': ['Before'] * 12 + ['After'] * 12, + 'Oxygen': [11.0, 11.2, 11.2, ...] + }) + yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less') + + .. code-block:: text + + U = 6.0; p < 0.001* + r = 0.92, Before > After + + The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001. + The rank-biserial correlation is 0.92 in favour of the *Before* group. 
""" # Check for/clean NaNs @@ -352,10 +404,10 @@ class PearsonChiSquaredResult: """ if self.oddsratio is not None: - return '{0}\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format( + return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) else: - return '{}\nχ²({}) = {:.2f}; p {}'.format( + return '{}\n\nχ²({}) = {:.2f}; p {}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False)) def chi2(df, dep, ind, *, nan_policy='warn'): @@ -363,8 +415,8 @@ def chi2(df, dep, ind, *, nan_policy='warn'): Perform a Pearson *χ*:sup:`2` test If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated. - The ratios are calculated for the higher-valued value in each variable (i.e. ``True`` compared with ``False`` for a boolean). - The risk ratio is calculated relative to the independent variable. + The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean). + The risk ratio is calculated relative to the independent variable (rows of the contingency table). :param df: Data to perform the test on :type df: DataFrame @@ -376,6 +428,33 @@ def chi2(df, dep, ind, *, nan_policy='warn'): :type nan_policy: str :rtype: :class:`yli.sig_tests.PearsonChiSquaredResult` + + **Example:** + + .. code-block:: + + df = pd.DataFrame({ + 'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]), + 'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600]) + }) + yli.chi2(df, 'Stress', 'Response') + + .. 
code-block:: text
+	
+		Stress    False  True 
+		Response               
+		False       250   400
+		True        750  1600
+	
+		χ²(1) = 9.82; p = 0.002*
+		OR (95% CI) = 1.33 (1.11–1.60)
+		RR (95% CI) = 1.11 (1.03–1.18)
+	
+	The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002.
+	
+	The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60.
+	
+	The risk of *Stress* in the *Response* = *True* group is 1.11 times that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18.
 	"""
 	
 	# Check for/clean NaNs
@@ -448,7 +527,7 @@ class PearsonRResult:
 
 def pearsonr(df, dep, ind, *, nan_policy='warn'):
 	"""
-	Compute the Pearson correlation coefficient (Pearson's r)
+	Compute the Pearson correlation coefficient (Pearson's *r*)
 	
 	:param df: Data to perform the test on
 	:type df: DataFrame