Add example code/output to documentation

2022-10-18 19:25:12 +11:00 · 2022-10-18 19:25:12 +11:00 · d5ce46f6d0
commit d5ce46f6d0
parent 62c23efebc
2 changed files with 178 additions and 11 deletions
--- a/yli/regress.py
+++ b/yli/regress.py
@ -24,6 +24,7 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
 from datetime import datetime
 import itertools
 import warnings
 from .bayes_factors import BayesFactor, bayesfactor_afbf
 from .config import config
@ -39,8 +40,30 @@ def vif(df, formula=None, *, nan_policy='warn'):
 	:param formula: If specified, calculate the VIF only for the variables in the formula
 	:type formula: str
-	:return: The variance inflation factor
+	:return: The variance inflation factors
-	:rtype: float
+	:rtype: Series
 	**Example:**
 	.. code-block::
 		df = pd.DataFrame({
 			'D': [68.58, 67.33, 67.33, ...],
 			'T1': [14, 10, 10, ...],
 			'T2': [46, 73, 85, ...],
 			...
 		})
 		yli.vif(df[['D', 'T1', 'T2', ...]])
 	.. code-block:: text
 		D     8.318301
 		T1    6.081590
 		T2    2.457122
 		...
 		dtype: float64
 	The output shows the variance inflation factor for each variable in *df*.
 	"""
 	if formula:
@ -444,6 +467,40 @@ def regress(
 	:type exp: bool
 	:rtype: :class:`yli.regress.RegressionResult`
 	**Example:**
 	.. code-block::
 		df = pd.DataFrame({
 			'Unhealthy': [False, False, False, ...],
 			'Fibrinogen': [2.52, 2.46, 2.29, ...],
 			'GammaGlobulin': [38, 36, 36, ...]
 		})
 		yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
 	.. code-block:: text
 		             Logistic Regression Results              
 		======================================================
 		Dep. Variable:  Unhealthy  |  No. Observations:     32
 		        Model:      Logit  |         Df. Model:      2
 		       Method:        MLE  |     Df. Residuals:     29
 		         Date: 2022-10-18  |         Pseudo R²:   0.26
 		         Time:   19:00:34  |          LL-Model: -11.47
 		  Std. Errors: Non-Robust  |           LL-Null: -15.44
 		                           |            p (LR):  0.02*
 		======================================================
 		                exp(β)   (95% CI)          p   
 		-----------------------------------------------
 		  (Intercept)     0.00 (0.00 -  0.24)    0.03* 
 		   Fibrinogen     6.80 (1.01 - 45.79)    0.049*
 		GammaGlobulin     1.17 (0.92 -  1.48)    0.19  
 		-----------------------------------------------
 	The output summarises the results of the regression.
 	Note that the parameter estimates are automatically exponentiated.
 	For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049.
 	"""
 	# Populate model_kwargs
@ -603,6 +660,37 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
 	Uses the R *logistf* library.
 	This class should be used in conjunction with :func:`yli.regress`.
 	**Example:**
 	.. code-block::
 		df = pd.DataFrame({
 			'Pred': [1] * 20 + [0] * 220,
 			'Outcome': [1] * 40 + [0] * 200
 		})
 		yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
 	.. code-block:: text
 		          Penalised Logistic Regression Results          
 		=========================================================
 		Dep. Variable:      Outcome  |  No. Observations:     240
 		        Model:        Logit  |         Df. Model:       1
 		       Method: Penalised ML  |         Pseudo R²:    0.37
 		         Date:   2022-10-19  |          LL-Model:  -66.43
 		         Time:     07:50:40  |           LL-Null: -105.91
 		  Std. Errors:   Non-Robust  |            p (LR): <0.001*
 		=========================================================
 		                β      (95% CI)          p   
 		---------------------------------------------
 		(Intercept)   -2.28 (-2.77 - -1.85)   <0.001*
 		       Pred    5.99  (3.95 - 10.85)   <0.001*
 		---------------------------------------------
 	The output summarises the result of the regression.
 	The summary table shows that a penalised method was applied.
 	Note that because `exp=False` was passed, the parameter estimates are not automatically exponentiated.
 	"""
 	def fit(self, disp=False):
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@ -78,6 +78,24 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'):
 	:type nan_policy: str
 	:rtype: :class:`yli.sig_tests.TTestResult`
 	**Example:**
 	.. code-block::
 		df = pd.DataFrame({
 			'Type': ['Fresh'] * 10 + ['Stored'] * 10,
 			'Potency': [10.2, 10.5, 10.3, ...]
 		})
 		yli.ttest_ind(df, 'Potency', 'Type')
 	.. code-block:: text
 		t(18) = 4.24; p < 0.001*
 		Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored
 	The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001.
 	The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81.
 	"""
 	# Check for/clean NaNs
@ -129,7 +147,7 @@ class FTestResult:
 		return super().__repr__()
 	def _repr_html_(self):
-		return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
+		return '<i>F</i>({:.0f}, {:.0f}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
 	def summary(self):
 		"""
@ -138,7 +156,7 @@ class FTestResult:
 		:rtype: str
 		"""
-		return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
+		return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
 def anova_oneway(df, dep, ind, *, nan_policy='omit'):
 	"""
@ -154,6 +172,22 @@ def anova_oneway(df, dep, ind, *, nan_policy='omit'):
 	:type nan_policy: str
 	:rtype: :class:`yli.sig_tests.FTestResult`
 	**Example:**
 	.. code-block::
 		df = pd.DataFrame({
 			'Method': [1]*8 + [2]*7 + [3]*9,
 			'Score': [96, 79, 91, ...]
 		})
 		yli.anova_oneway(df, 'Score', 'Method')
 	.. code-block:: text
 		F(2, 21) = 29.57; p < 0.001*
 	The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001.
 	"""
 	# Check for/clean NaNs
@ -182,7 +216,7 @@ class MannWhitneyResult:
 	"""
 	def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):
-		#: Mann–Whitney *U* statistic (*float*)
+		#: Lesser of the two Mann–Whitney *U* statistics (*float*)
 		self.statistic = statistic
 		#: *p* value for the *U* statistic (*float*)
 		self.pvalue = pvalue
@ -212,7 +246,7 @@ class MannWhitneyResult:
 		:rtype: str
 		"""
-		line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
+		line1 = 'U = {:.1f}; p {}\nr = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
 		if self.brunnermunzel:
 			return line1 + '\n' + self.brunnermunzel.summary()
 		else:
@ -273,6 +307,24 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
 	:param method: See *scipy.stats.mannwhitneyu*
 	:rtype: :class:`yli.sig_tests.MannWhitneyResult`
 	**Example:**
 	.. code-block::
 		df = pd.DataFrame({
 			'Sample': ['Before'] * 12 + ['After'] * 12,
 			'Oxygen': [11.0, 11.2, 11.2, ...]
 		})
 		yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less')
 	.. code-block:: text
 		U = 6.0; p < 0.001*
 		r = 0.92, Before > After
 	The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001.
 	The rank-biserial correlation is 0.92 in favour of the *Before* group.
 	"""
 	# Check for/clean NaNs
@ -352,10 +404,10 @@ class PearsonChiSquaredResult:
 		"""
 		if self.oddsratio is not None:
-			return '{0}\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
+			return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
 				self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
 		else:
-			return '{}\nχ²({}) = {:.2f}; p {}'.format(
+			return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
 				self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False))
 def chi2(df, dep, ind, *, nan_policy='warn'):
@ -363,8 +415,8 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
 	Perform a Pearson *χ*:sup:`2` test
 	If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
-	The ratios are calculated for the higher-valued value in each variable (i.e. ``True`` compared with ``False`` for a boolean).
+	The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean).
-	The risk ratio is calculated relative to the independent variable.
+	The risk ratio is calculated relative to the independent variable (rows of the contingency table).
 	:param df: Data to perform the test on
 	:type df: DataFrame
@ -376,6 +428,33 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
 	:type nan_policy: str
 	:rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
 	**Example:**
 	.. code-block::
 		df = pd.DataFrame({
 			'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]),
 			'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600])
 		})
 		yli.chi2(df, 'Stress', 'Response')
 	.. code-block:: text
 		Stress    False  True
 		Response             
 		False       250   400
 		True        750  1600
 		χ²(1) = 9.82; p = 0.002*
 		OR (95% CI) = 1.33 (1.11–1.60)
 		RR (95% CI) = 1.11 (1.03–1.18)
 	The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002.
 	The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60.
 	The risk of *Stress* in the *Response* = *True* group is 1.11 that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18.
 	"""
 	# Check for/clean NaNs
@ -448,7 +527,7 @@ class PearsonRResult:
 def pearsonr(df, dep, ind, *, nan_policy='warn'):
 	"""
-	Compute the Pearson correlation coefficient (Pearson's r)
+	Compute the Pearson correlation coefficient (Pearson's *r*)
 	:param df: Data to perform the test on
 	:type df: DataFrame