Add example code/output to documentation

This commit is contained in:
RunasSudo 2022-10-18 19:25:12 +11:00
parent 62c23efebc
commit d5ce46f6d0
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
2 changed files with 178 additions and 11 deletions

View File

@ -24,6 +24,7 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
from datetime import datetime
import itertools
import warnings
from .bayes_factors import BayesFactor, bayesfactor_afbf
from .config import config
@ -39,8 +40,30 @@ def vif(df, formula=None, *, nan_policy='warn'):
:param formula: If specified, calculate the VIF only for the variables in the formula
:type formula: str
:return: The variance inflation factor
:rtype: float
:return: The variance inflation factors
:rtype: Series
**Example:**
.. code-block::
df = pd.DataFrame({
'D': [68.58, 67.33, 67.33, ...],
'T1': [14, 10, 10, ...],
'T2': [46, 73, 85, ...],
...
})
yli.vif(df[['D', 'T1', 'T2', ...]])
.. code-block:: text
D 8.318301
T1 6.081590
T2 2.457122
...
dtype: float64
The output shows the variance inflation factor for each variable in *df*.
"""
if formula:
@ -444,6 +467,40 @@ def regress(
:type exp: bool
:rtype: :class:`yli.regress.RegressionResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Unhealthy': [False, False, False, ...],
'Fibrinogen': [2.52, 2.46, 2.29, ...],
'GammaGlobulin': [38, 36, 36, ...]
})
yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
.. code-block:: text
Logistic Regression Results
======================================================
Dep. Variable: Unhealthy | No. Observations: 32
Model: Logit | Df. Model: 2
Method: MLE | Df. Residuals: 29
Date: 2022-10-18 | Pseudo R²: 0.26
Time: 19:00:34 | LL-Model: -11.47
Std. Errors: Non-Robust | LL-Null: -15.44
| p (LR): 0.02*
======================================================
exp(β) (95% CI) p
-----------------------------------------------
(Intercept) 0.00 (0.00 - 0.24) 0.03*
Fibrinogen 6.80 (1.01 - 45.79) 0.049*
GammaGlobulin 1.17 (0.92 - 1.48) 0.19
-----------------------------------------------
The output summarises the results of the regression.
Note that the parameter estimates are automatically exponentiated.
For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049.
"""
# Populate model_kwargs
@ -603,6 +660,37 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
Uses the R *logistf* library.
This class should be used in conjunction with :func:`yli.regress`.
**Example:**
.. code-block::
df = pd.DataFrame({
'Pred': [1] * 20 + [0] * 220,
'Outcome': [1] * 40 + [0] * 200
})
yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
.. code-block:: text
Penalised Logistic Regression Results
=========================================================
Dep. Variable: Outcome | No. Observations: 240
Model: Logit | Df. Model: 1
Method: Penalised ML | Pseudo R²: 0.37
Date: 2022-10-19 | LL-Model: -66.43
Time: 07:50:40 | LL-Null: -105.91
Std. Errors: Non-Robust | p (LR): <0.001*
=========================================================
β (95% CI) p
---------------------------------------------
(Intercept) -2.28 (-2.77 - -1.85) <0.001*
Pred 5.99 (3.95 - 10.85) <0.001*
---------------------------------------------
The output summarises the result of the regression.
The summary table shows that a penalised method was applied.
Note that because `exp=False` was passed, the parameter estimates are not automatically exponentiated.
"""
def fit(self, disp=False):

View File

@ -78,6 +78,24 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'):
:type nan_policy: str
:rtype: :class:`yli.sig_tests.TTestResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Type': ['Fresh'] * 10 + ['Stored'] * 10,
'Potency': [10.2, 10.5, 10.3, ...]
})
yli.ttest_ind(df, 'Potency', 'Type')
.. code-block:: text
t(18) = 4.24; p < 0.001*
Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored
The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001.
The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81.
"""
# Check for/clean NaNs
@ -129,7 +147,7 @@ class FTestResult:
return super().__repr__()
def _repr_html_(self):
return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
return '<i>F</i>({:.0f}, {:.0f}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
def summary(self):
"""
@ -138,7 +156,7 @@ class FTestResult:
:rtype: str
"""
return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
def anova_oneway(df, dep, ind, *, nan_policy='omit'):
"""
@ -154,6 +172,22 @@ def anova_oneway(df, dep, ind, *, nan_policy='omit'):
:type nan_policy: str
:rtype: :class:`yli.sig_tests.FTestResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Method': [1]*8 + [2]*7 + [3]*9,
'Score': [96, 79, 91, ...]
})
yli.anova_oneway(df, 'Score', 'Method')
.. code-block:: text
F(2, 21) = 29.57; p < 0.001*
The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001.
"""
# Check for/clean NaNs
@ -182,7 +216,7 @@ class MannWhitneyResult:
"""
def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):
#: Mann–Whitney *U* statistic (*float*)
#: Lesser of the two Mann–Whitney *U* statistics (*float*)
self.statistic = statistic
#: *p* value for the *U* statistic (*float*)
self.pvalue = pvalue
@ -212,7 +246,7 @@ class MannWhitneyResult:
:rtype: str
"""
line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
line1 = 'U = {:.1f}; p {}\nr = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
if self.brunnermunzel:
return line1 + '\n' + self.brunnermunzel.summary()
else:
@ -273,6 +307,24 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
:param method: See *scipy.stats.mannwhitneyu*
:rtype: :class:`yli.sig_tests.MannWhitneyResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Sample': ['Before'] * 12 + ['After'] * 12,
'Oxygen': [11.0, 11.2, 11.2, ...]
})
yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less')
.. code-block:: text
U = 6.0; p < 0.001*
r = 0.92, Before > After
The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001.
The rank-biserial correlation is 0.92 in favour of the *Before* group.
"""
# Check for/clean NaNs
@ -352,10 +404,10 @@ class PearsonChiSquaredResult:
"""
if self.oddsratio is not None:
return '{0}\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
else:
return '{}\nχ²({}) = {:.2f}; p {}'.format(
return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False))
def chi2(df, dep, ind, *, nan_policy='warn'):
@ -363,8 +415,8 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
Perform a Pearson *χ*:sup:`2` test
If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
The ratios are calculated for the higher-valued value in each variable (i.e. ``True`` compared with ``False`` for a boolean).
The risk ratio is calculated relative to the independent variable.
The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean).
The risk ratio is calculated relative to the independent variable (rows of the contingency table).
:param df: Data to perform the test on
:type df: DataFrame
@ -376,6 +428,33 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
:type nan_policy: str
:rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]),
'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600])
})
yli.chi2(df, 'Stress', 'Response')
.. code-block:: text
Stress False True
Response
False 250 400
True 750 1600
χ²(1) = 9.82; p = 0.002*
OR (95% CI) = 1.33 (1.11–1.60)
RR (95% CI) = 1.11 (1.03–1.18)
The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002.
The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60.
The risk of *Stress* in the *Response* = *True* group is 1.11 times that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18.
"""
# Check for/clean NaNs
@ -448,7 +527,7 @@ class PearsonRResult:
def pearsonr(df, dep, ind, *, nan_policy='warn'):
"""
Compute the Pearson correlation coefficient (Pearson's r)
Compute the Pearson correlation coefficient (Pearson's *r*)
:param df: Data to perform the test on
:type df: DataFrame