Fixups and update documentation

2022-12-03 02:02:34 +11:00 · 2022-12-03 02:02:34 +11:00 · 56e16bc71d
commit 56e16bc71d
parent bbb998003e
6 changed files with 70 additions and 30 deletions
--- a/docs/internal.rst
+++ b/docs/internal.rst
@ -5,6 +5,8 @@ Data wrangling
 --------------

 .. autofunction:: yli.utils.as_2groups
+	
+.. autofunction:: yli.utils.as_numeric

 .. autofunction:: yli.utils.convert_pandas_nullable

@ -31,7 +33,7 @@ Data wrangling
 	
 	.. attribute:: HTML
 		
-		Format as HTML (i.e. escape ``<``)
+		Format as HTML (e.g. escape ``<``)

 Formula manipulation
 --------------------
@ -41,3 +43,26 @@ Formula manipulation
 .. autofunction:: yli.utils.formula_factor_ref_category

 .. autofunction:: yli.utils.parse_patsy_term
+
+Library style
+-------------
+
+For API nomenclature, the following guidelines are used:
+
+* Prefer to call a test by its specific name (e.g. *anova* rather than *ftest* where applicable), unless most commonly known only by the distribution of the test statistic (e.g. *chi2*, *ttest*).
+
+..
+
+* A test/statistic is not referred to by both a distribution and specific name (e.g. *mannwhitney* rather than *mannwhitneyu*), unless required for disambiguation (e.g. *pearsonr* to distinguish from the Pearson *χ*:sup:`2` test).
+
+..
+
+* The word "test" is omitted (e.g. *chi2* rather than *chi2test*), unless the name would otherwise be a single letter (e.g. *ttest*, *ftest*), or unless required for disambiguation (e.g. *LikelihoodRatioTestResult* to distinguish from the unrelated meaning of "likelihood ratio" in epidemiology).
+
+..
+
+* Underscores are usually omitted from the names of specific tests, test families and statistics (e.g. *ttest*, *oddsratio*, *pearsonr*, *pvalue*), but are used to separate these from other components (e.g. *ttest_ind*, *anova_oneway*, *lrtest_null*). There are a few exceptions (e.g. *rank_biserial*, *pseudo_rsquared*, *f_statistic*).
+
+..
+
+* The result class for a test has the same naming convention as the test function (e.g. *TTestResult* for *ttest_ind*), with abbreviations spelled out (e.g. *PearsonChiSquaredResult*, *LikelihoodRatioTestResult*); unless the result class is generic among several tests (e.g. *FTestResult* for *anova_oneway* and *RegressionResult.ftest*), or unless required for disambiguation (e.g. *PearsonChiSquaredResult* for *chi2*, as there are other *χ*:sup:`2` tests).
--- a/docs/regress.rst
+++ b/docs/regress.rst
@ -6,23 +6,23 @@ Functions

 .. autofunction:: yli.logit_then_regress

-.. autoclass:: yli.OrdinalLogit
-
-.. autoclass:: yli.PenalisedLogit
-
 .. autofunction:: yli.regress

 .. autofunction:: yli.vif

+Additional regression models
+----------------------------
+
+.. autoclass:: yli.OrdinalLogit
+
+.. autoclass:: yli.PenalisedLogit
+
 Result classes
 --------------

 .. autoclass:: yli.regress.BrantResult
 	:members:

-.. autoclass:: yli.regress.CategoricalTerm
-	:members:
-
 .. autoclass:: yli.regress.LikelihoodRatioTestResult
 	:members:
 	:inherited-members:
@ -30,5 +30,11 @@ Result classes
 .. autoclass:: yli.regress.RegressionResult
 	:members:

+Model terms
+-----------
+
+.. autoclass:: yli.regress.CategoricalTerm
+	:members:
+
 .. autoclass:: yli.regress.SingleTerm
 	:members:
--- a/yli/init.py
+++ b/yli/init.py
@ -26,9 +26,14 @@ def reload_me():
 	import importlib
 	import sys
 	
-	for k, v in list(sys.modules.items()):
-		if k == 'yli' or k.startswith('yli.'):
-			try:
-				importlib.reload(v)
-			except ModuleNotFoundError:
-				pass
+	for _ in range(2):  # Do it twice to make sure imports are also reloaded fully
+		for k, v in list(sys.modules.items()):
+			if k == 'yli' or k.startswith('yli.'):
+				try:
+					importlib.reload(v)
+				except ModuleNotFoundError as ex:
+					if ex.name.startswith('yli.'):
+						# Must be due to a module which we deleted - can safely ignore
+						pass
+					else:
+						raise ex
--- a/yli/descriptives.py
+++ b/yli/descriptives.py
@ -51,6 +51,7 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
 		
 		if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
 			# Categorical data
+			# FIXME: Sort order
 			values = sorted(data_cleaned.unique())
 			
 			# Value counts
--- a/yli/regress.py
+++ b/yli/regress.py
@ -130,7 +130,7 @@ class RegressionResult:
 		full_name, model_name, fit_method,
 		nobs, dof_model, fitted_dt, cov_type,
 		terms,
-		llf, llnull,
+		ll_model, ll_null,
 		dof_resid, rsquared, f_statistic,
 		comments,
 		exp
@ -178,9 +178,9 @@ class RegressionResult:
 		
 		# Model log-likelihood
 		#: Log-likelihood of fitted model (*float*)
-		self.llf = llf
+		self.ll_model = ll_model
 		#: Log-likelihood of null model (*float*)
-		self.llnull = llnull
+		self.ll_null = ll_null
 		
 		# Extra statistics (not all regression models have these)
 		#: Degrees of freedom for the residuals (*int*; *None* if N/A)
@ -201,7 +201,7 @@ class RegressionResult:
 	def pseudo_rsquared(self):
 		"""McFadden's pseudo *R*:sup:`2` statistic"""
 		
-		return 1 - self.llf/self.llnull
+		return 1 - self.ll_model/self.ll_null
 	
 	def lrtest_null(self):
 		"""
@ -210,7 +210,7 @@ class RegressionResult:
 		:rtype: :class:`LikelihoodRatioTestResult`
 		"""
 		
-		statistic = -2 * (self.llnull - self.llf)
+		statistic = -2 * (self.ll_null - self.ll_model)
 		pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model)
 		
 		return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue)
@ -308,7 +308,7 @@ class RegressionResult:
 		
 		# Fit individual logistic regressions
 		logit_models = []
-		for upper_limit in sorted(df[dep].unique())[:-1]:  # FIXME: Sort order
+		for upper_limit in sorted(df[dep].unique())[:-1]:
 			dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
 			logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs)
 			
@ -454,7 +454,7 @@ class RegressionResult:
 			self.full_name, self.model_name, self.fit_method,
 			self.nobs, self.dof_model, datetime.now(), 'Bootstrap',
 			terms,
-			self.llf, self.llnull,
+			self.ll_model, self.ll_null,
 			self.dof_resid, self.rsquared, self.f_statistic,
 			self.comments,
 			self.exp
@ -499,8 +499,8 @@ class RegressionResult:
 			# Otherwise report likelihood ratio test as overall test
 			lrtest_result = self.lrtest_null()
 			
-			right_col.append(('LL-Model:', format(self.llf, '.2f')))
-			right_col.append(('LL-Null:', format(self.llnull, '.2f')))
+			right_col.append(('LL-Model:', format(self.ll_model, '.2f')))
+			right_col.append(('LL-Null:', format(self.ll_null, '.2f')))
 			if html:
 				right_col.append(('<i>p</i> (LR):', fmt_p(lrtest_result.pvalue, PValueStyle.VALUE_ONLY | PValueStyle.HTML)))
 			else:
@ -859,9 +859,11 @@ def regress(
 				# Single term
 				terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
 	
-	# Fit null model (for llnull)
-	if hasattr(result, 'llnull'):
-		llnull = result.llnull
+	# Fit null model (for ll_null)
+	if hasattr(result, 'll_null'):
+		ll_null = result.ll_null
+	elif hasattr(result, 'llnull'):
+		ll_null = result.llnull
 	else:
 		# Construct null (intercept-only) model
 		#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
@ -870,7 +872,7 @@ def regress(
 		dm_exog['Intercept'].fillna(1, inplace=True)
 		
 		result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit()
-		llnull = result_null.llf
+		ll_null = result_null.llf
 	
 	if model_class is sm.OLS:
 		method_name = 'Least Squares'
@ -897,7 +899,7 @@ def regress(
 		full_name, model_class.__name__, method_name,
 		result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
 		terms,
-		result.llf, llnull,
+		result.llf, ll_null,
 		getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
 		[],
 		exp
@ -1030,7 +1032,7 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
 	statsmodels-compatible model for computing ordinal logistic (or probit) regression
 	
 	The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata.
-	The the native statsmodels implementation, the first cutoff term is the true cutoff, and further cutoff terms are log differences between consecutive cutoffs.
+	In the native statsmodels implementation, the first cutoff parameter is the true cutoff, but further cutoff parameters are log differences between consecutive cutoffs.
 	In this parameterisation, cutoff terms are represented directly in the model.
 	
 	**Example:**
--- a/yli/utils.py
+++ b/yli/utils.py
@ -109,6 +109,7 @@ def as_2groups(df, data, group):
 		raise Exception('Got {} values for {}, expected 2'.format(len(groups), group))
 	
 	# Get 2 groups
+	# FIXME: Sort order
 	group1 = groups[0][0]
 	data1 = df.loc[groups[0][1], data]
 	group2 = groups[1][0]
@ -118,7 +119,7 @@ def as_2groups(df, data, group):

 def as_numeric(data):
 	"""
-	Convert the given data to a numeric type, factorising if required
+	Convert the data to a numeric type, factorising if required
 	
 	:param data: Data to convert
 	:type data: Series