From 56e16bc71d12237c4b27b99a551dd70830b95b6f Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Sat, 3 Dec 2022 02:02:34 +1100 Subject: [PATCH] Fixups and update documentation --- docs/internal.rst | 27 ++++++++++++++++++++++++++- docs/regress.rst | 20 +++++++++++++------- yli/__init__.py | 17 +++++++++++------ yli/descriptives.py | 1 + yli/regress.py | 32 +++++++++++++++++--------------- yli/utils.py | 3 ++- 6 files changed, 70 insertions(+), 30 deletions(-) diff --git a/docs/internal.rst b/docs/internal.rst index 0774bac..ddaae53 100644 --- a/docs/internal.rst +++ b/docs/internal.rst @@ -5,6 +5,8 @@ Data wrangling -------------- .. autofunction:: yli.utils.as_2groups + +.. autofunction:: yli.utils.as_numeric .. autofunction:: yli.utils.convert_pandas_nullable @@ -31,7 +33,7 @@ Data wrangling .. attribute:: HTML - Format as HTML (i.e. escape ``<``) + Format as HTML (e.g. escape ``<``) Formula manipulation -------------------- @@ -41,3 +43,26 @@ Formula manipulation .. autofunction:: yli.utils.formula_factor_ref_category .. autofunction:: yli.utils.parse_patsy_term + +Library style +------------- + +For API nomenclature, the following guidelines are used: + +* Prefer to call a test by its specific name (e.g. *anova* rather than *ftest* where applicable), unless most commonly known only by the distribution of the test statistic (e.g. *chi2*, *ttest*). + +.. + +* A test/statistic is not referred to by both a distribution and specific name (e.g. *mannwhitney* rather than *mannwhitneyu*), unless required for disambiguation (e.g. *pearsonr* to distinguish the Pearson *χ*:sup:`2` test). + +.. + +* The word "test" is omitted (e.g. *chi2* rather than *chi2test*), unless the name would otherwise be a single letter (e.g. *ttest*, *ftest*), or unless required for disambiguation (e.g. *LikelihoodRatioTestResult* to distinguish from the unrelated meaning of "likelihood ratio" in epidemiology). + +.. 
+ +* Underscores are usually omitted from the names of specific tests, test families and statistics (e.g. *ttest*, *oddsratio*, *pearsonr*, *pvalue*), but are used to separate these from other components (e.g. *ttest_ind*, *anova_oneway*, *lrtest_null*). There are a few exceptions (e.g. *rank_biserial*, *pseudo_rsquared*, *f_statistic*). + +.. + +* The result class for a test has the same naming convention as the test function (e.g. *TTestResult* for *ttest_ind*), with abbreviations spelled out (e.g. *PearsonChiSquaredResult*, *LikelihoodRatioTestResult*); unless the result class is generic among several tests (e.g. *FTestResult* for *anova_oneway* and *RegressionResult.ftest*), or unless required for disambiguation (e.g. *PearsonChiSquaredResult* for *chi2*, as there are other *χ*:sup:`2` tests). diff --git a/docs/regress.rst b/docs/regress.rst index 6c442d9..f5ab50e 100644 --- a/docs/regress.rst +++ b/docs/regress.rst @@ -6,23 +6,23 @@ Functions .. autofunction:: yli.logit_then_regress -.. autoclass:: yli.OrdinalLogit - -.. autoclass:: yli.PenalisedLogit - .. autofunction:: yli.regress .. autofunction:: yli.vif +Additional regression models +---------------------------- + +.. autoclass:: yli.OrdinalLogit + +.. autoclass:: yli.PenalisedLogit + Result classes -------------- .. autoclass:: yli.regress.BrantResult :members: -.. autoclass:: yli.regress.CategoricalTerm - :members: - .. autoclass:: yli.regress.LikelihoodRatioTestResult :members: :inherited-members: @@ -30,5 +30,11 @@ Result classes .. autoclass:: yli.regress.RegressionResult :members: +Model terms +----------- + +.. autoclass:: yli.regress.CategoricalTerm + :members: + .. 
autoclass:: yli.regress.SingleTerm :members: diff --git a/yli/__init__.py b/yli/__init__.py index 0b527bd..35e1f9d 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -26,9 +26,14 @@ def reload_me(): import importlib import sys - for k, v in list(sys.modules.items()): - if k == 'yli' or k.startswith('yli.'): - try: - importlib.reload(v) - except ModuleNotFoundError: - pass + for _ in range(2): # Do it twice to make sure imports are also reloaded fully + for k, v in list(sys.modules.items()): + if k == 'yli' or k.startswith('yli.'): + try: + importlib.reload(v) + except ModuleNotFoundError as ex: + if ex.name.startswith('yli.'): + # Must be due to a module which we deleted - can safely ignore + pass + else: + raise ex diff --git a/yli/descriptives.py b/yli/descriptives.py index eeb9a8e..f798dbe 100644 --- a/yli/descriptives.py +++ b/yli/descriptives.py @@ -51,6 +51,7 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]): if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'): # Categorical data + # FIXME: Sort order values = sorted(data_cleaned.unique()) # Value counts diff --git a/yli/regress.py b/yli/regress.py index b4f6e61..ec79cc6 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -130,7 +130,7 @@ class RegressionResult: full_name, model_name, fit_method, nobs, dof_model, fitted_dt, cov_type, terms, - llf, llnull, + ll_model, ll_null, dof_resid, rsquared, f_statistic, comments, exp @@ -178,9 +178,9 @@ class RegressionResult: # Model log-likelihood #: Log-likelihood of fitted model (*float*) - self.llf = llf + self.ll_model = ll_model #: Log-likelihood of null model (*float*) - self.llnull = llnull + self.ll_null = ll_null # Extra statistics (not all regression models have these) #: Degrees of freedom for the residuals (*int*; *None* if N/A) @@ -201,7 +201,7 @@ class RegressionResult: def pseudo_rsquared(self): """McFadden's pseudo *R*:sup:`2` statistic""" - return 1 - self.llf/self.llnull + return 1 - self.ll_model/self.ll_null 
def lrtest_null(self): """ @@ -210,7 +210,7 @@ class RegressionResult: :rtype: :class:`LikelihoodRatioTestResult` """ - statistic = -2 * (self.llnull - self.llf) + statistic = -2 * (self.ll_null - self.ll_model) pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model) return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue) @@ -308,7 +308,7 @@ class RegressionResult: # Fit individual logistic regressions logit_models = [] - for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order + for upper_limit in sorted(df[dep].unique())[:-1]: dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True) logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs) @@ -454,7 +454,7 @@ class RegressionResult: self.full_name, self.model_name, self.fit_method, self.nobs, self.dof_model, datetime.now(), 'Bootstrap', terms, - self.llf, self.llnull, + self.ll_model, self.ll_null, self.dof_resid, self.rsquared, self.f_statistic, self.comments, self.exp @@ -499,8 +499,8 @@ class RegressionResult: # Otherwise report likelihood ratio test as overall test lrtest_result = self.lrtest_null() - right_col.append(('LL-Model:', format(self.llf, '.2f'))) - right_col.append(('LL-Null:', format(self.llnull, '.2f'))) + right_col.append(('LL-Model:', format(self.ll_model, '.2f'))) + right_col.append(('LL-Null:', format(self.ll_null, '.2f'))) if html: right_col.append(('p (LR):', fmt_p(lrtest_result.pvalue, PValueStyle.VALUE_ONLY | PValueStyle.HTML))) else: @@ -859,9 +859,11 @@ def regress( # Single term terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name]) - # Fit null model (for llnull) - if hasattr(result, 'llnull'): - llnull = result.llnull + # Fit null model (for ll_null) + if hasattr(result, 'll_null'): + ll_null = result.ll_null + elif hasattr(result, 'llnull'): + ll_null = result.llnull else: # Construct null (intercept-only) model #result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit() @@ -870,7 
+872,7 @@ def regress( dm_exog['Intercept'].fillna(1, inplace=True) result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit() - llnull = result_null.llf + ll_null = result_null.llf if model_class is sm.OLS: method_name = 'Least Squares' @@ -897,7 +899,7 @@ def regress( full_name, model_class.__name__, method_name, result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'), terms, - result.llf, llnull, + result.llf, ll_null, getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None), [], exp @@ -1030,7 +1032,7 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel): statsmodels-compatible model for computing ordinal logistic (or probit) regression The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata. - The the native statsmodels implementation, the first cutoff term is the true cutoff, and further cutoff terms are log differences between consecutive cutoffs. + In the native statsmodels implementation, the first cutoff parameter is the true cutoff, but further cutoff parameters are log differences between consecutive cutoffs. In this parameterisation, cutoff terms are represented directly in the model. **Example:** diff --git a/yli/utils.py b/yli/utils.py index 9641ede..1e0da9d 100644 --- a/yli/utils.py +++ b/yli/utils.py @@ -109,6 +109,7 @@ def as_2groups(df, data, group): raise Exception('Got {} values for {}, expected 2'.format(len(groups), group)) # Get 2 groups + # FIXME: Sort order group1 = groups[0][0] data1 = df.loc[groups[0][1], data] group2 = groups[1][0] @@ -118,7 +119,7 @@ def as_2groups(df, data, group): def as_numeric(data): """ - Convert the given data to a numeric type, factorising if required + Convert the data to a numeric type, factorising if required :param data: Data to convert :type df: Series