Fixups and update documentation

This commit is contained in:
RunasSudo 2022-12-03 02:02:34 +11:00
parent bbb998003e
commit 56e16bc71d
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
6 changed files with 70 additions and 30 deletions

View File

@ -6,6 +6,8 @@ Data wrangling
.. autofunction:: yli.utils.as_2groups
.. autofunction:: yli.utils.as_numeric
.. autofunction:: yli.utils.convert_pandas_nullable
*p* values
@ -31,7 +33,7 @@ Data wrangling
.. attribute:: HTML
Format as HTML (i.e. escape ``<``)
Format as HTML (e.g. escape ``<``)
Formula manipulation
--------------------
@ -41,3 +43,26 @@ Formula manipulation
.. autofunction:: yli.utils.formula_factor_ref_category
.. autofunction:: yli.utils.parse_patsy_term
Library style
-------------
For API nomenclature, the following guidelines are used:
* Prefer to call a test by its specific name (e.g. *anova* rather than *ftest* where applicable), unless most commonly known only by the distribution of the test statistic (e.g. *chi2*, *ttest*).
..
* A test/statistic is not referred to by both a distribution and specific name (e.g. *mannwhitney* rather than *mannwhitneyu*), unless required for disambiguation (e.g. *pearsonr* to distinguish from the Pearson *χ*:sup:`2` test).
..
* The word "test" is omitted (e.g. *chi2* rather than *chi2test*), unless the name would otherwise be a single letter (e.g. *ttest*, *ftest*), or unless required for disambiguation (e.g. *LikelihoodRatioTestResult* to distinguish from the unrelated meaning of "likelihood ratio" in epidemiology).
..
* Underscores are usually omitted from the names of specific tests, test families and statistics (e.g. *ttest*, *oddsratio*, *pearsonr*, *pvalue*), but are used to separate these from other components (e.g. *ttest_ind*, *anova_oneway*, *lrtest_null*). There are a few exceptions (e.g. *rank_biserial*, *pseudo_rsquared*, *f_statistic*).
..
* The result class for a test has the same naming convention as the test function (e.g. *TTestResult* for *ttest_ind*), with abbreviations spelled out (e.g. *PearsonChiSquaredResult*, *LikelihoodRatioTestResult*); unless the result class is generic among several tests (e.g. *FTestResult* for *anova_oneway* and *RegressionResult.ftest*), or unless required for disambiguation (e.g. *PearsonChiSquaredResult* for *chi2*, as there are other *χ*:sup:`2` tests).

View File

@ -6,23 +6,23 @@ Functions
.. autofunction:: yli.logit_then_regress
.. autoclass:: yli.OrdinalLogit
.. autoclass:: yli.PenalisedLogit
.. autofunction:: yli.regress
.. autofunction:: yli.vif
Additional regression models
----------------------------
.. autoclass:: yli.OrdinalLogit
.. autoclass:: yli.PenalisedLogit
Result classes
--------------
.. autoclass:: yli.regress.BrantResult
:members:
.. autoclass:: yli.regress.CategoricalTerm
:members:
.. autoclass:: yli.regress.LikelihoodRatioTestResult
:members:
:inherited-members:
@ -30,5 +30,11 @@ Result classes
.. autoclass:: yli.regress.RegressionResult
:members:
Model terms
-----------
.. autoclass:: yli.regress.CategoricalTerm
:members:
.. autoclass:: yli.regress.SingleTerm
:members:

View File

@ -26,9 +26,14 @@ def reload_me():
import importlib
import sys
for _ in range(2): # Do it twice to make sure imports are also reloaded fully
for k, v in list(sys.modules.items()):
if k == 'yli' or k.startswith('yli.'):
try:
importlib.reload(v)
except ModuleNotFoundError:
except ModuleNotFoundError as ex:
if ex.name.startswith('yli.'):
# Must be due to a module which we deleted - can safely ignore
pass
else:
raise ex

View File

@ -51,6 +51,7 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data
# FIXME: Sort order
values = sorted(data_cleaned.unique())
# Value counts

View File

@ -130,7 +130,7 @@ class RegressionResult:
full_name, model_name, fit_method,
nobs, dof_model, fitted_dt, cov_type,
terms,
llf, llnull,
ll_model, ll_null,
dof_resid, rsquared, f_statistic,
comments,
exp
@ -178,9 +178,9 @@ class RegressionResult:
# Model log-likelihood
#: Log-likelihood of fitted model (*float*)
self.llf = llf
self.ll_model = ll_model
#: Log-likelihood of null model (*float*)
self.llnull = llnull
self.ll_null = ll_null
# Extra statistics (not all regression models have these)
#: Degrees of freedom for the residuals (*int*; *None* if N/A)
@ -201,7 +201,7 @@ class RegressionResult:
def pseudo_rsquared(self):
"""McFadden's pseudo *R*:sup:`2` statistic"""
return 1 - self.llf/self.llnull
return 1 - self.ll_model/self.ll_null
def lrtest_null(self):
"""
@ -210,7 +210,7 @@ class RegressionResult:
:rtype: :class:`LikelihoodRatioTestResult`
"""
statistic = -2 * (self.llnull - self.llf)
statistic = -2 * (self.ll_null - self.ll_model)
pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model)
return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue)
@ -308,7 +308,7 @@ class RegressionResult:
# Fit individual logistic regressions
logit_models = []
for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
for upper_limit in sorted(df[dep].unique())[:-1]:
dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs)
@ -454,7 +454,7 @@ class RegressionResult:
self.full_name, self.model_name, self.fit_method,
self.nobs, self.dof_model, datetime.now(), 'Bootstrap',
terms,
self.llf, self.llnull,
self.ll_model, self.ll_null,
self.dof_resid, self.rsquared, self.f_statistic,
self.comments,
self.exp
@ -499,8 +499,8 @@ class RegressionResult:
# Otherwise report likelihood ratio test as overall test
lrtest_result = self.lrtest_null()
right_col.append(('LL-Model:', format(self.llf, '.2f')))
right_col.append(('LL-Null:', format(self.llnull, '.2f')))
right_col.append(('LL-Model:', format(self.ll_model, '.2f')))
right_col.append(('LL-Null:', format(self.ll_null, '.2f')))
if html:
right_col.append(('<i>p</i> (LR):', fmt_p(lrtest_result.pvalue, PValueStyle.VALUE_ONLY | PValueStyle.HTML)))
else:
@ -859,9 +859,11 @@ def regress(
# Single term
terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
# Fit null model (for llnull)
if hasattr(result, 'llnull'):
llnull = result.llnull
# Fit null model (for ll_null)
if hasattr(result, 'll_null'):
ll_null = result.ll_null
elif hasattr(result, 'llnull'):
ll_null = result.llnull
else:
# Construct null (intercept-only) model
#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
@ -870,7 +872,7 @@ def regress(
dm_exog['Intercept'].fillna(1, inplace=True)
result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit()
llnull = result_null.llf
ll_null = result_null.llf
if model_class is sm.OLS:
method_name = 'Least Squares'
@ -897,7 +899,7 @@ def regress(
full_name, model_class.__name__, method_name,
result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
terms,
result.llf, llnull,
result.llf, ll_null,
getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
[],
exp
@ -1030,7 +1032,7 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
statsmodels-compatible model for computing ordinal logistic (or probit) regression
The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata.
The the native statsmodels implementation, the first cutoff term is the true cutoff, and further cutoff terms are log differences between consecutive cutoffs.
In the native statsmodels implementation, the first cutoff parameter is the true cutoff, but further cutoff parameters are log differences between consecutive cutoffs.
In this parameterisation, cutoff terms are represented directly in the model.
**Example:**

View File

@ -109,6 +109,7 @@ def as_2groups(df, data, group):
raise Exception('Got {} values for {}, expected 2'.format(len(groups), group))
# Get 2 groups
# FIXME: Sort order
group1 = groups[0][0]
data1 = df.loc[groups[0][1], data]
group2 = groups[1][0]
@ -118,7 +119,7 @@ def as_2groups(df, data, group):
def as_numeric(data):
"""
Convert the given data to a numeric type, factorising if required
Convert the data to a numeric type, factorising if required
:param data: Data to convert
:type data: Series