Fixups and update documentation

This commit is contained in:
RunasSudo 2022-12-03 02:02:34 +11:00
parent bbb998003e
commit 56e16bc71d
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
6 changed files with 70 additions and 30 deletions

View File

@ -5,6 +5,8 @@ Data wrangling
-------------- --------------
.. autofunction:: yli.utils.as_2groups .. autofunction:: yli.utils.as_2groups
.. autofunction:: yli.utils.as_numeric
.. autofunction:: yli.utils.convert_pandas_nullable .. autofunction:: yli.utils.convert_pandas_nullable
@ -31,7 +33,7 @@ Data wrangling
.. attribute:: HTML .. attribute:: HTML
Format as HTML (i.e. escape ``<``) Format as HTML (e.g. escape ``<``)
Formula manipulation Formula manipulation
-------------------- --------------------
@ -41,3 +43,26 @@ Formula manipulation
.. autofunction:: yli.utils.formula_factor_ref_category .. autofunction:: yli.utils.formula_factor_ref_category
.. autofunction:: yli.utils.parse_patsy_term .. autofunction:: yli.utils.parse_patsy_term
Library style
-------------
For API nomenclature, the following guidelines are used:
* Prefer to call a test by its specific name (e.g. *anova* rather than *ftest* where applicable), unless it is most commonly known only by the distribution of its test statistic (e.g. *chi2*, *ttest*).
..
* A test/statistic is not referred to by both a distribution and a specific name (e.g. *mannwhitney* rather than *mannwhitneyu*), unless required for disambiguation (e.g. *pearsonr* to distinguish from the Pearson *χ*:sup:`2` test).
..
* The word "test" is omitted (e.g. *chi2* rather than *chi2test*), unless the name would otherwise be a single letter (e.g. *ttest*, *ftest*), or unless required for disambiguation (e.g. *LikelihoodRatioTestResult* to distinguish from the unrelated meaning of "likelihood ratio" in epidemiology).
..
* Underscores are usually omitted from the names of specific tests, test families and statistics (e.g. *ttest*, *oddsratio*, *pearsonr*, *pvalue*), but are used to separate these from other components (e.g. *ttest_ind*, *anova_oneway*, *lrtest_null*). There are a few exceptions (e.g. *rank_biserial*, *pseudo_rsquared*, *f_statistic*).
..
* The result class for a test has the same naming convention as the test function (e.g. *TTestResult* for *ttest_ind*), with abbreviations spelled out (e.g. *PearsonChiSquaredResult*, *LikelihoodRatioTestResult*); unless the result class is generic among several tests (e.g. *FTestResult* for *anova_oneway* and *RegressionResult.ftest*), or unless required for disambiguation (e.g. *PearsonChiSquaredResult* for *chi2*, as there are other *χ*:sup:`2` tests).

View File

@ -6,23 +6,23 @@ Functions
.. autofunction:: yli.logit_then_regress .. autofunction:: yli.logit_then_regress
.. autoclass:: yli.OrdinalLogit
.. autoclass:: yli.PenalisedLogit
.. autofunction:: yli.regress .. autofunction:: yli.regress
.. autofunction:: yli.vif .. autofunction:: yli.vif
Additional regression models
----------------------------
.. autoclass:: yli.OrdinalLogit
.. autoclass:: yli.PenalisedLogit
Result classes Result classes
-------------- --------------
.. autoclass:: yli.regress.BrantResult .. autoclass:: yli.regress.BrantResult
:members: :members:
.. autoclass:: yli.regress.CategoricalTerm
:members:
.. autoclass:: yli.regress.LikelihoodRatioTestResult .. autoclass:: yli.regress.LikelihoodRatioTestResult
:members: :members:
:inherited-members: :inherited-members:
@ -30,5 +30,11 @@ Result classes
.. autoclass:: yli.regress.RegressionResult .. autoclass:: yli.regress.RegressionResult
:members: :members:
Model terms
-----------
.. autoclass:: yli.regress.CategoricalTerm
:members:
.. autoclass:: yli.regress.SingleTerm .. autoclass:: yli.regress.SingleTerm
:members: :members:

View File

@ -26,9 +26,14 @@ def reload_me():
import importlib import importlib
import sys import sys
for k, v in list(sys.modules.items()): for _ in range(2): # Do it twice to make sure imports are also reloaded fully
if k == 'yli' or k.startswith('yli.'): for k, v in list(sys.modules.items()):
try: if k == 'yli' or k.startswith('yli.'):
importlib.reload(v) try:
except ModuleNotFoundError: importlib.reload(v)
pass except ModuleNotFoundError as ex:
if ex.name.startswith('yli.'):
# Must be due to a module which we deleted - can safely ignore
pass
else:
raise ex

View File

@ -51,6 +51,7 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'): if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data # Categorical data
# FIXME: Sort order
values = sorted(data_cleaned.unique()) values = sorted(data_cleaned.unique())
# Value counts # Value counts

View File

@ -130,7 +130,7 @@ class RegressionResult:
full_name, model_name, fit_method, full_name, model_name, fit_method,
nobs, dof_model, fitted_dt, cov_type, nobs, dof_model, fitted_dt, cov_type,
terms, terms,
llf, llnull, ll_model, ll_null,
dof_resid, rsquared, f_statistic, dof_resid, rsquared, f_statistic,
comments, comments,
exp exp
@ -178,9 +178,9 @@ class RegressionResult:
# Model log-likelihood # Model log-likelihood
#: Log-likelihood of fitted model (*float*) #: Log-likelihood of fitted model (*float*)
self.llf = llf self.ll_model = ll_model
#: Log-likelihood of null model (*float*) #: Log-likelihood of null model (*float*)
self.llnull = llnull self.ll_null = ll_null
# Extra statistics (not all regression models have these) # Extra statistics (not all regression models have these)
#: Degrees of freedom for the residuals (*int*; *None* if N/A) #: Degrees of freedom for the residuals (*int*; *None* if N/A)
@ -201,7 +201,7 @@ class RegressionResult:
def pseudo_rsquared(self): def pseudo_rsquared(self):
"""McFadden's pseudo *R*:sup:`2` statistic""" """McFadden's pseudo *R*:sup:`2` statistic"""
return 1 - self.llf/self.llnull return 1 - self.ll_model/self.ll_null
def lrtest_null(self): def lrtest_null(self):
""" """
@ -210,7 +210,7 @@ class RegressionResult:
:rtype: :class:`LikelihoodRatioTestResult` :rtype: :class:`LikelihoodRatioTestResult`
""" """
statistic = -2 * (self.llnull - self.llf) statistic = -2 * (self.ll_null - self.ll_model)
pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model) pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model)
return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue) return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue)
@ -308,7 +308,7 @@ class RegressionResult:
# Fit individual logistic regressions # Fit individual logistic regressions
logit_models = [] logit_models = []
for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order for upper_limit in sorted(df[dep].unique())[:-1]:
dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True) dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs) logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs)
@ -454,7 +454,7 @@ class RegressionResult:
self.full_name, self.model_name, self.fit_method, self.full_name, self.model_name, self.fit_method,
self.nobs, self.dof_model, datetime.now(), 'Bootstrap', self.nobs, self.dof_model, datetime.now(), 'Bootstrap',
terms, terms,
self.llf, self.llnull, self.ll_model, self.ll_null,
self.dof_resid, self.rsquared, self.f_statistic, self.dof_resid, self.rsquared, self.f_statistic,
self.comments, self.comments,
self.exp self.exp
@ -499,8 +499,8 @@ class RegressionResult:
# Otherwise report likelihood ratio test as overall test # Otherwise report likelihood ratio test as overall test
lrtest_result = self.lrtest_null() lrtest_result = self.lrtest_null()
right_col.append(('LL-Model:', format(self.llf, '.2f'))) right_col.append(('LL-Model:', format(self.ll_model, '.2f')))
right_col.append(('LL-Null:', format(self.llnull, '.2f'))) right_col.append(('LL-Null:', format(self.ll_null, '.2f')))
if html: if html:
right_col.append(('<i>p</i> (LR):', fmt_p(lrtest_result.pvalue, PValueStyle.VALUE_ONLY | PValueStyle.HTML))) right_col.append(('<i>p</i> (LR):', fmt_p(lrtest_result.pvalue, PValueStyle.VALUE_ONLY | PValueStyle.HTML)))
else: else:
@ -859,9 +859,11 @@ def regress(
# Single term # Single term
terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name]) terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
# Fit null model (for llnull) # Fit null model (for ll_null)
if hasattr(result, 'llnull'): if hasattr(result, 'll_null'):
llnull = result.llnull ll_null = result.ll_null
elif hasattr(result, 'llnull'):
ll_null = result.llnull
else: else:
# Construct null (intercept-only) model # Construct null (intercept-only) model
#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit() #result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
@ -870,7 +872,7 @@ def regress(
dm_exog['Intercept'].fillna(1, inplace=True) dm_exog['Intercept'].fillna(1, inplace=True)
result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit() result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit()
llnull = result_null.llf ll_null = result_null.llf
if model_class is sm.OLS: if model_class is sm.OLS:
method_name = 'Least Squares' method_name = 'Least Squares'
@ -897,7 +899,7 @@ def regress(
full_name, model_class.__name__, method_name, full_name, model_class.__name__, method_name,
result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'), result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
terms, terms,
result.llf, llnull, result.llf, ll_null,
getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None), getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
[], [],
exp exp
@ -1030,7 +1032,7 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
statsmodels-compatible model for computing ordinal logistic (or probit) regression statsmodels-compatible model for computing ordinal logistic (or probit) regression
The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata. The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata.
The the native statsmodels implementation, the first cutoff term is the true cutoff, and further cutoff terms are log differences between consecutive cutoffs. In the native statsmodels implementation, the first cutoff parameter is the true cutoff, but further cutoff parameters are log differences between consecutive cutoffs.
In this parameterisation, cutoff terms are represented directly in the model. In this parameterisation, cutoff terms are represented directly in the model.
**Example:** **Example:**

View File

@ -109,6 +109,7 @@ def as_2groups(df, data, group):
raise Exception('Got {} values for {}, expected 2'.format(len(groups), group)) raise Exception('Got {} values for {}, expected 2'.format(len(groups), group))
# Get 2 groups # Get 2 groups
# FIXME: Sort order
group1 = groups[0][0] group1 = groups[0][0]
data1 = df.loc[groups[0][1], data] data1 = df.loc[groups[0][1], data]
group2 = groups[1][0] group2 = groups[1][0]
@ -118,7 +119,7 @@ def as_2groups(df, data, group):
def as_numeric(data): def as_numeric(data):
""" """
Convert the given data to a numeric type, factorising if required Convert the data to a numeric type, factorising if required
:param data: Data to convert :param data: Data to convert
:type data: Series :type data: Series