Fixups and update documentation
This commit is contained in:
parent
bbb998003e
commit
56e16bc71d
@ -5,6 +5,8 @@ Data wrangling
|
||||
--------------
|
||||
|
||||
.. autofunction:: yli.utils.as_2groups
|
||||
|
||||
.. autofunction:: yli.utils.as_numeric
|
||||
|
||||
.. autofunction:: yli.utils.convert_pandas_nullable
|
||||
|
||||
@ -31,7 +33,7 @@ Data wrangling
|
||||
|
||||
.. attribute:: HTML
|
||||
|
||||
Format as HTML (i.e. escape ``<``)
|
||||
Format as HTML (e.g. escape ``<``)
|
||||
|
||||
Formula manipulation
|
||||
--------------------
|
||||
@ -41,3 +43,26 @@ Formula manipulation
|
||||
.. autofunction:: yli.utils.formula_factor_ref_category
|
||||
|
||||
.. autofunction:: yli.utils.parse_patsy_term
|
||||
|
||||
Library style
|
||||
-------------
|
||||
|
||||
For API nomenclature, the following guidelines are used:
|
||||
|
||||
* Prefer to call a test by its specific name (e.g. *anova* rather than *ftest* where applicable), unless most commonly known only by the distribution of the test statistic (e.g. *chi2*, *ttest*).
|
||||
|
||||
..
|
||||
|
||||
* A test/statistic is not referred to by both a distribution and specific name (e.g. *mannwhitney* rather than *mannwhitneyu*), unless required for disambiguation (e.g. *pearsonr* to distinguish from the Pearson *χ*:sup:`2` test).
|
||||
|
||||
..
|
||||
|
||||
* The word "test" is omitted (e.g. *chi2* rather than *chi2test*), unless the name would otherwise be a single letter (e.g. *ttest*, *ftest*), or unless required for disambiguation (e.g. *LikelihoodRatioTestResult* to distinguish from the unrelated meaning of "likelihood ratio" in epidemiology).
|
||||
|
||||
..
|
||||
|
||||
* Underscores are usually omitted from the names of specific tests, test families and statistics (e.g. *ttest*, *oddsratio*, *pearsonr*, *pvalue*), but are used to separate these from other components (e.g. *ttest_ind*, *anova_oneway*, *lrtest_null*). There are a few exceptions (e.g. *rank_biserial*, *pseudo_rsquared*, *f_statistic*).
|
||||
|
||||
..
|
||||
|
||||
* The result class for a test has the same naming convention as the test function (e.g. *TTestResult* for *ttest_ind*), with abbreviations spelled out (e.g. *PearsonChiSquaredResult*, *LikelihoodRatioTestResult*); unless the result class is generic among several tests (e.g. *FTestResult* for *anova_oneway* and *RegressionResult.ftest*), or unless required for disambiguation (e.g. *PearsonChiSquaredResult* for *chi2*, as there are other *χ*:sup:`2` tests).
|
||||
|
@ -6,23 +6,23 @@ Functions
|
||||
|
||||
.. autofunction:: yli.logit_then_regress
|
||||
|
||||
.. autoclass:: yli.OrdinalLogit
|
||||
|
||||
.. autoclass:: yli.PenalisedLogit
|
||||
|
||||
.. autofunction:: yli.regress
|
||||
|
||||
.. autofunction:: yli.vif
|
||||
|
||||
Additional regression models
|
||||
----------------------------
|
||||
|
||||
.. autoclass:: yli.OrdinalLogit
|
||||
|
||||
.. autoclass:: yli.PenalisedLogit
|
||||
|
||||
Result classes
|
||||
--------------
|
||||
|
||||
.. autoclass:: yli.regress.BrantResult
|
||||
:members:
|
||||
|
||||
.. autoclass:: yli.regress.CategoricalTerm
|
||||
:members:
|
||||
|
||||
.. autoclass:: yli.regress.LikelihoodRatioTestResult
|
||||
:members:
|
||||
:inherited-members:
|
||||
@ -30,5 +30,11 @@ Result classes
|
||||
.. autoclass:: yli.regress.RegressionResult
|
||||
:members:
|
||||
|
||||
Model terms
|
||||
-----------
|
||||
|
||||
.. autoclass:: yli.regress.CategoricalTerm
|
||||
:members:
|
||||
|
||||
.. autoclass:: yli.regress.SingleTerm
|
||||
:members:
|
||||
|
@ -26,9 +26,14 @@ def reload_me():
|
||||
import importlib
|
||||
import sys
|
||||
|
||||
for k, v in list(sys.modules.items()):
|
||||
if k == 'yli' or k.startswith('yli.'):
|
||||
try:
|
||||
importlib.reload(v)
|
||||
except ModuleNotFoundError:
|
||||
pass
|
||||
for _ in range(2): # Do it twice to make sure imports are also reloaded fully
|
||||
for k, v in list(sys.modules.items()):
|
||||
if k == 'yli' or k.startswith('yli.'):
|
||||
try:
|
||||
importlib.reload(v)
|
||||
except ModuleNotFoundError as ex:
|
||||
if ex.name.startswith('yli.'):
|
||||
# Must be due to a module which we deleted - can safely ignore
|
||||
pass
|
||||
else:
|
||||
raise ex
|
||||
|
@ -51,6 +51,7 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
|
||||
|
||||
if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
|
||||
# Categorical data
|
||||
# FIXME: Sort order
|
||||
values = sorted(data_cleaned.unique())
|
||||
|
||||
# Value counts
|
||||
|
@ -130,7 +130,7 @@ class RegressionResult:
|
||||
full_name, model_name, fit_method,
|
||||
nobs, dof_model, fitted_dt, cov_type,
|
||||
terms,
|
||||
llf, llnull,
|
||||
ll_model, ll_null,
|
||||
dof_resid, rsquared, f_statistic,
|
||||
comments,
|
||||
exp
|
||||
@ -178,9 +178,9 @@ class RegressionResult:
|
||||
|
||||
# Model log-likelihood
|
||||
#: Log-likelihood of fitted model (*float*)
|
||||
self.llf = llf
|
||||
self.ll_model = ll_model
|
||||
#: Log-likelihood of null model (*float*)
|
||||
self.llnull = llnull
|
||||
self.ll_null = ll_null
|
||||
|
||||
# Extra statistics (not all regression models have these)
|
||||
#: Degrees of freedom for the residuals (*int*; *None* if N/A)
|
||||
@ -201,7 +201,7 @@ class RegressionResult:
|
||||
def pseudo_rsquared(self):
|
||||
"""McFadden's pseudo *R*:sup:`2` statistic"""
|
||||
|
||||
return 1 - self.llf/self.llnull
|
||||
return 1 - self.ll_model/self.ll_null
|
||||
|
||||
def lrtest_null(self):
|
||||
"""
|
||||
@ -210,7 +210,7 @@ class RegressionResult:
|
||||
:rtype: :class:`LikelihoodRatioTestResult`
|
||||
"""
|
||||
|
||||
statistic = -2 * (self.llnull - self.llf)
|
||||
statistic = -2 * (self.ll_null - self.ll_model)
|
||||
pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model)
|
||||
|
||||
return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue)
|
||||
@ -308,7 +308,7 @@ class RegressionResult:
|
||||
|
||||
# Fit individual logistic regressions
|
||||
logit_models = []
|
||||
for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
|
||||
for upper_limit in sorted(df[dep].unique())[:-1]:
|
||||
dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
|
||||
logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs)
|
||||
|
||||
@ -454,7 +454,7 @@ class RegressionResult:
|
||||
self.full_name, self.model_name, self.fit_method,
|
||||
self.nobs, self.dof_model, datetime.now(), 'Bootstrap',
|
||||
terms,
|
||||
self.llf, self.llnull,
|
||||
self.ll_model, self.ll_null,
|
||||
self.dof_resid, self.rsquared, self.f_statistic,
|
||||
self.comments,
|
||||
self.exp
|
||||
@ -499,8 +499,8 @@ class RegressionResult:
|
||||
# Otherwise report likelihood ratio test as overall test
|
||||
lrtest_result = self.lrtest_null()
|
||||
|
||||
right_col.append(('LL-Model:', format(self.llf, '.2f')))
|
||||
right_col.append(('LL-Null:', format(self.llnull, '.2f')))
|
||||
right_col.append(('LL-Model:', format(self.ll_model, '.2f')))
|
||||
right_col.append(('LL-Null:', format(self.ll_null, '.2f')))
|
||||
if html:
|
||||
right_col.append(('<i>p</i> (LR):', fmt_p(lrtest_result.pvalue, PValueStyle.VALUE_ONLY | PValueStyle.HTML)))
|
||||
else:
|
||||
@ -859,9 +859,11 @@ def regress(
|
||||
# Single term
|
||||
terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
||||
|
||||
# Fit null model (for llnull)
|
||||
if hasattr(result, 'llnull'):
|
||||
llnull = result.llnull
|
||||
# Fit null model (for ll_null)
|
||||
if hasattr(result, 'll_null'):
|
||||
ll_null = result.ll_null
|
||||
elif hasattr(result, 'llnull'):
|
||||
ll_null = result.llnull
|
||||
else:
|
||||
# Construct null (intercept-only) model
|
||||
#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
|
||||
@ -870,7 +872,7 @@ def regress(
|
||||
dm_exog['Intercept'].fillna(1, inplace=True)
|
||||
|
||||
result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit()
|
||||
llnull = result_null.llf
|
||||
ll_null = result_null.llf
|
||||
|
||||
if model_class is sm.OLS:
|
||||
method_name = 'Least Squares'
|
||||
@ -897,7 +899,7 @@ def regress(
|
||||
full_name, model_class.__name__, method_name,
|
||||
result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
|
||||
terms,
|
||||
result.llf, llnull,
|
||||
result.llf, ll_null,
|
||||
getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
|
||||
[],
|
||||
exp
|
||||
@ -1030,7 +1032,7 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
|
||||
statsmodels-compatible model for computing ordinal logistic (or probit) regression
|
||||
|
||||
The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata.
|
||||
The the native statsmodels implementation, the first cutoff term is the true cutoff, and further cutoff terms are log differences between consecutive cutoffs.
|
||||
In the native statsmodels implementation, the first cutoff parameter is the true cutoff, but further cutoff parameters are log differences between consecutive cutoffs.
|
||||
In this parameterisation, cutoff terms are represented directly in the model.
|
||||
|
||||
**Example:**
|
||||
|
@ -109,6 +109,7 @@ def as_2groups(df, data, group):
|
||||
raise Exception('Got {} values for {}, expected 2'.format(len(groups), group))
|
||||
|
||||
# Get 2 groups
|
||||
# FIXME: Sort order
|
||||
group1 = groups[0][0]
|
||||
data1 = df.loc[groups[0][1], data]
|
||||
group2 = groups[1][0]
|
||||
@ -118,7 +119,7 @@ def as_2groups(df, data, group):
|
||||
|
||||
def as_numeric(data):
|
||||
"""
|
||||
Convert the given data to a numeric type, factorising if required
|
||||
Convert the data to a numeric type, factorising if required
|
||||
|
||||
:param data: Data to convert
|
||||
:type data: Series
|
||||
|
Loading…
Reference in New Issue
Block a user