From 57189990d61a3de954dce14085ccaa627234aed1 Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Mon, 17 Oct 2022 21:41:19 +1100 Subject: [PATCH] Write Sphinx documentation --- .gitignore | 2 + README.md | 2 +- docs/Makefile | 20 +++++ docs/bayes_factors.rst | 13 +++ docs/conf.py | 30 +++++++ docs/distributions.rst | 16 ++++ docs/general.rst | 24 ++++++ docs/global.rst | 12 +++ docs/index.rst | 14 ++++ docs/io.rst | 10 +++ docs/make.bat | 35 ++++++++ docs/regress.rst | 28 +++++++ docs/sig_tests.rst | 36 +++++++++ yli/bayes_factors.py | 29 +++++-- yli/config.py | 12 +-- yli/distributions.py | 86 +++++++++++++++----- yli/io.py | 36 +++++++-- yli/regress.py | 145 +++++++++++++++++++++++++++------ yli/sig_tests.py | 179 +++++++++++++++++++++++++++++++++++++---- yli/utils.py | 18 +++++ 20 files changed, 670 insertions(+), 77 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/bayes_factors.rst create mode 100644 docs/conf.py create mode 100644 docs/distributions.rst create mode 100644 docs/general.rst create mode 100644 docs/global.rst create mode 100644 docs/index.rst create mode 100644 docs/io.rst create mode 100644 docs/make.bat create mode 100644 docs/regress.rst create mode 100644 docs/sig_tests.rst diff --git a/.gitignore b/.gitignore index 8d35cb3..3e21f69 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ __pycache__ *.pyc + +/docs/_build diff --git a/README.md b/README.md index 1873293..2ea7597 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Relevant statistical functions are all directly available from the top-level *yl * Bayesian inference: * *bayesfactor_afbf*: Adjusted fractional Bayes factor for a hypothesis on parameters from regression -Each function is documented in the respective docstring within the source code. Examples can be found in the unit tests in the *tests* directory. +Each function is documented in the respective docstring within the source code, and Sphinx documentation is buildable from the *docs* directory. Examples can be found in the unit tests in the *tests* directory. ## Warning diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/bayes_factors.rst b/docs/bayes_factors.rst new file mode 100644 index 0000000..3fa0ccd --- /dev/null +++ b/docs/bayes_factors.rst @@ -0,0 +1,13 @@ +Bayesian inference +================== + +Functions +--------- + +.. autofunction:: yli.bayesfactor_afbf + +Result classes +-------------- + +.. autoclass:: yli.bayes_factors.BayesFactor + :members: diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..8c900b7 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,30 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import sys, os +sys.path.append(os.path.abspath('..')) + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'scipy-yli' +copyright = '2022, RunasSudo' +author = 'RunasSudo' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ['sphinx.ext.autodoc'] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'sphinx_rtd_theme' +html_static_path = ['_static'] diff --git a/docs/distributions.rst b/docs/distributions.rst new file mode 100644 index 0000000..0f28cc8 --- /dev/null +++ b/docs/distributions.rst @@ -0,0 +1,16 @@ +Distributions +============= + +.. autodata:: yli.beta_oddsratio + + .. automethod:: yli.distributions.betaoddsrat_gen.from_scipy + + .. automethod:: yli.distributions.betaoddsrat_gen.set_cdf_terms + +.. autodata:: yli.beta_ratio + + .. automethod:: yli.distributions.betarat_gen.from_scipy + +.. autodata:: yli.transformed_dist + +.. autofunction:: yli.hdi diff --git a/docs/general.rst b/docs/general.rst new file mode 100644 index 0000000..a40a103 --- /dev/null +++ b/docs/general.rst @@ -0,0 +1,24 @@ +General specifications +====================== + +.. _nan-handling: + +NaN handling +------------ + +Most functions take a parameter **nan_policy** to specify how to handle *nan* values in the data. The options are: + +* **warn** (default) – Warn on *nan* values +* **raise** – Raise an error on *nan* values +* **omit** – Silently drop rows with *nan* values + +In determining whether there is *nan* in the data, only the columns specified in the function (if applicable) are considered. + +General result classes +---------------------- + +.. autoclass:: yli.utils.ConfidenceInterval + :members: + +.. autoclass:: yli.utils.Estimate + :members: diff --git a/docs/global.rst b/docs/global.rst new file mode 100644 index 0000000..d03331a --- /dev/null +++ b/docs/global.rst @@ -0,0 +1,12 @@ +Global options +============== + +.. autodata:: yli.config + + .. autoattribute:: yli.config.Config.alpha + + .. autoattribute:: yli.config.Config.pvalue_leading_zero + + .. autoattribute:: yli.config.Config.pvalue_max_dps + + .. autoattribute:: yli.config.Config.pvalue_min_dps diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..11d7987 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,14 @@ +scipy-yli API reference +======================= + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + general.rst + sig_tests.rst + regress.rst + io.rst + distributions.rst + bayes_factors.rst + global.rst diff --git a/docs/io.rst b/docs/io.rst new file mode 100644 index 0000000..88410ab --- /dev/null +++ b/docs/io.rst @@ -0,0 +1,10 @@ +Input/output +============ + +.. autofunction:: yli.pickle_read_compressed + +.. autofunction:: yli.pickle_read_encrypted + +.. autofunction:: yli.pickle_write_compressed + +.. autofunction:: yli.pickle_write_encrypted diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/regress.rst b/docs/regress.rst new file mode 100644 index 0000000..2648c74 --- /dev/null +++ b/docs/regress.rst @@ -0,0 +1,28 @@ +Regression +========== + +Functions +--------- + +.. autofunction:: yli.logit_then_regress + +.. autoclass:: yli.PenalisedLogit + +.. autofunction:: yli.regress + +.. autofunction:: yli.vif + +Result classes +-------------- + +.. autoclass:: yli.regress.CategoricalTerm + :members: + +.. autoclass:: yli.regress.LikelihoodRatioTestResult + :members: + +.. autoclass:: yli.regress.RegressionResult + :members: + +.. autoclass:: yli.regress.SingleTerm + :members: diff --git a/docs/sig_tests.rst b/docs/sig_tests.rst new file mode 100644 index 0000000..c5993da --- /dev/null +++ b/docs/sig_tests.rst @@ -0,0 +1,36 @@ +Significance tests +================== + +Functions +--------- + +.. autofunction:: yli.anova_oneway + +.. autofunction:: yli.chi2 + +.. autofunction:: yli.mannwhitney + +.. autofunction:: yli.pearsonr + +.. autofunction:: yli.ttest_ind + +Result classes +-------------- + +.. autoclass:: yli.sig_tests.BrunnerMunzelResult + :members: + +.. autoclass:: yli.sig_tests.FTestResult + :members: + +.. autoclass:: yli.sig_tests.MannWhitneyResult + :members: + +.. autoclass:: yli.sig_tests.PearsonChiSquaredResult + :members: + +.. autoclass:: yli.sig_tests.PearsonRResult + :members: + +.. autoclass:: yli.sig_tests.TTestResult + :members: diff --git a/yli/bayes_factors.py b/yli/bayes_factors.py index ed3cfa4..8b2851f 100644 --- a/yli/bayes_factors.py +++ b/yli/bayes_factors.py @@ -18,26 +18,45 @@ class BayesFactor: """A Bayes factor""" def __init__(self, factor, num_symbol, num_desc, denom_symbol, denom_desc): + #: Value of the Bayes factor (*float*) self.factor = factor + #: Symbol representing the hypothesis in the numerator (*str*) self.num_symbol = num_symbol + #: Description of the hypothesis in the numerator (*str*) self.num_desc = num_desc + #: Symbol representing the hypothesis in the denominator (*str*) self.denom_symbol = denom_symbol + #: Description of the hypothesis in the denominator (*str*) self.denom_desc = denom_desc def _repr_html_(self): return 'BF{0}{1} = {2:.2f}, {5}
H{0}: {3}
H{1}: {4}'.format(self.num_symbol, self.denom_symbol, self.factor, self.num_desc, self.denom_desc, self.interpret_lw(html=True)) def summary(self): + """ + Return a stringified summary of the Bayes factor + + :rtype: str + """ + return 'BF{0}{1} = {2:.2f}, {5}\nH{0}: {3}\nH{1}: {4}'.format(self.num_symbol, self.denom_symbol, self.factor, self.num_desc, self.denom_desc, self.interpret_lw(html=False)) def invert(self): - """Invert the Bayes factor""" + """ + Invert the Bayes factor + + :rtype: :class:`BayesFactor` + """ return BayesFactor(1 / self.factor, self.denom_symbol, self.denom_desc, self.num_symbol, self.num_desc) def interpret_lw(self, html): - """Interpret the Bayes factor according to the Lee & Wagenmakers classification scheme""" + """ + Interpret the Bayes factor according to the Lee & Wagenmakers classification scheme + + :meta private: + """ if self.factor == 1: return 'Evidence favours neither hypothesis to the other' @@ -64,11 +83,11 @@ class BayesFactor: def bayesfactor_afbf(params, cov, n, hypothesis): """ Compute an adjusted fractional Bayes factor for the hypothesis - Using R "BFpack" library - See R documentation for BFpack.BF + Uses R *BFpack* library. See R documentation for *BFpack.BF*. - Returns Bayes factor for hypothesis vs its complement + :return: Bayes factor for the hypothesis versus its complement + :rtype: :class:`yli.bayes_factors.BayesFactor` """ import rpy2.robjects as ro diff --git a/yli/config.py b/yli/config.py index 33e1ba1..6741f3a 100644 --- a/yli/config.py +++ b/yli/config.py @@ -15,18 +15,18 @@ # along with this program. If not, see . class Config: - """Stores global configuration for the library""" + """Global configuration for the library""" def __init__(self): - # Display at least this many decimal places for p values + #: Display at least this many decimal places for *p* values (*int*) self.pvalue_min_dps = 2 - # Display at most this many decimal places for p values + #: Display at most this many decimal places for *p* values (*int*) self.pvalue_max_dps = 3 - # Display a leading zero for p values + #: Display a leading zero for *p* values (*bool*) self.pvalue_leading_zero = True - # Alpha level for significance tests, confidence intervals + #: Alpha level for significance tests, confidence intervals (*float*) self.alpha = 0.05 -"""Global config singleton""" +"""Global configuration singleton""" config = Config() diff --git a/yli/distributions.py b/yli/distributions.py index b33dd20..cab6f0e 100644 --- a/yli/distributions.py +++ b/yli/distributions.py @@ -23,17 +23,31 @@ from .utils import ConfidenceInterval class betarat_gen(stats.rv_continuous): """ Ratio of 2 independent beta-distributed variables - Ratio of Beta(a1, b1) / Beta(a2, b2) - References: - 1. Pham-Gia T. Distributions of the ratios of independent beta variables and applications. Communications in Statistics: Theory and Methods. 2000;29(12):2693–715. doi: 10.1080/03610920008832632 - 2. Weekend Editor. On the ratio of Beta-distributed random variables. Some Weekend Reading. 2021 Sep 13. https://www.someweekendreading.blog/beta-ratios/ + This is a SciPy *stats.rv_continuous* distribution which takes 4 parameters, *a1*, *b1*, *a2* and *b2*, + and gives the distribution of Beta(*a1*, *b1*) / Beta(*a2*, *b2*). + + **References:** + + 1. Pham-Gia T. Distributions of the ratios of independent beta variables and applications. *Communications in Statistics: Theory and Methods*. 2000;29(12):2693–715. `doi: 10.1080/03610920008832632 `_ + + 2. Weekend Editor. On the ratio of Beta-distributed random variables. *Some Weekend Reading*. 2021 Sep 13. https://www.someweekendreading.blog/beta-ratios/ """ # TODO: _rvs based on stats.beta def from_scipy(self, beta1, beta2): - """Construct a new beta_ratio distribution from two SciPy beta distributions""" + """ + Construct a new *beta_ratio* distribution from two SciPy beta distributions + + :param beta1: Distribution for the numerator of the ratio + :type beta1: frozen SciPy stats.beta + :param beta2: Distribution for the denominator of the ratio + :type beta2: frozen SciPy stats.beta + + :rtype: Frozen *beta_ratio* distribution + """ + return self(*beta1.args, *beta2.args) def _do_vectorized(self, func, x, a1, b1, a2, b2): @@ -104,20 +118,38 @@ beta_ratio = betarat_gen(name='beta_ratio', a=0) # a = lower bound of support class betaoddsrat_gen(stats.rv_continuous): """ Ratio of the odds of 2 independent beta-distributed variables - Ratio of (X/(1-X)) / (Y/(1-Y)), where X ~ Beta(a1, b1), Y ~ Beta(a2, b2) - Reference: - Hora SC, Kelley GD. Bayesian inference on the odds and risk ratios. Communications in Statistics: Theory and Methods. 1983;12(6):725–38. doi: 10.1080/03610928308828491 + This is a SciPy *stats.rv_continuous* distribution which takes 4 parameters, *a1*, *b1*, *a2* and *b2*, + and gives the distribution of (*X*/(1-*X*)) / (*Y*/(1-*Y*)), where *X* ~ Beta(*a1*, *b1*), and *Y* ~ Beta(*a2*, *b2*). + + **Reference:** Hora SC, Kelley GD. Bayesian inference on the odds and risk ratios. *Communications in Statistics: Theory and Methods*. 1983;12(6):725–38. `doi: 10.1080/03610928308828491 `_ """ # TODO: _rvs based on stats.beta def from_scipy(self, beta1, beta2): - """Construct a new beta_ratio distribution from two SciPy beta distributions""" + """ + Construct a new *beta_oddsratio* distribution from two SciPy beta distributions + + :param beta1: Distribution for the numerator of the ratio + :type beta1: frozen SciPy stats.beta + :param beta2: Distribution for the denominator of the ratio + :type beta2: frozen SciPy stats.beta + + :rtype: Frozen *beta_oddsratio* distribution + """ + return self(*beta1.args, *beta2.args) def set_cdf_terms(self, cdf_terms): - """Set the number of terms to use when calculating CDF (see _cdf)""" + """ + Set the number of terms to use when calculating the CDF + + If *cdf_terms* = *np.inf* (default), the CDF will be computed by numerical integration of the PDF. + + Otherwise, the CDF will be computed by truncating the infinite sum given by Hora & Kelley to the specified number of terms. + """ + BETA_ODDSRATIO_CDF_TERMS[0] = cdf_terms def _do_vectorized(self, func, x, a1, b1, a2, b2): @@ -179,12 +211,7 @@ class betaoddsrat_gen(stats.rv_continuous): return 1 - k * inf_sum def _cdf(self, w, a1, b1, a2, b2): - """ - CDF for the distribution - - If BETA_ODDSRATIO_CDF_TERMS = np.inf, compute the CDF by integrating the PDF - Otherwise, compute the CDF by truncating the infinite sum given by Hora & Kelley - """ + """CDF for the distribution""" w = np.atleast_1d(w) a1 = np.atleast_1d(a1) @@ -259,10 +286,24 @@ beta_oddsratio = betaoddsrat_gen(name='beta_oddsratio', a=0) # a = lower bound class transformed_gen(stats.rv_continuous): """ - Represents a transformation, Y, of a "base" random variable, X, where f(Y) = X - Hence Y.pdf(x) = X.pdf(f(x)) * f'(x) + Represents a transformation, *Y*, of a "base" random variable, *X* - e.g. if X is a model parameter, then transformed_dist(X, f=np.exp, fprime=np.exp, finv=np.log) is distribution of the log-parameter + This is a SciPy *stats.rv_continuous* distribution which takes parameters as described below. + + The transformation is *f*\ (*Y*) = *X*. + Hence *Y*.pdf\ (*x*) = *X*.pdf\ (*f*\ (*x*)) \* *f*'\ (*x*). + + For example, if *X* is a model parameter, then ``transformed_dist(X, f=np.exp, fprime=np.exp, finv=np.log)`` is the distribution of the log-parameter. + + **Parameters:** + + * **base** (*frozen SciPy distribution*) – Distribution of the base random variable + + * **f** (*callable*) – Function *f* representing the transformation, which takes values of *Y* to values of *X* + + * **fprime** (*callable*) – Derivative of the function *f* + + * **finv** (*callable*) – Inverse of the function *f*, which takes values of *X* to values of *Y* """ def _pdf(self, x, base, f, fprime, finv): @@ -297,7 +338,12 @@ def hdi(distribution, level=None): """ Get the highest density interval for the distribution, e.g. for a Bayesian posterior, the highest posterior density interval (HPD/HDI) - level: Coverage/confidence probability, default (None) is 1 - config.alpha + :param distribution: Distribution to compute the interval for + :type distribution: frozen SciPy distribution + :param level: Coverage/confidence probability, default (*None*) is 1 − *config.alpha* + :type level: float + + :rtype: :class:`yli.utils.ConfidenceInterval` """ if level is None: diff --git a/yli/io.py b/yli/io.py index 64d6ad7..fe50714 100644 --- a/yli/io.py +++ b/yli/io.py @@ -23,9 +23,16 @@ def pickle_write_encrypted(df, fname): """ Write the DataFrame to an encrypted file - DataFrame is serialised with Pickle, compressed with LZMA, then encrypted - Encryption is AES-256-CTR - Password is derived using scrypt KDF + Prompts the user for a password. + + The DataFrame is serialised with *pickle*, compressed with LZMA, then encrypted. + Encryption uses AES-256-CTR. + Encryption key is derived from the password using the scrypt KDF. + + :param df: Data to serialise + :type df: DataFrame + :param fname: Filename to write data to + :type fname: str """ from Crypto.Cipher import AES @@ -56,7 +63,12 @@ def pickle_write_compressed(df, fname): """ Write the DataFrame to a compressed file - DataFrame is serialised with Pickle and compressed with LZMA + The DataFrame is serialised with *pickle*, then compressed with LZMA. + + :param df: Data to serialise + :type df: DataFrame + :param fname: Filename to write data to + :type fname: str """ # Serialise Pandas and compress @@ -67,6 +79,13 @@ def pickle_write_compressed(df, fname): def pickle_read_encrypted(fname): """ Read a DataFrame from an encrypted file + + See :meth:`yli.pickle_write_encrypted`. + + :param fname: Filename to read data from + :type fname: str + + :rtype: DataFrame """ from Crypto.Cipher import AES @@ -91,7 +110,14 @@ def pickle_read_encrypted(fname): def pickle_read_compressed(fname): """ - Read a DataFrame from an compressed (but not encrypted) file + Read a DataFrame from a compressed file + + See :meth:`yli.pickle_write_compressed`. + + :param fname: Filename to read data from + :type fname: str + + :rtype: DataFrame """ # Uncompress and deserialise to Pandas diff --git a/yli/regress.py b/yli/regress.py index 3b7db67..f6e4382 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -30,11 +30,17 @@ from .config import config from .sig_tests import FTestResult from .utils import Estimate, check_nan, cols_for_formula, fmt_p, formula_factor_ref_category, parse_patsy_term -def vif(df, formula=None, nan_policy='warn'): +def vif(df, formula=None, *, nan_policy='warn'): """ - Calculate the variance inflation factor for each variable in df + Calculate the variance inflation factor (VIF) for each variable in *df* - formula: If specified, calculate the VIF only for the variables in the formula + :param df: Data to calculate the VIF for + :type df: DataFrame + :param formula: If specified, calculate the VIF only for the variables in the formula + :type formula: str + + :return: The variance inflation factor + :rtype: float """ if formula: @@ -63,24 +69,37 @@ def vif(df, formula=None, nan_policy='warn'): # Regression class LikelihoodRatioTestResult: - """Result of a likelihood ratio test for regression""" + """ + Result of a likelihood ratio test for regression + + See :meth:`RegressionResult.lrtest_null`. + """ def __init__(self, statistic, dof, pvalue): + #: Likelihood ratio test statistic (*float*) self.statistic = statistic + #: Degrees of freedom for the likelihood ratio test statistic (*int*) self.dof = dof + #: *p* value for the likelihood ratio test (*float*) self.pvalue = pvalue def _repr_html_(self): return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=True)) def summary(self): + """ + Return a stringified summary of the likelihood ratio test + + :rtype: str + """ + return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False)) class RegressionResult: """ Result of a regression - llf/llnull: Log-likelihood of model/null model + See :func:`yli.regress`. """ def __init__(self, @@ -92,44 +111,63 @@ class RegressionResult: dof_resid, rsquared, f_statistic, exp ): - # A copy of the raw result so we can access it + #: Raw result from statsmodels *model.fit* self.raw_result = raw_result # Information for display + #: Full name of the regression model type (*str*) self.full_name = full_name + #: Short name of the regression model type (*str*) self.model_name = model_name + #: Method for fitting the regression model (*str*) self.fit_method = fit_method # Basic fitted model information + #: Name of the dependent variable (*str*) self.dep = dep + #: Number of observations (*int*) self.nobs = nobs + #: Degrees of freedom for the model (*int*) self.dof_model = dof_model + #: Date and time of fitting the model (Python *datetime*) self.fitted_dt = fitted_dt + #: Method for computing the covariance matrix (*str*) self.cov_type = cov_type # Regression coefficients/p values + #: Coefficients and *p* values for each term in the model (*dict* of :class:`SingleTerm` or :class:`CategoricalTerm`) self.terms = terms # Model log-likelihood + #: Log-likelihood of fitted model (*float*) self.llf = llf + #: Log-likelihood of null model (*float*) self.llnull = llnull # Extra statistics (not all regression models have these) + #: Degrees of freedom for the residuals (*int*; *None* if N/A) self.dof_resid = dof_resid + #: *R*:sup:`2` statistic (*float*; *None* if N/A) self.rsquared = rsquared + #: *F* statistic (*float*; *None* if N/A) self.f_statistic = f_statistic # Config for display style + #: See :func:`yli.regress` self.exp = exp @property def pseudo_rsquared(self): - """McFadden's pseudo R-squared""" + """McFadden's pseudo *R*:sup:`2` statistic""" return 1 - self.llf/self.llnull def lrtest_null(self): - """Compute the likelihood ratio test comparing the model to the null model""" + """ + Compute the likelihood ratio test comparing the model to the null model + + :rtype: :class:`LikelihoodRatioTestResult` + """ statistic = -2 * (self.llnull - self.llf) pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model) @@ -137,17 +175,28 @@ class RegressionResult: return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue) def ftest(self): - """Perform the F test that all slopes are 0""" + """ + Perform the *F* test that all slopes are 0 + + :rtype: :class:`yli.sig_tests.FTestResult` + """ pvalue = 1 - stats.f(self.dof_model, self.dof_resid).cdf(self.f_statistic) return FTestResult(self.f_statistic, self.dof_model, self.dof_resid, pvalue) def bayesfactor_beta_zero(self, term): """ - Compute Bayes factor testing the hypothesis that the given beta is 0 - Requires statsmodels regression + Compute a Bayes factor testing the hypothesis that the given beta is 0 - term: Raw name (from statsmodels, available via RegressionResult.raw_result) of term to be tested + Uses the R *BFpack* library. + + Requires the regression to be from statsmodels. + The term must be specified as the *raw name* from the statsmodels regression, available via :attr:`RegressionResult.raw_result`. + + :param term: Raw name of the term to be tested + :type term: str + + :rtype: :class:`yli.bayes_factors.BayesFactor` """ # FIXME: Allow specifying our renamed terms @@ -264,6 +313,12 @@ class RegressionResult: return out def summary(self): + """ + Return a stringified summary of the regression model + + :rtype: str + """ + # Render header table left_col, right_col = self._header_table(html=False) @@ -325,24 +380,23 @@ class RegressionResult: return out class SingleTerm: - """ - A term in a RegressionResult which is a single term - - raw_name: The raw name of the term (e.g. in RegressionResult.raw_result data) - beta: An Estimate of the coefficient - pvalue: The p value - """ + """A term in a :class:`RegressionResult` which is a single term""" def __init__(self, raw_name, beta, pvalue): + #: Raw name of the term (*str*; e.g. in :attr:`RegressionResult.raw_result`) self.raw_name = raw_name + #: :class:`yli.utils.Estimate` of the coefficient self.beta = beta + #: *p* value for the coefficient (*float*) self.pvalue = pvalue class CategoricalTerm: - """A group of terms in a RegressionResult corresponding to a categorical variable""" + """A group of terms in a :class:`RegressionResult` corresponding to a categorical variable""" def __init__(self, categories, ref_category): + #: Terms for each of the categories, excluding the reference category (*dict* of :class:`SingleTerm`) self.categories = categories + #: Name of the reference category (*str*) self.ref_category = ref_category def regress( @@ -356,8 +410,30 @@ def regress( """ Fit a statsmodels regression model - bool_baselevels: Show reference categories for boolean independent variables even if reference category is False - exp: Report exponentiated parameters rather than raw parameters + :param model_class: Type of regression model to fit + :type model_class: statsmodels model class + :param df: Data to perform regression on + :type df: DataFrame + :param dep: Column in *df* for the dependent variable (numeric) + :type dep: str + :param formula: Patsy formula for the regression model + :type formula: str + :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) + :type nan_policy: str + :param model_kwargs: Keyword arguments to pass to *model_class* constructor + :type model_kwargs: dict + :param fit_kwargs: Keyword arguments to pass to *model.fit* + :type fit_kwargs: dict + :param family: See statsmodels *GLM* constructor + :param cov_type: See statsmodels *model.fit* + :param maxiter: See statsmodels *model.fit* + :param start_params: See statsmodels *model.fit* + :param bool_baselevels: Show reference categories for boolean independent variables even if reference category is *False* + :type bool_baselevels: bool + :param exp: Report exponentiated parameters rather than raw parameters, default (*None*) is to autodetect based on *model_class* + :type exp: bool + + :rtype: :class:`yli.regress.RegressionResult` """ # Populate model_kwargs @@ -473,7 +549,23 @@ def regress( ) def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs): - """Perform logistic regression, then use parameters as start parameters for desired regression""" + """ + Perform logistic regression, then use parameters as start parameters for desired regression + + :param model_class: Type of regression model to fit + :type model_class: statsmodels model class + :param df: Data to perform regression on + :type df: DataFrame + :param dep: Column in *df* for the dependent variable (numeric) + :type dep: str + :param formula: Patsy formula for the regression model + :type formula: str + :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) + :type nan_policy: str + :param kwargs: Passed through to :func:`yli.regress` + + :rtype: :class:`yli.regress.RegressionResult` + """ # Check for/clean NaNs # Do this once here so we only get 1 warning @@ -496,10 +588,11 @@ def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kw class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel): """ - Statsmodel-compatible model for computing Firth penalised logistic regression - Uses R "logistf" library + statsmodel-compatible model for computing Firth penalised logistic regression - NB: This class expects to be used in the context of yli.regress() + Uses the R *logistf* library. + + This class should be used in conjunction with :func:`yli.regress`. """ def fit(self, disp=False): diff --git a/yli/sig_tests.py b/yli/sig_tests.py index 368cc2c..1f3e917 100644 --- a/yli/sig_tests.py +++ b/yli/sig_tests.py @@ -30,26 +30,50 @@ from .utils import Estimate, as_2groups, check_nan, fmt_p class TTestResult: """ - Result of a Student's t test + Result of a Student's *t* test - delta: Mean difference + See :func:`yli.ttest_ind`. """ def __init__(self, statistic, dof, pvalue, delta, delta_direction): + #: *t* statistic (*float*) self.statistic = statistic + #: Degrees of freedom of the *t* distribution (*int*) self.dof = dof + #: *p* value for the *t* statistic (*float*) self.pvalue = pvalue + #: Absolute value of the mean difference (:class:`yli.utils.Estimate`) self.delta = delta + #: Description of the direction of the effect (*str*) self.delta_direction = delta_direction def _repr_html_(self): return 't({:.0f}) = {:.2f}; p {}
Δμ ({:g}% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=True), (1-config.alpha)*100, self.delta.summary(), self.delta_direction) def summary(self): + """ + Return a stringified summary of the *t* test + + :rtype: str + """ + return 't({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.delta.summary(), self.delta_direction) def ttest_ind(df, dep, ind, *, nan_policy='warn'): - """Perform an independent-sample Student's t test""" + """ + Perform an independent 2-sample Student's *t* test + + :param df: Data to perform the test on + :type df: DataFrame + :param dep: Column in *df* for the dependent variable (numeric) + :type dep: str + :param ind: Column in *df* for the independent variable (dichotomous) + :type ind: str + :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) + :type nan_policy: str + + :rtype: :class:`yli.sig_tests.TTestResult` + """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) @@ -78,22 +102,49 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'): # One-way ANOVA class FTestResult: - """Result of an F test for ANOVA/regression""" + """ + Result of an *F* test for ANOVA/regression + + See :func:`yli.anova_oneway` and :meth:`yli.regress.RegressionResult.ftest`. + """ def __init__(self, statistic, dof1, dof2, pvalue): + #: *F* statistic (*float*) self.statistic = statistic + #: Degrees of freedom in the *F* distribution numerator (*int*) self.dof1 = dof1 + #: Degrees of freedom in the *F* distribution denominator (*int*) self.dof2 = dof2 + #: *p* value for the *F* statistic (*float*) self.pvalue = pvalue def _repr_html_(self): return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True)) def summary(self): + """ + Return a stringified summary of the *F* test + + :rtype: str + """ + return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False)) def anova_oneway(df, dep, ind, *, nan_policy='omit'): - """Perform one-way ANOVA""" + """ + Perform one-way ANOVA + + :param df: Data to perform the test on + :type df: DataFrame + :param dep: Column in *df* for the dependent variable (numeric) + :type dep: str + :param ind: Column in *df* for the independent variable (categorical) + :type ind: str + :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) + :type nan_policy: str + + :rtype: :class:`yli.sig_tests.FTestResult` + """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) @@ -115,16 +166,21 @@ def anova_oneway(df, dep, ind, *, nan_policy='omit'): class MannWhitneyResult: """ - Result of a Mann-Whitney test + Result of a Mann-Whitney *U* test - brunnermunzel: BrunnerMunzelResult on same data + See :func:`yli.mannwhitney`. """ def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None): + #: Mann–Whitney *U* statistic (*float*) self.statistic = statistic + #: *p* value for the *U* statistic (*float*) self.pvalue = pvalue + #: Absolute value of the rank-biserial correlation (*float*) self.rank_biserial = rank_biserial + #: Description of the direction of the effect (*str*) self.direction = direction + #: :class:`BrunnerMunzelResult` on the same data, or *None* if N/A self.brunnermunzel = brunnermunzel def _repr_html_(self): @@ -135,6 +191,12 @@ class MannWhitneyResult: return line1 def summary(self): + """ + Return a stringified summary of the Mann–Whitney test + + :rtype: str + """ + line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '\n' + self.brunnermunzel.summary() @@ -142,24 +204,55 @@ class MannWhitneyResult: return line1 class BrunnerMunzelResult: + """ + Result of a Brunner–Munzel test + + See :func:`yli.mannwhitney`. This library calls the Brunner–Munzel test statistic *W*. + """ + """Result of a Brunner-Munzel test""" def __init__(self, statistic, pvalue): + #: *W* statistic (*float*) self.statistic = statistic + #: *p* value for the *W* statistic (*float*) self.pvalue = pvalue def _repr_html_(self): return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, html=True)) def summary(self): + """ + Return a stringified summary of the Brunner–Munzel test + + :rtype: str + """ + return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, html=False)) def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'): """ - Perform a Mann-Whitney test + Perform a Mann-Whitney *U* test - brunnermunzel: Set to False to skip the Brunner-Munzel test - use_continuity, alternative, method: See scipy.stats.mannwhitneyu + By default, this function performs a Brunner–Munzel test if the Mann–Whitney test is significant. + If the Mann–Whitney test is significant but the Brunner–Munzel test is not, a warning is raised. + The Brunner–Munzel test is returned only if non-significant. + + :param df: Data to perform the test on + :type df: DataFrame + :param dep: Column in *df* for the dependent variable (numeric) + :type dep: str + :param ind: Column in *df* for the independent variable (dichotomous) + :type ind: str + :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) + :type nan_policy: str + :param brunnermunzel: Whether to compute the Brunner–Munzel test if the Mann–Whitney test is significant + :type brunnermunzel: bool + :param use_continuity: See *scipy.stats.mannwhitneyu* + :param alternative: See *scipy.stats.mannwhitneyu* + :param method: See *scipy.stats.mannwhitneyu* + + :rtype: :class:`yli.sig_tests.MannWhitneyResult` """ # Check for/clean NaNs @@ -198,14 +291,24 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont # Pearson chi-squared test class PearsonChiSquaredResult: - """Result of a Pearson chi-squared test""" + """ + Result of a Pearson *χ*:sup:`2` test + + See :func:`yli.chi2`. + """ def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None): + #: Contingency table for the observations (*DataFrame*) self.ct = ct + #: *χ*:sup:`2` statistic (*float*) self.statistic = statistic + #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*) self.dof = dof + #: *p* value for the *χ*:sup:`2` test (*float*) self.pvalue = pvalue + #: Odds ratio (*float*; *None* if not a 2×2 table) self.oddsratio = oddsratio + #: Risk ratio (*float*; *None* if not a 2×2 table) self.riskratio = riskratio def _repr_html_(self): @@ -217,6 +320,12 @@ class PearsonChiSquaredResult: self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, html=True)) def summary(self): + """ + Return a stringified summary of the *χ*:sup:`2` test + + :rtype: str + """ + if self.oddsratio is not None: return '{0}\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) @@ -225,7 +334,24 @@ class PearsonChiSquaredResult: self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False)) def chi2(df, dep, ind, *, nan_policy='warn'): - """Perform a Pearson chi-squared test""" + """ + Perform a Pearson *χ*:sup:`2` test + + If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated. + The ratios are calculated for the higher-valued value in each variable (i.e. ``True`` compared with ``False`` for a boolean). + The risk ratio is calculated relative to the independent variable. + + :param df: Data to perform the test on + :type df: DataFrame + :param dep: Column in *df* for the dependent variable (categorical) + :type dep: str + :param ind: Column in *df* for the independent variable (categorical) + :type ind: str + :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) + :type nan_policy: str + + :rtype: :class:`yli.sig_tests.PearsonChiSquaredResult` + """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) @@ -266,20 +392,45 @@ def chi2(df, dep, ind, *, nan_policy='warn'): # Pearson correlation class PearsonRResult: - """Result of Pearson correlation""" + """ + Result of Pearson correlation + + See :func:`yli.pearsonr`. + """ def __init__(self, statistic, pvalue): + #: Pearson *r* correlation statistic (*float*) self.statistic = statistic + #: *p* value for the *r* statistic (*float*) self.pvalue = pvalue def _repr_html_(self): return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, html=True)) def summary(self): + """ + Return a stringified summary of the Pearson correlation + + :rtype: str + """ + return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, html=False)) def pearsonr(df, dep, ind, *, nan_policy='warn'): - """Compute the Pearson correlation coefficient (Pearson's r)""" + """ + Compute the Pearson correlation coefficient (Pearson's r) + + :param df: Data to perform the test on + :type df: DataFrame + :param dep: Column in *df* for the dependent variable (numerical) + :type dep: str + :param ind: Column in *df* for the independent variable (numerical) + :type ind: str + :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) + :type nan_policy: str + + :rtype: :class:`yli.sig_tests.PearsonRResult` + """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) diff --git a/yli/utils.py b/yli/utils.py index d8a1043..c58dd9d 100644 --- a/yli/utils.py +++ b/yli/utils.py @@ -143,26 +143,44 @@ class ConfidenceInterval: """A confidence interval""" def __init__(self, lower, upper): + #: Lower confidence limit (*float*) self.lower = lower + #: Upper confidence limit (*float*) self.upper = upper def _repr_html_(self): return self.summary() def summary(self): + """ + Return a stringified summary of the confidence interval + + :rtype: str + """ + return '{:.2f}–{:.2f}'.format(self.lower, self.upper) + class Estimate: """A point estimate and surrounding confidence interval""" def __init__(self, point, ci_lower, ci_upper): + #: Point estimate (*float*) self.point = point + #: Lower confidence limit (*float*) self.ci_lower = ci_lower + #: Upper confidence limit (*float*) self.ci_upper = ci_upper def _repr_html_(self): return self.summary() def summary(self): + """ + Return a stringified summary of the estimate and confidence interval + + :rtype: str + """ + return '{:.2f} ({:.2f}–{:.2f})'.format(self.point, self.ci_lower, self.ci_upper) def __neg__(self):