Update documentation

This commit is contained in:
RunasSudo 2022-11-09 17:05:04 +11:00
parent 50597ddc74
commit c6cef4aee7
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
5 changed files with 103 additions and 12 deletions

View File

@ -14,6 +14,8 @@ Most functions take a parameter **nan_policy** to specify how to handle *nan* va
In determining whether there is *nan* in the data, only the columns specified in the function (if applicable) are considered.
.. autofunction:: yli.utils.check_nan
General result classes
----------------------

View File

@ -12,3 +12,4 @@ scipy-yli API reference
distributions.rst
bayes_factors.rst
global.rst
internal.rst

23
docs/internal.rst Normal file
View File

@ -0,0 +1,23 @@
Internal utilities
==================
Data wrangling
--------------
.. autofunction:: yli.utils.as_2groups
.. autofunction:: yli.utils.convert_pandas_nullable
*p* values
----------
.. autofunction:: yli.utils.fmt_p
Formula manipulation
--------------------
.. autofunction:: yli.utils.cols_for_formula
.. autofunction:: yli.utils.formula_factor_ref_category
.. autofunction:: yli.utils.parse_patsy_term

View File

@ -325,9 +325,7 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
:param alternative: See *scipy.stats.mannwhitneyu*
:param method: See *scipy.stats.mannwhitneyu*
:return: The result of the Mann–Whitney test.
The result of a Brunner–Munzel test is included in the result object if and only if *brunnermunzel* is *True*,
*and* the Mann–Whitney test is significant, *and* the Brunner–Munzel test is non-significant.
:return: The result of the Mann–Whitney test. The result of a Brunner–Munzel test is included in the result object if and only if *brunnermunzel* is *True*, *and* the Mann–Whitney test is significant, *and* the Brunner–Munzel test is non-significant.
:rtype: :class:`yli.sig_tests.MannWhitneyResult`

View File

@ -26,7 +26,17 @@ from .config import config
# Data cleaning and validation
def check_nan(df, nan_policy):
"""Check df against nan_policy and return cleaned input"""
"""
Check *df* against *nan_policy* and return cleaned input
:param df: Data to check for NaNs
:type df: DataFrame
:param nan_policy: Policy to apply when encountering NaN values (*warn*, *raise*, *omit*)
:type nan_policy: str
:return: Data with NaNs removed, which may or may not be copied
:rtype: DataFrame
"""
if nan_policy == 'raise':
if pd.isna(df).any(axis=None):
@ -43,7 +53,17 @@ def check_nan(df, nan_policy):
raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"')
def convert_pandas_nullable(df):
"""Convert pandas nullable dtypes (e.g. Int64) to non-nullable numpy dtypes"""
"""
Convert pandas nullable dtypes (e.g. *Int64*) to non-nullable numpy dtypes
Behaviour on encountering *NA* values is undefined, so the data should be passed through :func:`check_nan` first.
:param df: Data to check for pandas nullable dtypes
:type df: DataFrame
:return: Data with pandas nullable dtypes converted, which may or may not be copied
:rtype: DataFrame
"""
# Avoid copy if possible
df_cleaned = None
@ -59,7 +79,19 @@ def convert_pandas_nullable(df):
return df_cleaned
def as_2groups(df, data, group):
"""Group the data by the given variable, ensuring only 2 groups"""
"""
Group the data by the given variable, asserting only 2 groups
:param df: Data to group
:type df: DataFrame
:param group: Column to group by
:type group: str
:return: (*group1*, *data1*, *group2*, *data2*)
* **group1**, **group2** (*str*) – The 2 values of the grouping variable
* **data1**, **data2** (*DataFrame*) – The 2 corresponding subsets of *df*
"""
# Get groupings
groups = list(df.groupby(group).groups.items())
@ -115,9 +147,19 @@ def do_fmt_p(p):
def fmt_p(p, *, html, only_value=False, tabular=False):
"""
Format p value
Format *p* value for display
tabular: If true, output in tabular format of p values where decimal points align
:param p: *p* value to display
:type p: float
:param html: Whether to output as HTML (*True*) or plaintext (*False*)
:type html: bool
:param only_value: Whether to display only the value (*True*, e.g. ``0.04``, ``<0.001``) or the comparison symbol and value (*False*, e.g. ``= 0.04``, ``< 0.001``)
:type only_value: bool
:param tabular: Whether to pad with spaces so that decimal points align
:type tabular: bool
:return: Formatted *p* value
:rtype: str
"""
# FIXME: Make only_value and tabular enums
@ -250,7 +292,17 @@ class Estimate:
# Patsy formula manipulation
def cols_for_formula(formula, df):
"""Return the columns corresponding to the Patsy formula"""
"""
Return the columns corresponding to the Patsy formula
:param formula: Patsy formula to parse
:type formula: str
:param df: Data to apply the formula on
:type df: DataFrame
:return: Columns in (the right-hand side of) the formula
:rtype: List[str]
"""
# Parse the formula
model_desc = patsy.ModelDesc.from_formula(formula)
@ -286,7 +338,17 @@ def formula_get_factor_info(formula, df, factor):
return factor_info
def formula_factor_ref_category(formula, df, factor):
"""Get the reference category for a term in a Patsy formula referring to a categorical factor"""
"""
Get the reference category for a term in a Patsy formula referring to a categorical factor
:param formula: Patsy formula to parse
:type formula: str
:param df: Data to apply the formula on
:type df: DataFrame
:param factor: Factor to determine reference category for (e.g. ``Country``, ``C(Country)``, ``C(Country, Treatment)``, ``C(Country, Treatment("Australia"))``)
:return: Reference category for the specified factor
"""
if '(' in factor and not factor.startswith('C('):
raise Exception('Attempted to get reference category for unknown expression type "{}"'.format(factor))
@ -319,8 +381,13 @@ def parse_patsy_term(formula, df, term):
"""
Parse a Patsy term into its component parts
Returns: factor, column, contrast
e.g. "C(x, Treatment(y))[T.z]" -> "C(x, Treatment(y))", "x", "z"
**Example:** The term ``"C(x, Treatment(y))[T.z]"`` parses to ``("C(x, Treatment(y))", "x", "z")``.
:return: (*factor*, *column*, *contrast*)
* **factor** (*str*) – Name of the factor, as specified in the Patsy formula
* **column** (*str*) – Name of the DataFrame column corresponding to the factor
* **contrast** (*str*) – Name of the contrast for the factor, or *None* if not applicable
"""
if '(' not in term: