Automatically factorise pandas categorical when required for regression

This commit is contained in:
RunasSudo 2022-12-02 21:42:41 +11:00
parent 04643d312c
commit 2135796d85
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
2 changed files with 31 additions and 8 deletions

View File

@ -33,7 +33,7 @@ import weakref
from .bayes_factors import BayesFactor, bayesfactor_afbf
from .config import config
from .sig_tests import ChiSquaredResult, FTestResult
from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
def vif(df, formula=None, *, nan_policy='warn'):
"""
@ -270,8 +270,7 @@ class RegressionResult:
df = check_nan(df, 'omit')
# Ensure numeric type for dependent variable
if df[dep].dtype != 'float64':
df[dep] = df[dep].astype('float64')
df[dep], dep_categories = as_numeric(df[dep])
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable(df)
@ -373,8 +372,7 @@ class RegressionResult:
df = check_nan(df, 'omit')
# Ensure numeric type for dependent variable
if df[dep].dtype != 'float64':
df[dep] = df[dep].astype('float64')
df[dep], dep_categories = as_numeric(df[dep])
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable(df)
@ -748,8 +746,7 @@ def regress(
df = check_nan(df, nan_policy)
# Ensure numeric type for dependent variable
if df[dep].dtype != 'float64':
df[dep] = df[dep].astype('float64')
df[dep], dep_categories = as_numeric(df[dep])
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable(df)
@ -806,7 +803,15 @@ def regress(
# Group ordinal regression cutoffs
if '(Cutoffs)' not in terms:
terms['(Cutoffs)'] = CategoricalTerm({}, None)
terms['(Cutoffs)'].categories[raw_name] = SingleTerm(raw_name, beta, pvalues[raw_name])
if dep_categories is None:
term = raw_name
else:
# Need to convert factorised names back into original names
bits = raw_name.split('/')
term = dep_categories[round(float(bits[0]))] + '/' + dep_categories[round(float(bits[1]))]
terms['(Cutoffs)'].categories[term] = SingleTerm(raw_name, beta, pvalues[raw_name])
else:
# Parse if required
factor, column, contrast = parse_patsy_term(formula, df, raw_name)

View File

@ -116,6 +116,24 @@ def as_2groups(df, data, group):
return group1, data1, group2, data2
def as_numeric(data):
    """
    Convert the given data to a numeric type, factorising if required

    A categorical Series whose categories are strings (plain *object* dtype, or a
    pandas string dtype) is factorised into integer codes. Any other data that is
    not already float64 is cast to float64.

    :param data: Data to convert
    :type data: Series

    :return: Tuple of (numeric data, categories). If the data was factorised, this is
        the result of *Series.factorize(sort=True)*, i.e. the integer codes and the
        Index of unique categories (see *pandas.factorize*). Otherwise, the second
        element is *None*.
    :rtype: tuple
    """

    # Function-scope import so the helper does not depend on module-level names
    from pandas.api.types import is_string_dtype

    if data.dtype == 'float64':
        # Already the required dtype - return unchanged
        return data, None

    if data.dtype == 'category':
        categories_dtype = data.cat.categories.dtype
        # Accept pandas string dtypes as well as object: a categorical with
        # string-dtype categories cannot be cast by astype('float64') below
        if categories_dtype == 'object' or is_string_dtype(categories_dtype):
            return data.factorize(sort=True)

    return data.astype('float64'), None
# ----------
# Formatting