Automatically factorise pandas categorical when required for regression
This commit is contained in:
parent
04643d312c
commit
2135796d85
@ -33,7 +33,7 @@ import weakref
|
||||
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
||||
from .config import config
|
||||
from .sig_tests import ChiSquaredResult, FTestResult
|
||||
from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
|
||||
from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
|
||||
|
||||
def vif(df, formula=None, *, nan_policy='warn'):
|
||||
"""
|
||||
@ -270,8 +270,7 @@ class RegressionResult:
|
||||
df = check_nan(df, 'omit')
|
||||
|
||||
# Ensure numeric type for dependent variable
|
||||
if df[dep].dtype != 'float64':
|
||||
df[dep] = df[dep].astype('float64')
|
||||
df[dep], dep_categories = as_numeric(df[dep])
|
||||
|
||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||
df = convert_pandas_nullable(df)
|
||||
@ -373,8 +372,7 @@ class RegressionResult:
|
||||
df = check_nan(df, 'omit')
|
||||
|
||||
# Ensure numeric type for dependent variable
|
||||
if df[dep].dtype != 'float64':
|
||||
df[dep] = df[dep].astype('float64')
|
||||
df[dep], dep_categories = as_numeric(df[dep])
|
||||
|
||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||
df = convert_pandas_nullable(df)
|
||||
@ -748,8 +746,7 @@ def regress(
|
||||
df = check_nan(df, nan_policy)
|
||||
|
||||
# Ensure numeric type for dependent variable
|
||||
if df[dep].dtype != 'float64':
|
||||
df[dep] = df[dep].astype('float64')
|
||||
df[dep], dep_categories = as_numeric(df[dep])
|
||||
|
||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||
df = convert_pandas_nullable(df)
|
||||
@ -806,7 +803,15 @@ def regress(
|
||||
# Group ordinal regression cutoffs
|
||||
if '(Cutoffs)' not in terms:
|
||||
terms['(Cutoffs)'] = CategoricalTerm({}, None)
|
||||
terms['(Cutoffs)'].categories[raw_name] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
||||
|
||||
if dep_categories is None:
|
||||
term = raw_name
|
||||
else:
|
||||
# Need to convert factorised names back into original names
|
||||
bits = raw_name.split('/')
|
||||
term = dep_categories[round(float(bits[0]))] + '/' + dep_categories[round(float(bits[1]))]
|
||||
|
||||
terms['(Cutoffs)'].categories[term] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
||||
else:
|
||||
# Parse if required
|
||||
factor, column, contrast = parse_patsy_term(formula, df, raw_name)
|
||||
|
18
yli/utils.py
18
yli/utils.py
@ -116,6 +116,24 @@ def as_2groups(df, data, group):
|
||||
|
||||
return group1, data1, group2, data2
|
||||
|
||||
def as_numeric(data):
    """
    Convert the given data to a numeric type, factorising if required

    Three cases are handled, in order:

    * *float64* data is returned unchanged
    * Categorical data whose categories have *object* dtype is factorised
      via ``data.factorize(sort=True)``
    * Anything else is cast using ``data.astype('float64')``

    :param data: Data to convert
    :type data: Series

    :return: Tuple of (numeric data, categories); *categories* is *None*
        unless the data was factorised, in which case the tuple is exactly
        the result of *pandas.Series.factorize* (codes, uniques)
    :rtype: tuple
    """

    # Already the target dtype: nothing to convert and no category mapping
    if data.dtype == 'float64':
        return data, None

    # Categorical with object-dtype categories cannot be cast to float
    # directly, so factorise it; callers use the returned uniques to map
    # the codes back to the original category names
    if data.dtype == 'category' and data.cat.categories.dtype == 'object':
        return data.factorize(sort=True)

    # Everything else is coerced to float64
    return data.astype('float64'), None
|
||||
|
||||
# ----------
|
||||
# Formatting
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user