Automatically factorise pandas categorical when required for regression
This commit is contained in:
parent
04643d312c
commit
2135796d85
@ -33,7 +33,7 @@ import weakref
|
|||||||
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
||||||
from .config import config
|
from .config import config
|
||||||
from .sig_tests import ChiSquaredResult, FTestResult
|
from .sig_tests import ChiSquaredResult, FTestResult
|
||||||
from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
|
from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
|
||||||
|
|
||||||
def vif(df, formula=None, *, nan_policy='warn'):
|
def vif(df, formula=None, *, nan_policy='warn'):
|
||||||
"""
|
"""
|
||||||
@ -270,8 +270,7 @@ class RegressionResult:
|
|||||||
df = check_nan(df, 'omit')
|
df = check_nan(df, 'omit')
|
||||||
|
|
||||||
# Ensure numeric type for dependent variable
|
# Ensure numeric type for dependent variable
|
||||||
if df[dep].dtype != 'float64':
|
df[dep], dep_categories = as_numeric(df[dep])
|
||||||
df[dep] = df[dep].astype('float64')
|
|
||||||
|
|
||||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||||
df = convert_pandas_nullable(df)
|
df = convert_pandas_nullable(df)
|
||||||
@ -373,8 +372,7 @@ class RegressionResult:
|
|||||||
df = check_nan(df, 'omit')
|
df = check_nan(df, 'omit')
|
||||||
|
|
||||||
# Ensure numeric type for dependent variable
|
# Ensure numeric type for dependent variable
|
||||||
if df[dep].dtype != 'float64':
|
df[dep], dep_categories = as_numeric(df[dep])
|
||||||
df[dep] = df[dep].astype('float64')
|
|
||||||
|
|
||||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||||
df = convert_pandas_nullable(df)
|
df = convert_pandas_nullable(df)
|
||||||
@ -748,8 +746,7 @@ def regress(
|
|||||||
df = check_nan(df, nan_policy)
|
df = check_nan(df, nan_policy)
|
||||||
|
|
||||||
# Ensure numeric type for dependent variable
|
# Ensure numeric type for dependent variable
|
||||||
if df[dep].dtype != 'float64':
|
df[dep], dep_categories = as_numeric(df[dep])
|
||||||
df[dep] = df[dep].astype('float64')
|
|
||||||
|
|
||||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||||
df = convert_pandas_nullable(df)
|
df = convert_pandas_nullable(df)
|
||||||
@ -806,7 +803,15 @@ def regress(
|
|||||||
# Group ordinal regression cutoffs
|
# Group ordinal regression cutoffs
|
||||||
if '(Cutoffs)' not in terms:
|
if '(Cutoffs)' not in terms:
|
||||||
terms['(Cutoffs)'] = CategoricalTerm({}, None)
|
terms['(Cutoffs)'] = CategoricalTerm({}, None)
|
||||||
terms['(Cutoffs)'].categories[raw_name] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
|
||||||
|
if dep_categories is None:
|
||||||
|
term = raw_name
|
||||||
|
else:
|
||||||
|
# Need to convert factorised names back into original names
|
||||||
|
bits = raw_name.split('/')
|
||||||
|
term = dep_categories[round(float(bits[0]))] + '/' + dep_categories[round(float(bits[1]))]
|
||||||
|
|
||||||
|
terms['(Cutoffs)'].categories[term] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
||||||
else:
|
else:
|
||||||
# Parse if required
|
# Parse if required
|
||||||
factor, column, contrast = parse_patsy_term(formula, df, raw_name)
|
factor, column, contrast = parse_patsy_term(formula, df, raw_name)
|
||||||
|
18
yli/utils.py
18
yli/utils.py
@ -116,6 +116,24 @@ def as_2groups(df, data, group):
|
|||||||
|
|
||||||
return group1, data1, group2, data2
|
return group1, data1, group2, data2
|
||||||
|
|
||||||
|
def as_numeric(data):
|
||||||
|
"""
|
||||||
|
Convert the given data to a numeric type, factorising if required
|
||||||
|
|
||||||
|
:param data: Data to convert
|
||||||
|
:type df: Series
|
||||||
|
|
||||||
|
:return: See *pandas.factorize*
|
||||||
|
"""
|
||||||
|
|
||||||
|
if data.dtype == 'float64':
|
||||||
|
return data, None
|
||||||
|
|
||||||
|
if data.dtype == 'category' and data.cat.categories.dtype == 'object':
|
||||||
|
return data.factorize(sort=True)
|
||||||
|
|
||||||
|
return data.astype('float64'), None
|
||||||
|
|
||||||
# ----------
|
# ----------
|
||||||
# Formatting
|
# Formatting
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user