diff --git a/yli/regress.py b/yli/regress.py index 404303f..781f7b3 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -33,7 +33,7 @@ import weakref from .bayes_factors import BayesFactor, bayesfactor_afbf from .config import config from .sig_tests import ChiSquaredResult, FTestResult -from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term +from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term def vif(df, formula=None, *, nan_policy='warn'): """ @@ -270,8 +270,7 @@ class RegressionResult: df = check_nan(df, 'omit') # Ensure numeric type for dependent variable - if df[dep].dtype != 'float64': - df[dep] = df[dep].astype('float64') + df[dep], dep_categories = as_numeric(df[dep]) # Convert pandas nullable types for independent variables as this breaks statsmodels df = convert_pandas_nullable(df) @@ -373,8 +372,7 @@ class RegressionResult: df = check_nan(df, 'omit') # Ensure numeric type for dependent variable - if df[dep].dtype != 'float64': - df[dep] = df[dep].astype('float64') + df[dep], dep_categories = as_numeric(df[dep]) # Convert pandas nullable types for independent variables as this breaks statsmodels df = convert_pandas_nullable(df) @@ -748,8 +746,7 @@ def regress( df = check_nan(df, nan_policy) # Ensure numeric type for dependent variable - if df[dep].dtype != 'float64': - df[dep] = df[dep].astype('float64') + df[dep], dep_categories = as_numeric(df[dep]) # Convert pandas nullable types for independent variables as this breaks statsmodels df = convert_pandas_nullable(df) @@ -806,7 +803,15 @@ def regress( # Group ordinal regression cutoffs if '(Cutoffs)' not in terms: terms['(Cutoffs)'] = CategoricalTerm({}, None) - terms['(Cutoffs)'].categories[raw_name] = SingleTerm(raw_name, beta, pvalues[raw_name]) + + if dep_categories is None: + term = raw_name + else: + # 
def as_numeric(data):
    """
    Convert the given data to a numeric type, factorising if required

    :param data: Data to convert
    :type data: Series

    :return: Tuple of (*numeric data*, *categories*). If the data is categorical
        with string-like (non-numeric) categories, the result of ``data.factorize(sort=True)``
        is returned, i.e. an array of integer codes and the index of categories such that
        ``categories[code]`` recovers the original label. Otherwise the data as *float64*
        and *None*.
    :rtype: tuple
    """

    # Local import keeps this helper self-contained
    from pandas.api.types import is_object_dtype, is_string_dtype

    # Already in the required type: return unchanged, nothing was factorised
    if data.dtype == 'float64':
        return data, None

    if data.dtype == 'category':
        categories_dtype = data.cat.categories.dtype

        # Text categories may be stored as object dtype or as a dedicated pandas
        # string dtype (e.g. dtype='string'); neither can be cast to float64, so
        # replace the labels with integer codes and hand back the code -> label mapping
        if is_object_dtype(categories_dtype) or is_string_dtype(categories_dtype):
            return data.factorize(sort=True)

    # Anything else (ints, bools, nullable numerics, numeric categoricals) is cast directly
    return data.astype('float64'), None