diff --git a/yli/__init__.py b/yli/__init__.py
index 61cf681..f5754b4 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -16,7 +16,7 @@
from .bayes_factors import bayesfactor_afbf
from .config import config
-from .descriptives import auto_descriptives
+from .descriptives import auto_correlations, auto_descriptives
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
@@ -33,7 +33,7 @@ def reload_me():
try:
importlib.reload(v)
except ModuleNotFoundError as ex:
- if ex.name.startswith('yli.'):
+ if ex.name == k:
# Must be due to a module which we deleted - can safely ignore
pass
else:
diff --git a/yli/descriptives.py b/yli/descriptives.py
index a27b7ec..dadbe79 100644
--- a/yli/descriptives.py
+++ b/yli/descriptives.py
@@ -15,9 +15,11 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
+from scipy import stats
+import seaborn as sns
from .config import config
-from .utils import check_nan
+from .utils import as_numeric, check_nan
def auto_descriptives(df, cols, *, ordinal_range=[]):
"""
@@ -140,3 +142,107 @@ class AutoDescriptivesResult:
result_labels_fmt = [r[0] for r in self._result_labels]
table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])
return str(table)
+
+def auto_correlations(df, cols):
+    """
+    Compute a pairwise Pearson correlation matrix for the given columns of *df*
+
+    Before correlating, each column is coded numerically:
+
+    * Ordered categorical columns are converted to ranks
+    * Columns of dtype bool, boolean, category or object with exactly 2 distinct non-missing values are coded as 0/1
+    * Other columns of those dtypes are expanded into dummy variables using ``pandas.get_dummies``
+    * All other columns are used unchanged
+
+    Each coefficient is computed from the rows where both variables are non-missing.
+    If fewer than 2 such rows exist, the coefficient is NaN.
+
+    :param df: Data to compute correlations for
+    :type df: DataFrame
+    :param cols: Names of the columns of *df* to include
+    :type cols: List[str]
+
+    :rtype: :class:`yli.descriptives.AutoCorrelationsResult`
+    """
+
+    def _ordinal_to_ranks(col):
+        # Code an ordered categorical column as float ranks, with missing values as NaN
+
+        # Factorise if required
+        # NOTE(review): The steps below rely on as_numeric coding missing values as negative numbers - confirm against yli.utils
+        col, _ = as_numeric(col)
+
+        # Cast first, so fractional (tied) ranks can be stored, and so we operate on a copy rather than on whatever as_numeric handed back
+        col = col.astype('float64')
+
+        # Code as ranks
+        observed = col >= 0
+        col[observed] = stats.rankdata(col[observed])
+
+        # Put NaNs back
+        col[~observed] = pd.NA
+
+        return col
+
+    # Code columns as numeric/ranks/etc. as appropriate
+    df_coded = pd.DataFrame(index=df.index)
+
+    for col_name in cols:
+        col = df[col_name]
+
+        if col.dtype == 'category' and col.cat.ordered:
+            # Ordinal variable
+            df_coded[col_name] = _ordinal_to_ranks(col)
+        elif col.dtype in ('bool', 'boolean', 'category', 'object'):
+            cat_values = col.dropna().unique()
+
+            if len(cat_values) == 2:
+                # Categorical variable with 2 categories
+                # Code as 0/1/NA
+                cat_values = sorted(cat_values)
+                df_coded[col_name] = col.replace({cat_values[0]: 0, cat_values[1]: 1})
+            else:
+                # Categorical variable with >2 categories
+                # Create dummy variables
+                dummies = pd.get_dummies(col, prefix=col_name)
+                df_coded = df_coded.join(dummies)
+        else:
+            # Numeric variable, etc.
+            df_coded[col_name] = col
+
+    # Compute pairwise correlation
+    df_corr = pd.DataFrame(index=df_coded.columns, columns=df_coded.columns, dtype='float64')
+
+    for i, col1 in enumerate(df_coded.columns):
+        for col2 in df_coded.columns[:i]:
+            # pearsonr does not itself exclude missing values, so keep only rows where both variables are observed
+            # The cast ensures 0/1-coded and dummy columns (which may be bool or object dtype) are passed as floats
+            df_pair = df_coded[[col1, col2]].dropna().astype('float64')
+
+            if len(df_pair) < 2:
+                # pearsonr requires at least 2 observations
+                statistic = float('nan')
+            else:
+                statistic = stats.pearsonr(df_pair[col1], df_pair[col2]).statistic
+
+            # The matrix is symmetric
+            df_corr.loc[col1, col2] = statistic
+            df_corr.loc[col2, col1] = statistic
+
+        # Correlation with itself is always 1
+        df_corr.loc[col1, col1] = 1
+
+    return AutoCorrelationsResult(df_corr)
+
+class AutoCorrelationsResult:
+    """
+    Result of computing a correlation matrix
+
+    See :func:`yli.auto_correlations`.
+    """
+
+    def __init__(self, correlations):
+        #: Correlation matrix (*DataFrame*); as built by :func:`yli.auto_correlations`, rows and columns are both indexed by the coded variables
+        self.correlations = correlations
+
+    def __repr__(self):
+        # Show the full summary in place of the default object repr if so configured
+        if config.repr_is_summary:
+            return self.summary()
+        return super().__repr__()
+
+    def _repr_html_(self):
+        """
+        Return the correlation matrix as an HTML table with a "Correlation Matrix" caption
+
+        :rtype: str
+        """
+
+        df_repr = self.correlations._repr_html_()
+
+        # Insert caption
+        # The caption must directly follow the opening <table ...> tag, so locate the '>' which closes that tag
+        idx_endopen = df_repr.index('>', df_repr.index('<table'))
+        df_repr = df_repr[:idx_endopen+1] + '<caption>Correlation Matrix</caption>' + df_repr[idx_endopen+1:]
+
+        return df_repr
+
+    def summary(self):
+        """
+        Return a stringified summary of the correlation matrix
+
+        :rtype: str
+        """
+
+        return 'Correlation Matrix\n\n' + str(self.correlations)
+
+    def plot(self):
+        """
+        Draw the correlation matrix as a seaborn heatmap
+
+        The colour scale is fixed to the range -1 to 1, using the *RdBu* colour map.
+        """
+
+        sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')