Implement yli.auto_correlations
This commit is contained in:
parent
5dce873e55
commit
c2d4aaf8be
@ -16,7 +16,7 @@
|
|||||||
|
|
||||||
from .bayes_factors import bayesfactor_afbf
|
from .bayes_factors import bayesfactor_afbf
|
||||||
from .config import config
|
from .config import config
|
||||||
from .descriptives import auto_descriptives
|
from .descriptives import auto_correlations, auto_descriptives
|
||||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||||
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
||||||
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
|
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
|
||||||
@ -33,7 +33,7 @@ def reload_me():
|
|||||||
try:
|
try:
|
||||||
importlib.reload(v)
|
importlib.reload(v)
|
||||||
except ModuleNotFoundError as ex:
|
except ModuleNotFoundError as ex:
|
||||||
if ex.name.startswith('yli.'):
|
if ex.name == k:
|
||||||
# Must be due to a module which we deleted - can safely ignore
|
# Must be due to a module which we deleted - can safely ignore
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
@ -15,9 +15,11 @@
|
|||||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from scipy import stats
|
||||||
|
import seaborn as sns
|
||||||
|
|
||||||
from .config import config
|
from .config import config
|
||||||
from .utils import check_nan
|
from .utils import as_numeric, check_nan
|
||||||
|
|
||||||
def auto_descriptives(df, cols, *, ordinal_range=[]):
|
def auto_descriptives(df, cols, *, ordinal_range=[]):
|
||||||
"""
|
"""
|
||||||
@ -140,3 +142,107 @@ class AutoDescriptivesResult:
|
|||||||
result_labels_fmt = [r[0] for r in self._result_labels]
|
result_labels_fmt = [r[0] for r in self._result_labels]
|
||||||
table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])
|
table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])
|
||||||
return str(table)
|
return str(table)
|
||||||
|
|
||||||
|
def auto_correlations(df, cols):
    """
    Compute a matrix of pairwise Pearson correlation coefficients between columns

    Each requested column is first coded numerically:

    * Ordered categorical columns are passed through *as_numeric* and replaced by their ranks
    * Other bool/boolean/category/object columns with exactly 2 distinct non-missing values
      are coded as 0/1, following the sort order of the 2 values
    * Other bool/boolean/category/object columns are expanded into dummy variables,
      named using the column name as prefix
    * All remaining columns are used unchanged

    The Pearson correlation coefficient is then computed for each pair of coded columns,
    using only the rows where both columns are non-missing.

    :param df: Data to compute correlations for
    :type df: DataFrame
    :param cols: Names of the columns in *df* to include
    :type cols: List[str]

    :return: Correlation matrix, indexed on both axes by the coded column names
    :rtype: AutoCorrelationsResult
    """

    # Code columns as numeric/ranks/etc. as appropriate
    df_coded = pd.DataFrame(index=df.index)

    for col_name in cols:
        col = df[col_name]

        if col.dtype == 'category' and col.cat.ordered:
            # Ordinal variable
            # Factorise if required
            # NOTE(review): the masks below assume as_numeric codes missing values as negative numbers - confirm
            col, _ = as_numeric(col)

            # Convert to float before writing ranks: tied ranks are fractional, and
            # this avoids relying on implicit dtype upcasting during assignment
            col = col.astype('float64')

            # Code as ranks
            is_observed = col >= 0
            col[is_observed] = stats.rankdata(col[is_observed])

            # Put NaNs back
            col[col < 0] = pd.NA

            df_coded[col_name] = col
        elif col.dtype in ('bool', 'boolean', 'category', 'object'):
            cat_values = col.dropna().unique()

            if len(cat_values) == 2:
                # Categorical variable with 2 categories
                # Code as 0/1/NA
                cat_values = sorted(cat_values)
                col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
                df_coded[col_name] = col
            else:
                # Categorical variable with >2 categories
                # Create dummy variables
                dummies = pd.get_dummies(col, prefix=col_name)
                df_coded = df_coded.join(dummies)
        else:
            # Numeric variable, etc.
            df_coded[col_name] = col

    # Compute pairwise correlation
    coded_names = df_coded.columns
    df_corr = pd.DataFrame(index=coded_names, columns=coded_names, dtype='float64')

    for i, col1 in enumerate(coded_names):
        # Only the lower triangle is computed; the matrix is symmetric
        for col2 in coded_names[:i]:
            # Pairwise deletion: the coding above can leave missing values in place,
            # so restrict to rows where both columns are observed
            df_pair = df_coded[[col1, col2]].dropna()

            statistic = stats.pearsonr(df_pair[col1], df_pair[col2]).statistic
            df_corr.loc[col1, col2] = statistic
            df_corr.loc[col2, col1] = statistic

        # Correlation with itself is always 1
        df_corr.loc[col1, col1] = 1

    return AutoCorrelationsResult(df_corr)
|
||||||
|
|
||||||
|
class AutoCorrelationsResult:
    """
    Result of running :func:`auto_correlations`

    Wraps the correlation matrix and provides plain-text, HTML and heatmap presentations of it.
    """

    def __init__(self, correlations):
        # Matrix of pairwise correlation coefficients, as passed in by the caller
        self.correlations = correlations

    def __repr__(self):
        # Fall back to the default object repr unless summaries are enabled in config
        if not config.repr_is_summary:
            return super().__repr__()
        return self.summary()

    def _repr_html_(self):
        table_html = self.correlations._repr_html_()

        # Insert caption immediately after the '>' closing the opening <table ...> tag
        tag_start = table_html.index('<table')
        split_at = table_html.index('>', tag_start) + 1
        before, after = table_html[:split_at], table_html[split_at:]

        return before + '<caption>Correlation Matrix</caption>' + after

    def summary(self):
        """
        Return a stringified summary of the correlation matrix

        :rtype: str
        """

        matrix_text = str(self.correlations)
        return 'Correlation Matrix\n\n' + matrix_text

    def plot(self):
        """
        Draw the correlation matrix as a seaborn heatmap

        The colour scale is fixed to span -1 to 1 with the *RdBu* colour map.
        """

        sns.heatmap(self.correlations, cmap='RdBu', vmin=-1, vmax=1)
|
||||||
|
Loading…
Reference in New Issue
Block a user