diff --git a/tests/test_vif.py b/tests/test_vif.py
new file mode 100644
index 0000000..1f48047
--- /dev/null
+++ b/tests/test_vif.py
@@ -0,0 +1,51 @@
+# scipy-yli: Helpful SciPy utilities and recipes
+# Copyright © 2022 Lee Yingtong Li (RunasSudo)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+from pytest import approx
+
+import pandas as pd
+
+import yli
+
+def test_vif_ol13_5():
+ """Compare yli.vif for Ott & Longnecker (2016) chapter 13.5"""
+
+ df = pd.DataFrame({
+ 'C': [460.05, 452.99, 443.22, 652.32, 642.23, 345.39, 272.37, 317.21, 457.12, 690.19, 350.63, 402.59, 412.18, 495.58, 394.36, 423.32, 712.27, 289.66, 881.24, 490.88, 567.79, 665.99, 621.45, 608.8, 473.64, 697.14, 207.51, 288.48, 284.88, 280.36, 217.38, 270.71],
+ 'D': [68.58, 67.33, 67.33, 68, 68, 67.92, 68.17, 68.42, 68.42, 68.33, 68.58, 68.75, 68.42, 68.92, 68.92, 68.42, 69.5, 68.42, 69.17, 68.92, 68.75, 70.92, 69.67, 70.08, 70.42, 71.08, 67.25, 67.17, 67.83, 67.83, 67.25, 67.83],
+ 'T1': [14, 10, 10, 11, 11, 13, 12, 14, 15, 12, 12, 13, 15, 17, 13, 11, 18, 15, 15, 16, 11, 22, 16, 19, 19, 20, 13, 9, 12, 12, 13, 7],
+ 'T2': [46, 73, 85, 67, 78, 51, 50, 59, 55, 71, 64, 47, 62, 52, 65, 67, 60, 76, 67, 59, 70, 57, 59, 58, 44, 57, 63, 48, 63, 71, 72, 80],
+ 'S': [687, 1065, 1065, 1065, 1065, 514, 822, 457, 822, 792, 560, 790, 530, 1050, 850, 778, 845, 530, 1090, 1050, 913, 828, 786, 821, 538, 1130, 745, 821, 886, 886, 745, 886],
+ 'PR': [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1],
+ 'NE': [1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ 'CT': [0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0],
+ 'BW': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1],
+ 'N': [14, 1, 1, 12, 12, 3, 5, 1, 5, 2, 3, 6, 2, 7, 16, 3, 17, 2, 1, 8, 15, 20, 18, 3, 19, 21, 8, 7, 11, 11, 8, 11],
+ 'PT': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+ })
+
+ vifs = yli.vif(df[['D', 'T1', 'T2', 'S', 'PR', 'NE', 'CT', 'BW', 'N', 'PT']])
+
+ assert vifs['D'] == approx(8.31830, abs=0.00001)
+ assert vifs['T1'] == approx(6.08159, abs=0.00001)
+ assert vifs['T2'] == approx(2.45712, abs=0.00001)
+ assert vifs['S'] == approx(1.26727, abs=0.00001)
+ assert vifs['PR'] == approx(1.66568, abs=0.00001)
+ assert vifs['NE'] == approx(1.30924, abs=0.00001)
+ assert vifs['CT'] == approx(1.32422, abs=0.00001)
+ assert vifs['BW'] == approx(1.91292, abs=0.00001)
+ assert vifs['N'] == approx(2.64429, abs=0.00001)
+ assert vifs['PT'] == approx(2.88092, abs=0.00001)
diff --git a/yli/__init__.py b/yli/__init__.py
index 071c7e0..8b97b9c 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -16,7 +16,7 @@
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .fs import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import regress
+from .regress import regress, vif
from .sig_tests import chi2, mannwhitney, ttest_ind
def reload_me():
diff --git a/yli/regress.py b/yli/regress.py
index 2a04a1a..051a6ec 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -15,16 +15,39 @@
# along with this program. If not, see .
import numpy as np
+import pandas as pd
import patsy
from scipy import stats
import statsmodels.api as sm
from statsmodels.iolib.table import SimpleTable
+from statsmodels.stats.outliers_influence import variance_inflation_factor
from datetime import datetime
import itertools
from .utils import Estimate, check_nan, fmt_p_html, fmt_p_text
+def vif(df, nan_policy='warn'):
+ """Calculate the variance inflation factor for each variable in df"""
+
+ # Check for/clean NaNs
+ df = check_nan(df, nan_policy)
+
+ # Convert all to float64 otherwise statsmodels chokes with "ufunc 'isfinite' not supported for the input types ..."
+ df = pd.get_dummies(df, drop_first=True) # Convert categorical dtypes
+ df = df.astype('float64') # Convert all other dtypes
+
+ # Add intercept column
+ orig_columns = list(df.columns)
+ df['Intercept'] = [1] * len(df)
+
+ vifs = {}
+
+ for i, col in enumerate(orig_columns):
+ vifs[col] = variance_inflation_factor(df, i)
+
+ return pd.Series(vifs)
+
def cols_for_formula(formula):
"""Return the columns corresponding to the Patsy formula"""