From cbb79207aeb1bf099c291bd57a1b69448e3d477c Mon Sep 17 00:00:00 2001
From: RunasSudo
Date: Fri, 7 Oct 2022 16:52:51 +1100
Subject: [PATCH] Implement beta_oddsratio

---
 tests/test_beta_oddsratio.py |  94 +++++++++++++++++++++++++++
 tests/test_beta_ratio.py     |  33 ++++++++--
 yli/distributions.py         | 119 +++++++++++++++++++++++++++++++++--
 3 files changed, 235 insertions(+), 11 deletions(-)
 create mode 100644 tests/test_beta_oddsratio.py

diff --git a/tests/test_beta_oddsratio.py b/tests/test_beta_oddsratio.py
new file mode 100644
index 0000000..223394a
--- /dev/null
+++ b/tests/test_beta_oddsratio.py
@@ -0,0 +1,94 @@
+# scipy-yli: Helpful SciPy utilities and recipes
+# Copyright © 2022 Lee Yingtong Li (RunasSudo)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import numpy as np
+from scipy import stats
+
+import yli
+
+def test_beta_oddsratio_cdf_vs_empirical():
+    """Compare beta_oddsratio.cdf with empirical distribution"""
+
+    # Define the example beta distribution
+    beta1 = stats.beta(3, 6)
+    beta2 = stats.beta(12, 7)
+    dist = yli.beta_oddsratio.from_scipy(beta1, beta2)
+
+    # Compute empirical distribution
+    samples_p1 = beta1.rvs(10_000, random_state=31415)
+    samples_p2 = beta2.rvs(10_000, random_state=92653)
+    sample = (samples_p1 / (1 - samples_p1)) / (samples_p2 / (1 - samples_p2))
+
+    # Values to check
+    x = np.linspace(0, 2, 10)
+    y1 = dist.cdf(x)
+    y2 = [(sample < xx).sum()/sample.size for xx in x]
+
+    # Allow 0.01 tolerance
+    assert (np.abs(y1 - y2) < 0.01).all()
+
+def test_beta_oddsratio_ppf_vs_empirical():
+    """Compare beta_oddsratio.ppf with empirical distribution"""
+
+    # Define the example beta distribution
+    beta1 = stats.beta(3, 6)
+    beta2 = stats.beta(12, 7)
+    dist = yli.beta_oddsratio.from_scipy(beta1, beta2)
+
+    # Compute empirical distribution
+    samples_p1 = beta1.rvs(10_000, random_state=31415)
+    samples_p2 = beta2.rvs(10_000, random_state=92653)
+    sample = (samples_p1 / (1 - samples_p1)) / (samples_p2 / (1 - samples_p2))
+
+    # Values to check
+    x = np.linspace(0, 0.9, 10)
+    y1 = dist.ppf(x)
+    y2 = np.quantile(sample, x)
+
+    # Allow 0.01 tolerance
+    assert (np.abs(y1 - y2) < 0.01).all()
+
+def test_beta_oddsratio_mean_vs_empirical():
+    """Compare beta_oddsratio.mean (via beta_oddsratio._munp) with empirical mean"""
+
+    # Define the example beta distribution
+    beta1 = stats.beta(3, 6)
+    beta2 = stats.beta(12, 7)
+    dist = yli.beta_oddsratio.from_scipy(beta1, beta2)
+
+    # Compute empirical distribution
+    samples_p1 = beta1.rvs(10_000, random_state=31415)
+    samples_p2 = beta2.rvs(10_000, random_state=92653)
+    sample = (samples_p1 / (1 - samples_p1)) / (samples_p2 / (1 - samples_p2))
+
+    # Allow 0.01 tolerance
+    assert np.abs(dist.mean() - sample.mean()) < 0.01
+
+def test_beta_oddsratio_var_vs_empirical():
+    """Compare beta_oddsratio.var (via beta_oddsratio._munp) with empirical variance"""
+
+    # Define the example beta distribution
+    beta1 = stats.beta(3, 6)
+    beta2 = stats.beta(12, 7)
+    dist = yli.beta_oddsratio.from_scipy(beta1, beta2)
+
+    # Compute empirical distribution
+    samples_p1 = beta1.rvs(10_000, random_state=31415)
+    samples_p2 = beta2.rvs(10_000, random_state=92653)
+    sample = (samples_p1 / (1 - samples_p1)) / (samples_p2 / (1 - samples_p2))
+
+    # Allow 0.01 tolerance
+    assert np.abs(dist.var() - sample.var()) < 0.01
diff --git a/tests/test_beta_ratio.py b/tests/test_beta_ratio.py
index 605bfb4..ad4baf8 100644
--- a/tests/test_beta_ratio.py
+++ b/tests/test_beta_ratio.py
@@ -15,6 +15,7 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import numpy as np
+from scipy import stats
 
 import yli
 
@@ -30,7 +31,7 @@ def test_beta_ratio_vs_jsaffer_pdf():
     y = dist.pdf(x)
 
     # Compare with expected values from jsaffer implementation
-    expected = np.load('beta_ratio_vs_jsaffer.npy', allow_pickle=False)[0]
+    expected = np.load('tests/beta_ratio_vs_jsaffer.npy', allow_pickle=False)[0]
 
     # Allow 1e-10 tolerance
     diff = np.abs(y - expected)
@@ -48,7 +49,7 @@ def test_beta_ratio_vs_jsaffer_cdf():
     y = dist.cdf(x)
 
     # Compare with expected values from jsaffer implementation
-    expected = np.load('beta_ratio_vs_jsaffer.npy', allow_pickle=False)[1]
+    expected = np.load('tests/beta_ratio_vs_jsaffer.npy', allow_pickle=False)[1]
 
     # Allow 1e-10 tolerance
     diff = np.abs(y - expected)
@@ -65,16 +66,36 @@ def _gen_beta_ratio_vs_jsaffer():
     y1 = np.vectorize(lambda w: float(beta_quotient_distribution.pdf_bb_ratio(a1, a2, b1, b2, w)))(x)
     y2 = np.vectorize(lambda w: float(beta_quotient_distribution.cdf_bb_ratio(a1, a2, b1, b2, w)))(x)
 
-    np.save('beta_ratio_vs_jsaffer.npy', np.array([y1, y2]), allow_pickle=False)
+    np.save('tests/beta_ratio_vs_jsaffer.npy', np.array([y1, y2]), allow_pickle=False)
 
 def test_beta_ratio_mean_vs_empirical():
     """Compare beta_ratio.mean (via beta_ratio._munp) with empirical mean"""
 
     # Define the example beta distribution
-    dist = yli.beta_ratio(3, 6, 12, 7)
+    beta1 = stats.beta(3, 6)
+    beta2 = stats.beta(12, 7)
+    dist = yli.beta_ratio.from_scipy(beta1, beta2)
 
-    # Compute empirical mean
-    sample = dist.rvs(1000, random_state=31415)
+    # Compute empirical distribution
+    samples_p1 = beta1.rvs(10_000, random_state=31415)
+    samples_p2 = beta2.rvs(10_000, random_state=92653)
+    sample = samples_p1 / samples_p2
 
     # Allow 0.01 tolerance
     assert np.abs(dist.mean() - sample.mean()) < 0.01
+
+def test_beta_ratio_var_vs_empirical():
+    """Compare beta_ratio.var (via beta_ratio._munp) with empirical variance"""
+
+    # Define the example beta distribution
+    beta1 = stats.beta(3, 6)
+    beta2 = stats.beta(12, 7)
+    dist = yli.beta_ratio.from_scipy(beta1, beta2)
+
+    # Compute empirical distribution
+    samples_p1 = beta1.rvs(10_000, random_state=31415)
+    samples_p2 = beta2.rvs(10_000, random_state=92653)
+    sample = samples_p1 / samples_p2
+
+    # Allow 0.01 tolerance
+    assert np.abs(dist.var() - sample.var()) < 0.01
diff --git a/yli/distributions.py b/yli/distributions.py
index 3a8f225..e1451df 100644
--- a/yli/distributions.py
+++ b/yli/distributions.py
@@ -16,7 +16,7 @@
 
 import mpmath
 import numpy as np
-from scipy import optimize, stats
+from scipy import integrate, optimize, stats
 
 import collections
 
@@ -30,6 +30,8 @@ class betarat_gen(stats.rv_continuous):
     2. Weekend Editor. On the ratio of Beta-distributed random variables. Some Weekend Reading. 2021 Sep 13. https://www.someweekendreading.blog/beta-ratios/
     """
 
+    # TODO: _rvs based on stats.beta
+
     def from_scipy(self, beta1, beta2):
         """Construct a new beta_ratio distribution from two SciPy beta distributions"""
         return self(*beta1.args, *beta2.args)
@@ -84,14 +86,121 @@ class betarat_gen(stats.rv_continuous):
     def _munp_one(self, k, a1, b1, a2, b2):
         """Moments of the distribution, given by Weekend Editor"""
 
-        term1 = mpmath.rf(a1, k) / mpmath.rf(a1 + b1, k)
-        term2 = mpmath.rf(a2 + b2 - k, k) / mpmath.rf(a2 - k, k)
-        return float(term1 * term2)
+        term1 = mpmath.rf(a1, k) * mpmath.rf(a2 + b2 - k, k)
+        term2 = mpmath.rf(a1 + b1, k) * mpmath.rf(a2 - k, k)
+        return float(term1 / term2)
 
     def _munp(self, k, a1, b1, a2, b2):
         return self._do_vectorized(self._munp_one, k, a1, b1, a2, b2)
 
-beta_ratio = betarat_gen(name='beta_ratio', a=0)
+beta_ratio = betarat_gen(name='beta_ratio', a=0) # a = lower bound of support
+
+class betaoddsrat_gen(stats.rv_continuous):
+    """
+    Ratio of the odds of 2 independent beta-distributed variables
+    (X/(1-X)) / (Y/(1-Y)), where X ~ Beta(a1, b1), Y ~ Beta(a2, b2)
+
+    Reference:
+    Hora SC, Kelley GD. Bayesian inference on the odds and risk ratios. Communications in Statistics: Theory and Methods. 1983;12(6):725–38. doi: 10.1080/03610928308828491
+    """
+
+    # TODO: _rvs based on stats.beta
+
+    def from_scipy(self, beta1, beta2):
+        """Construct a new beta_oddsratio distribution from two SciPy beta distributions"""
+        return self(*beta1.args, *beta2.args)
+
+    def _do_vectorized(self, func, x, a1, b1, a2, b2):
+        """Helper function to call the implementation over potentially multiple values"""
+
+        x = np.atleast_1d(x)
+        result = np.zeros(x.size)
+        for i, (x_, a1_, b1_, a2_, b2_) in enumerate(zip(x, np.pad(np.atleast_1d(a1), x.size, 'edge'), np.pad(np.atleast_1d(b1), x.size, 'edge'), np.pad(np.atleast_1d(a2), x.size, 'edge'), np.pad(np.atleast_1d(b2), x.size, 'edge'))):
+            result[i] = func(x_, a1_, b1_, a2_, b2_)
+        return result
+
+    def _pdf_one(self, w, a1, b1, a2, b2):
+        """PDF for the distribution, given by Hora & Kelley"""
+
+        if w <= 0:
+            return 0
+        elif w < 1:
+            term1 = mpmath.beta(a1 + a2, b1 + b2) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2))
+            term2 = mpmath.power(w, a1 - 1)
+            term3 = mpmath.hyp2f1(a1 + b1, a1 + a2, a1 + a2 + b1 + b2, 1 - w)
+        else:
+            term1 = mpmath.beta(a1 + a2, b1 + b2) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2))
+            term2 = mpmath.power(w, -a2 - 1)
+            term3 = mpmath.hyp2f1(a2 + b2, a1 + a2, a1 + a2 + b1 + b2, 1 - mpmath.power(w, -1))
+
+        return float(term1 * term2 * term3)
+
+    def _pdf(self, w, a1, b1, a2, b2):
+        return self._do_vectorized(self._pdf_one, w, a1, b1, a2, b2)
+
+    def _cdf(self, w, a1, b1, a2, b2):
+        """CDF for the distribution, computed by integrating the PDF"""
+
+        w = np.atleast_1d(w)
+        a1 = np.atleast_1d(a1)
+        b1 = np.atleast_1d(b1)
+        a2 = np.atleast_1d(a2)
+        b2 = np.atleast_1d(b2)
+
+        if not ((a1 == a1[0]).all() and (b1 == b1[0]).all() and (a2 == a2[0]).all() and (b2 == b2[0]).all()):
+            raise ValueError('Cannot compute CDF from different distributions')
+
+        if w.size == 1:
+            if w <= 0:
+                return 0
+
+            # Just compute an integral
+            if w < self.mean(a1, b1, a2, b2):
+                # Integrate normally
+                return integrate.quad(lambda x: self._pdf_one(x, a1[0], b1[0], a2[0], b2[0]), 0, w)[0]
+            else:
+                # Integrate on the distribution of 1/w (much faster)
+                return 1 - integrate.quad(lambda x: self._pdf_one(x, a2[0], b2[0], a1[0], b1[0]), 0, 1/w)[0]
+        else:
+            # Multiple points requested: use ODE solver
+            solution = integrate.solve_ivp(lambda x, _: self._pdf_one(x, a1[0], b1[0], a2[0], b2[0]), (0, w.max()), [0], t_eval=w, method='LSODA', rtol=1.5e-8, atol=1.5e-8)
+            return solution.y
+
+    def _ppf_one(self, p, a1, b1, a2, b2):
+        """PPF for the distribution, using Newton's method"""
+
+        # Default SciPy implementation uses Brent's method
+        # This one is a bit faster
+
+        if p <= 0:
+            return 0
+
+        # Initial guess based on log-normal approximation
+        mean = self.mean(a1, b1, a2, b2)
+        se_log = self.std(a1, b1, a2, b2)/mean
+        initial_guess = np.exp(np.log(mean) + stats.norm.ppf(p) * se_log)
+
+        try:
+            return optimize.newton(lambda x: self.cdf(x, a1, b1, a2, b2) - p, x0=initial_guess, fprime=lambda x: self.pdf(x, a1, b1, a2, b2))
+        except RuntimeError:
+            # Failed to converge with Newton's method :(
+            # Use Brent's method instead
+            return super()._ppf(p, a1, b1, a2, b2)
+
+    def _ppf(self, p, a1, b1, a2, b2):
+        return self._do_vectorized(self._ppf_one, p, a1, b1, a2, b2)
+
+    def _munp_one(self, k, a1, b1, a2, b2):
+        """Moments of the distribution, given by Hora & Kelley"""
+
+        term1 = mpmath.rf(a1, k) * mpmath.rf(b2, k)
+        term2 = mpmath.rf(b1 - k, k) * mpmath.rf(a2 - k, k)
+        return float(term1 / term2)
+
+    def _munp(self, k, a1, b1, a2, b2):
+        return self._do_vectorized(self._munp_one, k, a1, b1, a2, b2)
+
+beta_oddsratio = betaoddsrat_gen(name='beta_oddsratio', a=0) # a = lower bound of support
 
 ConfidenceInterval = collections.namedtuple('ConfidenceInterval', ['lower', 'upper'])
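
Usage note (not part of the patch): a minimal sketch of how the new beta_oddsratio distribution could be used once this patch is applied, following the from_scipy() pattern exercised in the tests. The Beta(3, 6) and Beta(12, 7) parameters are simply the example values from the tests above.

    from scipy import stats

    import yli

    # Example posterior Beta distributions for two proportions (as in the tests)
    beta1 = stats.beta(3, 6)
    beta2 = stats.beta(12, 7)

    # Distribution of the odds ratio (p1/(1-p1)) / (p2/(1-p2))
    dist = yli.beta_oddsratio.from_scipy(beta1, beta2)

    # Moments are computed via _munp
    print(dist.mean(), dist.var())

    # Quantiles use the Newton-based _ppf, e.g. a 95% equal-tailed interval
    print(dist.ppf(0.025), dist.ppf(0.975))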