# scipy-yli: Helpful SciPy utilities and recipes # Copyright © 2022 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import mpmath import numpy as np from scipy import integrate, optimize, stats import collections class betarat_gen(stats.rv_continuous): """ Ratio of 2 independent beta-distributed variables Ratio of Beta(a1, b1) / Beta(a2, b2) References: 1. Pham-Gia T. Distributions of the ratios of independent beta variables and applications. Communications in Statistics: Theory and Methods. 2000;29(12):2693–715. doi: 10.1080/03610920008832632 2. Weekend Editor. On the ratio of Beta-distributed random variables. Some Weekend Reading. 2021 Sep 13. https://www.someweekendreading.blog/beta-ratios/ """ # TODO: _rvs based on stats.beta def from_scipy(self, beta1, beta2): """Construct a new beta_ratio distribution from two SciPy beta distributions""" return self(*beta1.args, *beta2.args) def _do_vectorized(self, func, x, a1, b1, a2, b2): """Helper function to call the implementation over potentially multiple values""" x = np.atleast_1d(x) result = np.zeros(x.size) for i, (x_, a1_, b1_, a2_, b2_) in enumerate(zip(x, np.pad(a1, x.size, 'edge'), np.pad(b1, x.size, 'edge'), np.pad(a2, x.size, 'edge'), np.pad(b2, x.size, 'edge'))): result[i] = func(x_, a1_, b1_, a2_, b2_) return result def _pdf_one(self, w, a1, b1, a2, b2): """PDF for the distribution, given by Pham-Gia""" if w <= 0: return 0 elif w < 1: term1 = mpmath.beta(a1 + a2, b2) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2)) term2 = mpmath.power(w, a1 - 1) term3 = mpmath.hyp2f1(a1 + a2, 1 - b1, a1 + a2 + b2, w) else: term1 = mpmath.beta(a1 + a2, b1) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2)) term2 = 1 / mpmath.power(w, a2 + 1) term3 = mpmath.hyp2f1(a1 + a2, 1 - b2, a1 + a2 + b1, 1/w) return float(term1 * term2 * term3) def _pdf(self, w, a1, b1, a2, b2): return self._do_vectorized(self._pdf_one, w, a1, b1, a2, b2) def _cdf_one(self, w, a1, b1, a2, b2): """PDF for the distribution, given by Pham-Gia""" if w <= 0: return 0 elif w < 1: term1 = mpmath.beta(a1 + a2, b2) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2)) term2 = mpmath.power(w, a1) / a1 term3 = mpmath.hyper([a1, a1 + a2, 1 - b1], [a1 + 1, a1 + a2 + b2], w) return float(term1 * term2 * term3) else: term1 = mpmath.beta(a1 + a2, b1) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2)) term2 = 1 / (a2 * mpmath.power(w, a2)) term3 = mpmath.hyper([a2, a1 + a2, 1 - b2], [a2 + 1, a1 + a2 + b1], 1/w) return 1 - float(term1 * term2 * term3) def _cdf(self, w, a1, b1, a2, b2): return self._do_vectorized(self._cdf_one, w, a1, b1, a2, b2) def _munp_one(self, k, a1, b1, a2, b2): """Moments of the distribution, given by Weekend Editor""" term1 = mpmath.rf(a1, k) * mpmath.rf(a2 + b2 - k, k) term2 = mpmath.rf(a1 + b1, k) * mpmath.rf(a2 - k, k) return float(term1 / term2) def _munp(self, k, a1, b1, a2, b2): return self._do_vectorized(self._munp_one, k, a1, b1, a2, b2) beta_ratio = betarat_gen(name='beta_ratio', a=0) # a = lower bound of support class betaoddsrat_gen(stats.rv_continuous): """ Ratio of the odds of 2 independent beta-distributed variables Ratio of (X/(1-X)) / (Y/(1-Y)), where X ~ Beta(a1, b1), Y ~ Beta(a2, b2) Reference: Hora SC, Kelley GD. Bayesian inference on the odds and risk ratios. Communications in Statistics: Theory and Methods. 1983;12(6):725–38. doi: 10.1080/03610928308828491 """ # TODO: _rvs based on stats.beta def from_scipy(self, beta1, beta2): """Construct a new beta_ratio distribution from two SciPy beta distributions""" return self(*beta1.args, *beta2.args) def set_cdf_terms(self, cdf_terms): """Set the number of terms to use when calculating CDF (see _cdf)""" BETA_ODDSRATIO_CDF_TERMS[0] = cdf_terms def _do_vectorized(self, func, x, a1, b1, a2, b2): """Helper function to call the implementation over potentially multiple values""" x = np.atleast_1d(x) result = np.zeros(x.size) for i, (x_, a1_, b1_, a2_, b2_) in enumerate(zip(x, np.pad(np.atleast_1d(a1), x.size, 'edge'), np.pad(np.atleast_1d(b1), x.size, 'edge'), np.pad(np.atleast_1d(a2), x.size, 'edge'), np.pad(np.atleast_1d(b2), x.size, 'edge'))): result[i] = func(x_, a1_, b1_, a2_, b2_) return result def _pdf_one(self, w, a1, b1, a2, b2): """PDF for the distribution, given by Hora & Kelley""" if w <= 0: return 0 elif w < 1: term1 = mpmath.beta(a1 + a2, b1 + b2) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2)) term2 = mpmath.power(w, a1 - 1) term3 = mpmath.hyp2f1(a1 + b1, a1 + a2, a1 + a2 + b1 + b2, 1 - w) else: term1 = mpmath.beta(a1 + a2, b1 + b2) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2)) term2 = mpmath.power(w, -a2 - 1) term3 = mpmath.hyp2f1(a2 + b2, a1 + a2, a1 + a2 + b1 + b2, 1 - mpmath.power(w, -1)) return float(term1 * term2 * term3) def _pdf(self, w, a1, b1, a2, b2): return self._do_vectorized(self._pdf_one, w, a1, b1, a2, b2) def _cdf_one_infsum(self, w, a1, b1, a2, b2): """CDF for the distribution, by truncating infinite sum given by Hora & Kelly""" if w <= 0: return 0 elif w < 1: k = mpmath.beta(a1 + a2, b1 + b2) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2)) inf_sum = 0 for j in range(0, BETA_ODDSRATIO_CDF_TERMS[0]): kj1 = mpmath.rf(a1 + b1, j) * mpmath.rf(a1 + a2, j) kj2 = mpmath.rf(a1 + a2 + b1 + b2, j) * mpmath.factorial(j) inf_sum += kj1/kj2 * mpmath.betainc(a1, j + 1, 0, w) return k * inf_sum else: k = mpmath.beta(a1 + a2, b1 + b2) / (mpmath.beta(a1, b1) * mpmath.beta(a2, b2)) inf_sum = 0 for j in range(0, BETA_ODDSRATIO_CDF_TERMS[0]): kj1 = mpmath.rf(a2 + b2, j) * mpmath.rf(a1 + a2, j) kj2 = mpmath.rf(a1 + a2 + b1 + b2, j) * mpmath.factorial(j) inf_sum += kj1/kj2 * mpmath.betainc(a2, j + 1, 0, mpmath.power(w, -1)) return 1 - k * inf_sum def _cdf(self, w, a1, b1, a2, b2): """ CDF for the distribution If BETA_ODDSRATIO_CDF_TERMS = np.inf, compute the CDF by integrating the PDF Otherwise, compute the CDF by truncating the infinite sum given by Hora & Kelley """ w = np.atleast_1d(w) a1 = np.atleast_1d(a1) b1 = np.atleast_1d(b1) a2 = np.atleast_1d(a2) b2 = np.atleast_1d(b2) if not ((a1 == a1[0]).all() and (b1 == b1[0]).all() and (a2 == a2[0]).all() and (b2 == b2[0]).all()): raise ValueError('Cannot compute CDF from different distributions') if BETA_ODDSRATIO_CDF_TERMS[0] == np.inf: # Exact solution requested if w.size == 1: if w <= 0: return 0 # Just compute an integral if w < self.mean(a1, b1, a2, b2): # Integrate normally return integrate.quad(lambda x: self._pdf_one(x, a1[0], b1[0], a2[0], b2[0]), 0, w)[0] else: # Integrate on the distribution of 1/w (much faster) return 1 - integrate.quad(lambda x: self._pdf_one(x, a2[0], b2[0], a1[0], b1[0]), 0, 1/w)[0] else: # Multiple points requested: use ODE solver solution = integrate.solve_ivp(lambda x, _: self._pdf_one(x, a1[0], b1[0], a2[0], b2[0]), (0, w.max()), [0], t_eval=w, method='LSODA', rtol=1.5e-8, atol=1.5e-8) return solution.y else: # Truncate infinite sum return self._do_vectorized(self._cdf_one_infsum, w, a1, b1, a2, b2) def _ppf_one(self, p, a1, b1, a2, b2): """PPF for the distribution, using Newton's method""" # Default SciPy implementation uses Brent's method # This one is a bit faster if p <= 0: return 0 # Initial guess based on log-normal approximation mean = self.mean(a1, b1, a2, b2) se_log = self.std(a1, b1, a2, b2)/mean initial_guess = np.exp(np.log(mean) + stats.norm.ppf(p) * se_log) try: return optimize.newton(lambda x: self.cdf(x, a1, b1, a2, b2) - p, x0=initial_guess, fprime=lambda x: self.pdf(x, a1, b1, a2, b2)) except RuntimeError: # Failed to converge with Newton's method :( # Use Brent's method instead return super()._ppf(p, a1, b1, a2, b2) def _ppf(self, p, a1, b1, a2, b2): return self._do_vectorized(self._ppf_one, p, a1, b1, a2, b2) def _munp_one(self, k, a1, b1, a2, b2): """Moments of the distribution, given by Hora & Kelley""" term1 = mpmath.rf(a1, k) * mpmath.rf(b2, k) term2 = mpmath.rf(b1 - k, k) * mpmath.rf(a2 - k, k) return float(term1 / term2) def _munp(self, k, a1, b1, a2, b2): return self._do_vectorized(self._munp_one, k, a1, b1, a2, b2) # See beta_oddsratio.cdf BETA_ODDSRATIO_CDF_TERMS = [100] beta_oddsratio = betaoddsrat_gen(name='beta_oddsratio', a=0, shapes='a1,b1,a2,b2') # a = lower bound of support ConfidenceInterval = collections.namedtuple('ConfidenceInterval', ['lower', 'upper']) def hdi(distribution, level=0.95): """ Get the highest density interval for the distribution, e.g. for a Bayesian posterior, the highest posterior density interval (HPD/HDI) """ # For a given lower limit, we can compute the corresponding 95% interval def interval_width(lower): upper = distribution.ppf(distribution.cdf(lower) + level) return upper - lower # Find such interval which has the smallest width # Use equal-tailed interval as initial guess initial_guess = distribution.ppf((1-level)/2) optimize_result = optimize.minimize(interval_width, initial_guess) # TODO: Allow customising parameters lower_limit = optimize_result.x[0] width = optimize_result.fun upper_limit = lower_limit + width return ConfidenceInterval(lower_limit, upper_limit)