Files
fn_registry/cpp/functions/datascience/beta_dist.cpp
egutierrez d115d8e830 feat(cpp/datascience): CPU stats + MCMC primitives
Nuevo dominio cpp/functions/datascience con primitivas puras CPU para post-
proceso de samples Monte Carlo y diagnostico de cadenas MCMC. Diseñadas como
gemelas CPU de los kernels GPU (rng pareja con gpu_rng_glsl, MH 1D/ND con
mc_metropolis_hastings_gpu) para validar numericamente y para datasets
pequeños donde el dispatch GPU no compensa.

- rng: xoshiro256++ con uniform / normal (Box-Muller) / below (Lemire) /
  categorical. Determinista bit-exacto dado seed.
- stats_summary: sum (Kahan), mean, var/std (Welford one-pass), min, max,
  quantile / percentile (R type-7).
- autocorr: r(k), ACF, tau_int (Sokal) — diagnostico ACF y ESS.
- rhat_ess: Gelman-Rubin clasico y split + ESS basico (multi-chain).
- beta_dist: lgamma (Lanczos), beta_pdf, beta_cdf (continued fraction),
  beta_quantile, mean/var/std — para inferencia Beta-Binomial.
- drawdown: max_dd absoluto/pct + underwater series para sesiones
  simuladas y backtests.
- samples_to_grid_2d: binning 2D CPU para alimentar heatmap_cpp_viz /
  contour_cpp_viz desde samples (x[], y[]).
- metropolis_hastings: MH 1D y ND con target log-pdf como std::function
  (no normalizada).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 11:52:26 +02:00

134 lines
3.9 KiB
C++

#include "datascience/beta_dist.h"
#include <cmath>
namespace fn::ds {
// Lanczos approx with g=7, n=9 — precision ~1e-15 sobre x > 0.5. Para
// x <= 0.5 usa la reflexion lgamma(x) = log(pi/sin(pi*x)) - lgamma(1-x).
double lgamma_lanczos(double x) {
static const double p[] = {
676.5203681218851,
-1259.1392167224028,
771.32342877765313,
-176.61502916214059,
12.507343278686905,
-0.13857109526572012,
9.9843695780195716e-6,
1.5056327351493116e-7
};
if (x < 0.5) {
return std::log(3.14159265358979323846 /
std::sin(3.14159265358979323846 * x))
- lgamma_lanczos(1.0 - x);
}
x -= 1.0;
double a = 0.99999999999980993;
double t = x + 7.5;
for (int i = 0; i < 8; ++i) {
a += p[i] / (x + static_cast<double>(i + 1));
}
return 0.5 * std::log(6.28318530717958647692) +
(x + 0.5) * std::log(t) - t + std::log(a);
}
double log_beta(double a, double b) {
return lgamma_lanczos(a) + lgamma_lanczos(b) - lgamma_lanczos(a + b);
}
double beta_pdf(double x, double a, double b) {
if (a <= 0.0 || b <= 0.0) return 0.0;
if (x <= 0.0 || x >= 1.0) return 0.0;
double lp = (a - 1.0) * std::log(x) +
(b - 1.0) * std::log(1.0 - x) -
log_beta(a, b);
return std::exp(lp);
}
// Continued fraction expansion de I_x(a, b). Numerical Recipes 6.4.
static double betacf(double x, double a, double b) {
constexpr int MAXIT = 200;
constexpr double EPS = 3.0e-15;
constexpr double FPMIN = 1.0e-300;
double qab = a + b;
double qap = a + 1.0;
double qam = a - 1.0;
double c = 1.0;
double d = 1.0 - qab * x / qap;
if (std::fabs(d) < FPMIN) d = FPMIN;
d = 1.0 / d;
double h = d;
for (int m = 1; m <= MAXIT; ++m) {
int m2 = 2 * m;
double aa = static_cast<double>(m) * (b - m) * x /
((qam + m2) * (a + m2));
d = 1.0 + aa * d;
if (std::fabs(d) < FPMIN) d = FPMIN;
c = 1.0 + aa / c;
if (std::fabs(c) < FPMIN) c = FPMIN;
d = 1.0 / d;
h *= d * c;
aa = -(a + m) * (qab + m) * x /
((a + m2) * (qap + m2));
d = 1.0 + aa * d;
if (std::fabs(d) < FPMIN) d = FPMIN;
c = 1.0 + aa / c;
if (std::fabs(c) < FPMIN) c = FPMIN;
d = 1.0 / d;
double del = d * c;
h *= del;
if (std::fabs(del - 1.0) < EPS) break;
}
return h;
}
double beta_cdf(double x, double a, double b) {
if (a <= 0.0 || b <= 0.0) return 0.0;
if (x <= 0.0) return 0.0;
if (x >= 1.0) return 1.0;
double bt = std::exp(lgamma_lanczos(a + b) -
lgamma_lanczos(a) -
lgamma_lanczos(b) +
a * std::log(x) +
b * std::log(1.0 - x));
if (x < (a + 1.0) / (a + b + 2.0)) {
return bt * betacf(x, a, b) / a;
}
return 1.0 - bt * betacf(1.0 - x, b, a) / b;
}
double beta_quantile(double p, double a, double b) {
if (a <= 0.0 || b <= 0.0) return 0.0;
if (p <= 0.0) return 0.0;
if (p >= 1.0) return 1.0;
// Bisection robusto. Newton tendria mejor convergencia pero requiere
// beta_pdf(x) que cerca de 0 o 1 puede oscilar; bisection en [0,1] es
// siempre estable. ~50 iter para 1e-6.
double lo = 0.0, hi = 1.0;
for (int i = 0; i < 60; ++i) {
double mid = 0.5 * (lo + hi);
double cdf = beta_cdf(mid, a, b);
if (cdf < p) lo = mid;
else hi = mid;
if (hi - lo < 1e-7) break;
}
return 0.5 * (lo + hi);
}
double beta_mean(double a, double b) {
if (a + b <= 0.0) return 0.5;
return a / (a + b);
}
double beta_variance(double a, double b) {
double s = a + b;
if (s <= 0.0 || (s + 1.0) <= 0.0) return 0.0;
return (a * b) / (s * s * (s + 1.0));
}
double beta_std(double a, double b) {
return std::sqrt(beta_variance(a, b));
}
} // namespace fn::ds