d76c831247
Nuevo dominio cpp/functions/datascience con primitivas puras CPU para post- proceso de samples Monte Carlo y diagnostico de cadenas MCMC. Diseñadas como gemelas CPU de los kernels GPU (rng pareja con gpu_rng_glsl, MH 1D/ND con mc_metropolis_hastings_gpu) para validar numericamente y para datasets pequeños donde el dispatch GPU no compensa. - rng: xoshiro256++ con uniform / normal (Box-Muller) / below (Lemire) / categorical. Determinista bit-exacto dado seed. - stats_summary: sum (Kahan), mean, var/std (Welford one-pass), min, max, quantile / percentile (R type-7). - autocorr: r(k), ACF, tau_int (Sokal) — diagnostico ACF y ESS. - rhat_ess: Gelman-Rubin clasico y split + ESS basico (multi-chain). - beta_dist: lgamma (Lanczos), beta_pdf, beta_cdf (continued fraction), beta_quantile, mean/var/std — para inferencia Beta-Binomial. - drawdown: max_dd absoluto/pct + underwater series para sesiones simuladas y backtests. - samples_to_grid_2d: binning 2D CPU para alimentar heatmap_cpp_viz / contour_cpp_viz desde samples (x[], y[]). - metropolis_hastings: MH 1D y ND con target log-pdf como std::function (no normalizada). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
89 lines
2.7 KiB
C++
89 lines
2.7 KiB
C++
#include "datascience/rhat_ess.h"
|
|
#include "datascience/autocorr.h"
|
|
|
|
#include <cmath>
|
|
#include <vector>
|
|
|
|
namespace fn::ds {
|
|
|
|
static double rhat_core(const double* chains, std::size_t m, std::size_t n) {
|
|
if (chains == nullptr || m < 2 || n < 2) return 1.0;
|
|
|
|
// Mean por cadena.
|
|
std::vector<double> means(m, 0.0);
|
|
for (std::size_t j = 0; j < m; ++j) {
|
|
const double* c = chains + j * n;
|
|
double s = 0.0;
|
|
for (std::size_t i = 0; i < n; ++i) s += c[i];
|
|
means[j] = s / static_cast<double>(n);
|
|
}
|
|
|
|
// Grand mean.
|
|
double grand = 0.0;
|
|
for (std::size_t j = 0; j < m; ++j) grand += means[j];
|
|
grand /= static_cast<double>(m);
|
|
|
|
// Between-chain variance B.
|
|
double B = 0.0;
|
|
for (std::size_t j = 0; j < m; ++j) {
|
|
double d = means[j] - grand;
|
|
B += d * d;
|
|
}
|
|
B *= static_cast<double>(n) / static_cast<double>(m - 1);
|
|
|
|
// Within-chain variance W (promedio de las varianzas muestrales).
|
|
double W = 0.0;
|
|
for (std::size_t j = 0; j < m; ++j) {
|
|
const double* c = chains + j * n;
|
|
double s = 0.0;
|
|
for (std::size_t i = 0; i < n; ++i) {
|
|
double d = c[i] - means[j];
|
|
s += d * d;
|
|
}
|
|
W += s / static_cast<double>(n - 1);
|
|
}
|
|
W /= static_cast<double>(m);
|
|
|
|
if (W <= 0.0) return 1.0;
|
|
|
|
double n_d = static_cast<double>(n);
|
|
double var_hat = ((n_d - 1.0) * W + B) / n_d;
|
|
return std::sqrt(var_hat / W);
|
|
}
|
|
|
|
double rhat(const double* chains, std::size_t m, std::size_t n) {
|
|
return rhat_core(chains, m, n);
|
|
}
|
|
|
|
double rhat_split(const double* chains, std::size_t m, std::size_t n) {
|
|
if (chains == nullptr || m < 1 || n < 4) return 1.0;
|
|
std::size_t half = n / 2;
|
|
std::size_t m2 = m * 2;
|
|
// Reorganizar a (2m, half) row-major. Copia explicita: la segunda mitad
|
|
// de cada cadena no es contigua a la primera.
|
|
std::vector<double> split(m2 * half);
|
|
for (std::size_t j = 0; j < m; ++j) {
|
|
const double* c = chains + j * n;
|
|
double* a = split.data() + (2 * j) * half;
|
|
double* b = split.data() + (2 * j + 1) * half;
|
|
for (std::size_t i = 0; i < half; ++i) a[i] = c[i];
|
|
for (std::size_t i = 0; i < half; ++i) b[i] = c[half + i];
|
|
}
|
|
return rhat_core(split.data(), m2, half);
|
|
}
|
|
|
|
double ess_basic(const double* chains, std::size_t m, std::size_t n,
|
|
std::size_t max_lag, double cutoff) {
|
|
if (chains == nullptr || m == 0 || n < 2) return 0.0;
|
|
double total = 0.0;
|
|
for (std::size_t j = 0; j < m; ++j) {
|
|
const double* c = chains + j * n;
|
|
double tau = autocorr_tau(c, n, max_lag, cutoff);
|
|
if (tau < 1.0) tau = 1.0;
|
|
total += static_cast<double>(n) / tau;
|
|
}
|
|
return total;
|
|
}
|
|
|
|
} // namespace fn::ds
|