diff --git a/cpp/functions/datascience/autocorr.cpp b/cpp/functions/datascience/autocorr.cpp new file mode 100644 index 00000000..04a28803 --- /dev/null +++ b/cpp/functions/datascience/autocorr.cpp @@ -0,0 +1,88 @@ +#include "datascience/autocorr.h" + +#include + +namespace fn::ds { + +static double mean_inline(const double* x, std::size_t n) { + if (n == 0) return 0.0; + double s = 0.0; + for (std::size_t i = 0; i < n; ++i) s += x[i]; + return s / static_cast(n); +} + +static double var_inline(const double* x, std::size_t n, double mu) { + if (n == 0) return 0.0; + double s = 0.0; + for (std::size_t i = 0; i < n; ++i) { + double d = x[i] - mu; + s += d * d; + } + return s / static_cast(n); +} + +double autocorr_lag(const double* x, std::size_t n, std::size_t k) { + if (x == nullptr || n <= 1 || k >= n) return 0.0; + double mu = mean_inline(x, n); + double var = var_inline(x, n, mu); + if (var <= 0.0) return (k == 0 ? 0.0 : 0.0); + + double cov = 0.0; + std::size_t m = n - k; + for (std::size_t i = 0; i < m; ++i) { + cov += (x[i] - mu) * (x[i + k] - mu); + } + cov /= static_cast(m); + return cov / var; +} + +void autocorr_acf(const double* x, std::size_t n, + std::size_t max_lag, double* out) { + if (out == nullptr || max_lag == 0) return; + if (x == nullptr || n <= 1) { + for (std::size_t k = 0; k < max_lag; ++k) out[k] = 0.0; + return; + } + double mu = mean_inline(x, n); + double var = var_inline(x, n, mu); + if (var <= 0.0) { + for (std::size_t k = 0; k < max_lag; ++k) out[k] = 0.0; + return; + } + for (std::size_t k = 0; k < max_lag; ++k) { + if (k >= n) { out[k] = 0.0; continue; } + double cov = 0.0; + std::size_t m = n - k; + for (std::size_t i = 0; i < m; ++i) { + cov += (x[i] - mu) * (x[i + k] - mu); + } + cov /= static_cast(m); + out[k] = cov / var; + } +} + +double autocorr_tau(const double* x, std::size_t n, + std::size_t max_lag, double cutoff) { + if (x == nullptr || n <= 1) return 1.0; + if (max_lag == 0) max_lag = 1; + double mu = mean_inline(x, n); + double var = var_inline(x, n, mu); + if (var <= 0.0) return 1.0; + + double sum = 0.0; + std::size_t kmax = (max_lag < n ? max_lag : n - 1); + for (std::size_t k = 1; k < kmax; ++k) { + double cov = 0.0; + std::size_t m = n - k; + for (std::size_t i = 0; i < m; ++i) { + cov += (x[i] - mu) * (x[i + k] - mu); + } + cov /= static_cast(m); + double r = cov / var; + if (std::fabs(r) < cutoff) break; + sum += r; + } + return 1.0 + 2.0 * sum; +} + +} // namespace fn::ds diff --git a/cpp/functions/datascience/autocorr.h b/cpp/functions/datascience/autocorr.h new file mode 100644 index 00000000..030998d6 --- /dev/null +++ b/cpp/functions/datascience/autocorr.h @@ -0,0 +1,30 @@ +#pragma once + +#include + +namespace fn::ds { + +// Autocorrelacion lag k de la serie x[0..n). r(k) = cov(x_t, x_{t+k}) / var(x). +// Definicion clasica con normalizacion sobre la varianza global. Devuelve +// 1.0 para k=0 si la serie tiene varianza > 0; si var = 0 devuelve 0 para +// todos los k. +// +// Si k >= n o n <= 1 devuelve 0. +double autocorr_lag(const double* x, std::size_t n, std::size_t k); + +// Llena out[max_lag] con r(0), r(1), ..., r(max_lag-1). Util para ACF plots. +void autocorr_acf(const double* x, std::size_t n, + std::size_t max_lag, double* out); + +// Integrated Autocorrelation Time (tau_int) — definicion de Sokal con +// truncado automatico cuando r(k) cae bajo el umbral. tau_int = 1 + 2 * sum +// de r(k) hasta k_cutoff. Util para estimar Effective Sample Size: +// ESS = n / tau_int. +// +// max_lag actua como cota dura. Si la ACF nunca cae bajo cutoff dentro de +// max_lag, devuelve 1 + 2*sum hasta max_lag-1 (estimacion conservadora). +double autocorr_tau(const double* x, std::size_t n, + std::size_t max_lag = 200, + double cutoff = 0.05); + +} // namespace fn::ds diff --git a/cpp/functions/datascience/autocorr.md b/cpp/functions/datascience/autocorr.md new file mode 100644 index 00000000..cfca3ab7 --- /dev/null +++ b/cpp/functions/datascience/autocorr.md @@ -0,0 +1,74 @@ +--- +name: autocorr +kind: function +lang: cpp +domain: datascience +version: "1.0.0" +purity: pure +signature: "double autocorr_lag(const double* x, size_t n, size_t k); void autocorr_acf(const double* x, size_t n, size_t max_lag, double* out); double autocorr_tau(const double* x, size_t n, size_t max_lag, double cutoff)" +description: "Autocorrelacion de series temporales: r(k) por lag, ACF completa hasta max_lag, y tiempo de autocorrelacion integrado (tau_int de Sokal) para Effective Sample Size en MCMC." +tags: [autocorrelation, acf, mcmc, ess, time_series, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [cstddef, cmath] +tested: false +tests: [] +test_file_path: "" +file_path: "cpp/functions/datascience/autocorr.cpp" +params: + - name: x + desc: "Serie temporal (cadena MCMC, balance de sesion, log-returns...)." + - name: n + desc: "Longitud de la serie." + - name: k + desc: "Lag para autocorr_lag. r(0) = 1 si var > 0." + - name: max_lag + desc: "Lag maximo. ACF emite max_lag valores; tau_int trunca aqui si nunca baja del cutoff." + - name: cutoff + desc: "(tau) Umbral |r(k)| bajo el cual se trunca la suma. Default 0.05 (recomendacion estandar)." + - name: out + desc: "(acf) buffer destino double[max_lag]." +output: "Escalar (lag, tau) o array (acf). Definicion clasica r(k) = cov(x_t, x_{t+k}) / var(x). Si var=0 devuelve 0/1 segun el caso." +--- + +# autocorr + +Autocorrelacion para diagnostico de cadenas MCMC. Las 4 calculadoras MCMC del set lo usan para detectar cuando las muestras estan demasiado correladas (= la cadena no esta explorando bien). + +## Patron de uso (ACF plot) + +```cpp +constexpr int max_lag = 40; +std::vector acf(max_lag); +fn::ds::autocorr_acf(chain.data(), chain.size(), max_lag, acf.data()); + +// Pasar a line_plot_cpp_viz, eje X = 0..max_lag-1 +fn::viz::line_plot(acf.data(), max_lag, /*...*/); +``` + +## Effective Sample Size + +```cpp +double tau = fn::ds::autocorr_tau(chain.data(), chain.size()); +double ess = static_cast(chain.size()) / tau; +// Cadena de 10000 con tau=20 -> ESS=500. Bayesian rule of thumb: ESS > 100 +// para inferencia decente; > 1000 para CIs ajustados. +``` + +## Definicion + +`r(k) = (1/(n-k)) * sum_{i=0}^{n-k-1} (x_i - mu)(x_{i+k} - mu) / var(x)` donde `var(x) = (1/n) * sum (x_i - mu)^2` (poblacional). Esta es la convencion mas comun (numpy.correlate scaled, statsmodels.acf con `unbiased=False, fft=False`). + +`tau_int = 1 + 2 * sum_{k=1}^{kmax} r(k)` con kmax = primer k tal que |r(k)| < cutoff. Es la formula de Sokal usada en MCMC diagnostics (similar a la de la libreria emcee). + +## Performance + +`autocorr_acf` es O(n * max_lag). Para cadenas de 10^5-10^6 con max_lag=40 son ~10ms — suficiente para refresh interactivo. Para max_lag mayores considerar FFT-based ACF (no incluida). + +## Notas + +- Usa la formula time-domain (sin FFT). Hasta n=10^7 con max_lag=100 es OK; mas alla, FFT. +- No corrige el sesgo small-sample. Para cadenas cortas (<200 samples) el estimador es biased — pasar a versiones unbiased si la app lo requiere. diff --git a/cpp/functions/datascience/beta_dist.cpp b/cpp/functions/datascience/beta_dist.cpp new file mode 100644 index 00000000..ca050e40 --- /dev/null +++ b/cpp/functions/datascience/beta_dist.cpp @@ -0,0 +1,133 @@ +#include "datascience/beta_dist.h" + +#include + +namespace fn::ds { + +// Lanczos approx with g=7, n=9 — precision ~1e-15 sobre x > 0.5. Para +// x <= 0.5 usa la reflexion lgamma(x) = log(pi/sin(pi*x)) - lgamma(1-x). +double lgamma_lanczos(double x) { + static const double p[] = { + 676.5203681218851, + -1259.1392167224028, + 771.32342877765313, + -176.61502916214059, + 12.507343278686905, + -0.13857109526572012, + 9.9843695780195716e-6, + 1.5056327351493116e-7 + }; + if (x < 0.5) { + return std::log(3.14159265358979323846 / + std::sin(3.14159265358979323846 * x)) + - lgamma_lanczos(1.0 - x); + } + x -= 1.0; + double a = 0.99999999999980993; + double t = x + 7.5; + for (int i = 0; i < 8; ++i) { + a += p[i] / (x + static_cast(i + 1)); + } + return 0.5 * std::log(6.28318530717958647692) + + (x + 0.5) * std::log(t) - t + std::log(a); +} + +double log_beta(double a, double b) { + return lgamma_lanczos(a) + lgamma_lanczos(b) - lgamma_lanczos(a + b); +} + +double beta_pdf(double x, double a, double b) { + if (a <= 0.0 || b <= 0.0) return 0.0; + if (x <= 0.0 || x >= 1.0) return 0.0; + double lp = (a - 1.0) * std::log(x) + + (b - 1.0) * std::log(1.0 - x) - + log_beta(a, b); + return std::exp(lp); +} + +// Continued fraction expansion de I_x(a, b). Numerical Recipes 6.4. +static double betacf(double x, double a, double b) { + constexpr int MAXIT = 200; + constexpr double EPS = 3.0e-15; + constexpr double FPMIN = 1.0e-300; + double qab = a + b; + double qap = a + 1.0; + double qam = a - 1.0; + double c = 1.0; + double d = 1.0 - qab * x / qap; + if (std::fabs(d) < FPMIN) d = FPMIN; + d = 1.0 / d; + double h = d; + for (int m = 1; m <= MAXIT; ++m) { + int m2 = 2 * m; + double aa = static_cast(m) * (b - m) * x / + ((qam + m2) * (a + m2)); + d = 1.0 + aa * d; + if (std::fabs(d) < FPMIN) d = FPMIN; + c = 1.0 + aa / c; + if (std::fabs(c) < FPMIN) c = FPMIN; + d = 1.0 / d; + h *= d * c; + aa = -(a + m) * (qab + m) * x / + ((a + m2) * (qap + m2)); + d = 1.0 + aa * d; + if (std::fabs(d) < FPMIN) d = FPMIN; + c = 1.0 + aa / c; + if (std::fabs(c) < FPMIN) c = FPMIN; + d = 1.0 / d; + double del = d * c; + h *= del; + if (std::fabs(del - 1.0) < EPS) break; + } + return h; +} + +double beta_cdf(double x, double a, double b) { + if (a <= 0.0 || b <= 0.0) return 0.0; + if (x <= 0.0) return 0.0; + if (x >= 1.0) return 1.0; + double bt = std::exp(lgamma_lanczos(a + b) - + lgamma_lanczos(a) - + lgamma_lanczos(b) + + a * std::log(x) + + b * std::log(1.0 - x)); + if (x < (a + 1.0) / (a + b + 2.0)) { + return bt * betacf(x, a, b) / a; + } + return 1.0 - bt * betacf(1.0 - x, b, a) / b; +} + +double beta_quantile(double p, double a, double b) { + if (a <= 0.0 || b <= 0.0) return 0.0; + if (p <= 0.0) return 0.0; + if (p >= 1.0) return 1.0; + // Bisection robusto. Newton tendria mejor convergencia pero requiere + // beta_pdf(x) que cerca de 0 o 1 puede oscilar; bisection en [0,1] es + // siempre estable. ~50 iter para 1e-6. + double lo = 0.0, hi = 1.0; + for (int i = 0; i < 60; ++i) { + double mid = 0.5 * (lo + hi); + double cdf = beta_cdf(mid, a, b); + if (cdf < p) lo = mid; + else hi = mid; + if (hi - lo < 1e-7) break; + } + return 0.5 * (lo + hi); +} + +double beta_mean(double a, double b) { + if (a + b <= 0.0) return 0.5; + return a / (a + b); +} + +double beta_variance(double a, double b) { + double s = a + b; + if (s <= 0.0 || (s + 1.0) <= 0.0) return 0.0; + return (a * b) / (s * s * (s + 1.0)); +} + +double beta_std(double a, double b) { + return std::sqrt(beta_variance(a, b)); +} + +} // namespace fn::ds diff --git a/cpp/functions/datascience/beta_dist.h b/cpp/functions/datascience/beta_dist.h new file mode 100644 index 00000000..e270c3ef --- /dev/null +++ b/cpp/functions/datascience/beta_dist.h @@ -0,0 +1,33 @@ +#pragma once + +namespace fn::ds { + +// Distribucion Beta(a, b) y helpers numericos relacionados. Pure. +// +// log-Gamma via aproximacion de Lanczos (precision ~1e-15). +double lgamma_lanczos(double x); + +// log B(a, b) = lgamma(a) + lgamma(b) - lgamma(a+b). +double log_beta(double a, double b); + +// Beta PDF en x, parametros a, b. Soporta a > 0, b > 0; fuera del soporte +// devuelve 0. Computado en log-space para estabilidad en a/b extremos. +double beta_pdf(double x, double a, double b); + +// Beta CDF (regularized incomplete beta function I_x(a, b)). Continued +// fraction algorithm, precision ~1e-12. +double beta_cdf(double x, double a, double b); + +// Beta quantile (inverse CDF) por busqueda Newton + bisection fallback. +// p en [0, 1]. Tolerancia 1e-6 sobre x. +double beta_quantile(double p, double a, double b); + +// Mean = a / (a + b). +double beta_mean(double a, double b); + +// Variance = (a*b) / ((a+b)^2 * (a+b+1)). +double beta_variance(double a, double b); + +double beta_std(double a, double b); + +} // namespace fn::ds diff --git a/cpp/functions/datascience/beta_dist.md b/cpp/functions/datascience/beta_dist.md new file mode 100644 index 00000000..285f869b --- /dev/null +++ b/cpp/functions/datascience/beta_dist.md @@ -0,0 +1,75 @@ +--- +name: beta_dist +kind: function +lang: cpp +domain: datascience +version: "1.0.0" +purity: pure +signature: "double lgamma_lanczos(double x); double log_beta(double a, double b); double beta_pdf(double x, double a, double b); double beta_cdf(double x, double a, double b); double beta_quantile(double p, double a, double b); double beta_mean(double a, double b); double beta_variance(double a, double b); double beta_std(double a, double b)" +description: "Distribucion Beta(a,b) completa: log-Gamma (Lanczos), log B(a,b), pdf, cdf (incomplete beta via continued fraction), quantile (bisection), mean/var/std. Para inferencia Bayesiana Beta-Binomial (mcmc-bayes / mcmc-full)." +tags: [beta, distribution, bayesian, lgamma, incomplete_beta, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [cmath] +tested: false +tests: [] +test_file_path: "" +file_path: "cpp/functions/datascience/beta_dist.cpp" +params: + - name: x + desc: "Soporte de la distribucion en [0, 1]. Fuera devuelve 0 (pdf) o se clamp (cdf)." + - name: a + desc: "Parametro alpha (>0)." + - name: b + desc: "Parametro beta (>0)." + - name: p + desc: "(quantile) Probabilidad en [0, 1]." +output: "Escalares double. Precision: lgamma ~1e-15, cdf ~1e-12, quantile ~1e-7. log_beta y beta_pdf computados en log-space para evitar overflow con a/b grandes." +--- + +# beta_dist + +Pack completo para inferencia Beta-Binomial. Soporta los 3 calculadores Bayesianos del set (mcmc-bayes, mcmc-full, y el targetPDF de mcmc-lab si se cambia a Beta). + +## Algoritmos + +| Funcion | Algoritmo | Precision | +|---|---|---| +| `lgamma_lanczos` | Lanczos g=7, n=9 + reflection x<0.5 | ~1e-15 | +| `beta_pdf` | log-space exp((a-1)*log(x) + (b-1)*log(1-x) - log B) | full fp64 | +| `beta_cdf` | I_x(a,b) via continued fraction (NR 6.4) | ~1e-12 | +| `beta_quantile` | bisection (60 iter, tol 1e-7) | ~1e-7 | +| `beta_mean/var/std` | formulas cerradas | exacto modulo fp | + +## Uso (Bayesian inference) + +```cpp +// Posterior Beta(alpha + k, beta + n - k) tras k exitos en n trials con +// prior Beta(alpha, beta). +double a_post = alpha + k; +double b_post = beta + (n - k); + +double map = (a_post - 1.0) / (a_post + b_post - 2.0); // moda +double mean = fn::ds::beta_mean(a_post, b_post); +double std = fn::ds::beta_std (a_post, b_post); + +// CI 95% via quantiles +double lo = fn::ds::beta_quantile(0.025, a_post, b_post); +double hi = fn::ds::beta_quantile(0.975, a_post, b_post); + +// Densidad para plotear +for (int i = 0; i <= 100; ++i) { + double x = i / 100.0; + double y = fn::ds::beta_pdf(x, a_post, b_post); + // ... feed a line_plot +} +``` + +## Notas + +- La continued fraction converge en <50 iteraciones para `a, b` razonables (<1000); para parametros muy grandes (>1e4) considerar regularized incomplete beta de la libreria estandar — pero `std::lgamma` no esta garantizado portable bit-exact entre toolchains, por eso esta implementacion es self-contained. +- `beta_quantile` es bisection puro: ~60 iter siempre, robusto pero no maximalmente eficiente. Newton encadenado a `beta_cdf` y `beta_pdf` daria 5-10 iter pero requiere care con los bordes. +- Para `a < 1` o `b < 1` la PDF tiene singularidades en los bordes — la implementacion devuelve 0 estrictamente fuera de (0,1) para evitar inf. diff --git a/cpp/functions/datascience/drawdown.cpp b/cpp/functions/datascience/drawdown.cpp new file mode 100644 index 00000000..2a476bc2 --- /dev/null +++ b/cpp/functions/datascience/drawdown.cpp @@ -0,0 +1,48 @@ +#include "datascience/drawdown.h" + +namespace fn::ds { + +DrawdownResult drawdown_max(const double* equity, std::size_t n) { + DrawdownResult r{}; + if (equity == nullptr || n == 0) return r; + + double peak = equity[0]; + std::size_t peak_i = 0; + double max_dd = 0.0; + std::size_t best_peak_i = 0; + std::size_t best_trough_i = 0; + + for (std::size_t i = 0; i < n; ++i) { + double v = equity[i]; + if (v > peak) { + peak = v; + peak_i = i; + } else { + double dd = peak - v; + if (dd > max_dd) { + max_dd = dd; + best_peak_i = peak_i; + best_trough_i = i; + } + } + } + r.max_dd = max_dd; + r.peak_idx = best_peak_i; + r.trough_idx = best_trough_i; + double peak_value = equity[best_peak_i]; + if (peak_value > 0.0) { + r.max_dd_pct = max_dd / peak_value; + } + return r; +} + +void drawdown_series(const double* equity, std::size_t n, double* out) { + if (equity == nullptr || out == nullptr || n == 0) return; + double peak = equity[0]; + for (std::size_t i = 0; i < n; ++i) { + if (equity[i] > peak) peak = equity[i]; + out[i] = peak - equity[i]; + } +} + +} // namespace fn::ds diff --git a/cpp/functions/datascience/drawdown.h b/cpp/functions/datascience/drawdown.h new file mode 100644 index 00000000..77befc1b --- /dev/null +++ b/cpp/functions/datascience/drawdown.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace fn::ds { + +struct DrawdownResult { + double max_dd = 0.0; // peak - trough + double max_dd_pct = 0.0; // (peak - trough) / peak (si peak > 0) + std::size_t peak_idx = 0; // indice donde ocurre el peak antes del trough + std::size_t trough_idx = 0; // indice del trough que crea el max_dd +}; + +// Calcula el max drawdown sobre una serie de balance/equity. Recorrido +// O(n). Equity[i] puede ser negativo (no valid para max_dd_pct, que se +// devuelve 0 si peak <= 0). +DrawdownResult drawdown_max(const double* equity, std::size_t n); + +// Llena out[n] con el drawdown actual en cada punto: out[i] = peak_so_far - equity[i]. +// Util para plotear underwater chart. +void drawdown_series(const double* equity, std::size_t n, double* out); + +} // namespace fn::ds diff --git a/cpp/functions/datascience/drawdown.md b/cpp/functions/datascience/drawdown.md new file mode 100644 index 00000000..9f43b3d0 --- /dev/null +++ b/cpp/functions/datascience/drawdown.md @@ -0,0 +1,57 @@ +--- +name: drawdown +kind: function +lang: cpp +domain: datascience +version: "1.0.0" +purity: pure +signature: "DrawdownResult drawdown_max(const double* equity, size_t n); void drawdown_series(const double* equity, size_t n, double* out)" +description: "Max drawdown sobre serie de equity/balance: peak-to-trough absoluto y porcentual + indices del peak y trough relevantes. drawdown_series llena un array con el underwater chart (peak_so_far - equity[i] en cada punto)." +tags: [drawdown, equity, finance, underwater, montecarlo, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [cstddef] +tested: false +tests: [] +test_file_path: "" +file_path: "cpp/functions/datascience/drawdown.cpp" +params: + - name: equity + desc: "Serie de balance/equity (ej. balance de la sesion en cada spin de vr_tiered_lab, o NAV diario)." + - name: n + desc: "Longitud de la serie." + - name: out + desc: "(series) buffer destino double[n] con el drawdown corriente en cada punto." +output: "DrawdownResult con max_dd, max_dd_pct (relativo al peak), peak_idx y trough_idx. drawdown_series llena out con peak_so_far - equity[i]." +--- + +# drawdown + +Metrica clave de los simuladores de sesiones (vr_tiered_lab) y de cualquier backtest. Se calcula en una pasada O(n). + +## Patron + +```cpp +std::vector balance(N); +// ... sesion simulada llena balance[] ... + +auto dd = fn::ds::drawdown_max(balance.data(), N); +// dd.max_dd = mayor caida absoluta +// dd.max_dd_pct = mayor caida porcentual (relativa al peak previo) +// dd.peak_idx = step donde estaba el peak +// dd.trough_idx = step donde la caida es maxima + +// Underwater chart +std::vector uw(N); +fn::ds::drawdown_series(balance.data(), N, uw.data()); +fn::viz::line_plot(uw.data(), N, /* fill negativo */); +``` + +## Notas + +- `max_dd_pct` se calcula relativo al peak. Si el peak es <=0 (la serie nunca cruzo cero), devuelve 0 — un drawdown porcentual sobre balance no positivo no esta bien definido. +- Para distribuciones de drawdown sobre N sesiones simuladas Monte Carlo: bucle externo llamando `drawdown_max` por sesion, y luego `stats_quantile` sobre el array de max_dd para CIs. Coste despreciable. +- No es trade analytics completo — no calcula recovery time, time underwater, etc. Se puede añadir si los calculadores lo piden. diff --git a/cpp/functions/datascience/metropolis_hastings.cpp b/cpp/functions/datascience/metropolis_hastings.cpp new file mode 100644 index 00000000..308d85d6 --- /dev/null +++ b/cpp/functions/datascience/metropolis_hastings.cpp @@ -0,0 +1,96 @@ +#include "datascience/metropolis_hastings.h" + +#include +#include + +namespace fn::ds { + +MHResult mh_run_1d(const std::function& target_log_pdf, + double x0, + double proposal_sigma, + std::size_t n_samples, + double* out_chain, + Rng& r) { + MHResult res{}; + if (n_samples == 0 || out_chain == nullptr) return res; + out_chain[0] = x0; + double curr = x0; + double log_p_curr = target_log_pdf(curr); + std::size_t accepted = 0; + + for (std::size_t i = 1; i < n_samples; ++i) { + double prop = curr + proposal_sigma * rng_normal(r); + double log_p_prop = target_log_pdf(prop); + double log_alpha = log_p_prop - log_p_curr; + bool accept = false; + if (log_alpha >= 0.0) { + accept = true; + } else { + double u = rng_uniform(r); + if (u < std::exp(log_alpha)) accept = true; + } + if (accept) { + curr = prop; + log_p_curr = log_p_prop; + ++accepted; + } + out_chain[i] = curr; + } + res.n_samples = n_samples; + res.n_accepted = accepted; + res.accept_rate = (n_samples > 1) + ? static_cast(accepted) / static_cast(n_samples - 1) + : 0.0; + return res; +} + +MHResult mh_run_nd(const std::function& target_log_pdf, + const double* x0, + const double* proposal_sigma, + int d, + std::size_t n_samples, + double* out_chain, + Rng& r) { + MHResult res{}; + if (d <= 0 || n_samples == 0 || out_chain == nullptr || + x0 == nullptr || proposal_sigma == nullptr) return res; + + std::vector curr(d); + std::vector prop(d); + for (int k = 0; k < d; ++k) curr[k] = x0[k]; + for (int k = 0; k < d; ++k) out_chain[k] = curr[k]; + + double log_p_curr = target_log_pdf(curr.data()); + std::size_t accepted = 0; + + for (std::size_t i = 1; i < n_samples; ++i) { + for (int k = 0; k < d; ++k) { + prop[k] = curr[k] + proposal_sigma[k] * rng_normal(r); + } + double log_p_prop = target_log_pdf(prop.data()); + double log_alpha = log_p_prop - log_p_curr; + bool accept = false; + if (log_alpha >= 0.0) { + accept = true; + } else { + double u = rng_uniform(r); + if (u < std::exp(log_alpha)) accept = true; + } + if (accept) { + for (int k = 0; k < d; ++k) curr[k] = prop[k]; + log_p_curr = log_p_prop; + ++accepted; + } + for (int k = 0; k < d; ++k) { + out_chain[i * static_cast(d) + k] = curr[k]; + } + } + res.n_samples = n_samples; + res.n_accepted = accepted; + res.accept_rate = (n_samples > 1) + ? static_cast(accepted) / static_cast(n_samples - 1) + : 0.0; + return res; +} + +} // namespace fn::ds diff --git a/cpp/functions/datascience/metropolis_hastings.h b/cpp/functions/datascience/metropolis_hastings.h new file mode 100644 index 00000000..1af9c4b9 --- /dev/null +++ b/cpp/functions/datascience/metropolis_hastings.h @@ -0,0 +1,44 @@ +#pragma once + +#include "datascience/rng.h" +#include +#include + +namespace fn::ds { + +struct MHResult { + std::size_t n_samples = 0; + std::size_t n_accepted = 0; + double accept_rate = 0.0; // n_accepted / max(n_samples-1, 1) +}; + +// Metropolis-Hastings 1D con proposal Gaussian symmetric. +// +// target_log_pdf(x): log-densidad target (no necesita normalizarse). +// x0: punto inicial. +// proposal_sigma: stddev del proposal (Gaussian centrada en current). +// n_samples: cuantos samples generar (incluido x0). +// out_chain: buffer destino double[n_samples]. +// r: estado RNG mutado in-place. +// +// El sample [0] es x0; [i] es la cadena tras i steps. +MHResult mh_run_1d(const std::function& target_log_pdf, + double x0, + double proposal_sigma, + std::size_t n_samples, + double* out_chain, + Rng& r); + +// Metropolis-Hastings d-dimensional con proposal Gaussian symmetric (cada +// dim independiente con stddev proposal_sigma[d]). x0 y out_chain en +// layout row-major: out_chain[i*d + k] = sample i, dim k. proposal_sigma +// y x0 son arrays de tamano d. +MHResult mh_run_nd(const std::function& target_log_pdf, + const double* x0, + const double* proposal_sigma, + int d, + std::size_t n_samples, + double* out_chain, + Rng& r); + +} // namespace fn::ds diff --git a/cpp/functions/datascience/metropolis_hastings.md b/cpp/functions/datascience/metropolis_hastings.md new file mode 100644 index 00000000..08de3f38 --- /dev/null +++ b/cpp/functions/datascience/metropolis_hastings.md @@ -0,0 +1,92 @@ +--- +name: metropolis_hastings +kind: function +lang: cpp +domain: datascience +version: "1.0.0" +purity: pure +signature: "MHResult mh_run_1d(const std::function& log_pdf, double x0, double sigma, size_t n, double* out, Rng&); MHResult mh_run_nd(const std::function& log_pdf, const double* x0, const double* sigma, int d, size_t n, double* out, Rng&)" +description: "Metropolis-Hastings 1D y d-dimensional con proposal Gaussian symmetric. Target log-pdf inyectable via std::function (no necesita normalizarse). Devuelve cadena en out[] y accept rate. Pareja CPU del mc_metropolis_hastings_gpu." +tags: [mcmc, metropolis, hastings, sampling, bayesian, datascience] +uses_functions: ["rng_cpp_datascience"] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [cstddef, functional, cmath, vector] +tested: false +tests: [] +test_file_path: "" +file_path: "cpp/functions/datascience/metropolis_hastings.cpp" +params: + - name: target_log_pdf + desc: "Functor con la log-densidad target (no normalizada). 1D recibe double; ND recibe const double* de tamano d." + - name: x0 + desc: "Punto inicial (escalar 1D, array d-dim ND)." + - name: proposal_sigma + desc: "Stddev del proposal Gaussian. 1D escalar; ND array de d stddevs (uno por dimension)." + - name: d + desc: "(ND) dimension del espacio." + - name: n_samples + desc: "Total de samples a generar (incluyendo x0)." + - name: out_chain + desc: "Buffer destino. 1D: double[n]. ND: double[n*d] row-major (sample i, dim k = out[i*d+k])." + - name: r + desc: "Rng (de rng_cpp_datascience). State mutado durante el sampling." +output: "MHResult con n_samples, n_accepted y accept_rate. Tipico target: 0.20-0.40 accept rate (ajustar proposal_sigma)." +--- + +# metropolis_hastings + +Random-walk Metropolis estandar. Replica el sampler que usan los 4 calculadores MCMC del set, en formato componible. + +## Patron 1D (mcmc-bayes / mcmc-full Beta posterior) + +```cpp +double k = 7.0, n_obs = 10.0; // observados +auto log_post = [k, n_obs](double theta) -> double { + if (theta <= 0.0 || theta >= 1.0) return -1e300; + // Beta(2, 2) prior + Binomial likelihood + return (k + 1.0) * std::log(theta) + + (n_obs - k + 1.0) * std::log(1.0 - theta); +}; + +fn::ds::Rng r; +fn::ds::rng_seed(r, 42); + +std::vector chain(10000); +auto res = fn::ds::mh_run_1d(log_post, + /*x0=*/0.5, + /*sigma=*/0.1, + chain.size(), + chain.data(), + r); +// Burn-in: descartar primeros 1000 +// res.accept_rate ~ 0.3 si sigma esta bien ajustada +``` + +## Patron 2D (mcmc-visualizer) + +```cpp +auto log_density = [](const double* xy) -> double { + double x = xy[0], y = xy[1]; + // Bimodal target del visualizer + double d1 = (x-2)*(x-2) + (y-2)*(y-2); + double d2 = (x+2)*(x+2) + (y+2)*(y+2); + return std::log(std::exp(-0.5*d1) + std::exp(-0.5*d2)); +}; + +double x0[2] = {0.0, 0.0}; +double sigma[2] = {1.0, 1.0}; +std::vector chain(10000 * 2); + +fn::ds::mh_run_nd(log_density, x0, sigma, 2, + 10000, chain.data(), r); +``` + +## Notas + +- **No hay tuning automatico** del proposal sigma. Si el accept rate es <0.1, baja sigma; si >0.6, sube. Para tunings serios usar adaptive MH (no incluido aqui). +- `target_log_pdf` se evalua 2 veces por sample en el peor caso (current cached, proposal nuevo). Si tu log-pdf es caro (10ms+ por eval), N=10^4 cuesta minutos — para esos casos preferir GPU MH (no caben dependencias caras en GLSL — tienes que portar la log-pdf). +- Compatible con `rhat_split` y `ess_basic`: corre M cadenas con seeds distintos, apila en `chains[m*n + i]`, llama los diagnosticos. +- 1D y ND comparten la maquinaria pero ND lleva el coste de los `std::vector` por sample. Si rendimiento importa y conoces d en compile-time, fork-and-specialize. diff --git a/cpp/functions/datascience/rhat_ess.cpp b/cpp/functions/datascience/rhat_ess.cpp new file mode 100644 index 00000000..e706568c --- /dev/null +++ b/cpp/functions/datascience/rhat_ess.cpp @@ -0,0 +1,88 @@ +#include "datascience/rhat_ess.h" +#include "datascience/autocorr.h" + +#include +#include + +namespace fn::ds { + +static double rhat_core(const double* chains, std::size_t m, std::size_t n) { + if (chains == nullptr || m < 2 || n < 2) return 1.0; + + // Mean por cadena. + std::vector means(m, 0.0); + for (std::size_t j = 0; j < m; ++j) { + const double* c = chains + j * n; + double s = 0.0; + for (std::size_t i = 0; i < n; ++i) s += c[i]; + means[j] = s / static_cast(n); + } + + // Grand mean. + double grand = 0.0; + for (std::size_t j = 0; j < m; ++j) grand += means[j]; + grand /= static_cast(m); + + // Between-chain variance B. + double B = 0.0; + for (std::size_t j = 0; j < m; ++j) { + double d = means[j] - grand; + B += d * d; + } + B *= static_cast(n) / static_cast(m - 1); + + // Within-chain variance W (promedio de las varianzas muestrales). + double W = 0.0; + for (std::size_t j = 0; j < m; ++j) { + const double* c = chains + j * n; + double s = 0.0; + for (std::size_t i = 0; i < n; ++i) { + double d = c[i] - means[j]; + s += d * d; + } + W += s / static_cast(n - 1); + } + W /= static_cast(m); + + if (W <= 0.0) return 1.0; + + double n_d = static_cast(n); + double var_hat = ((n_d - 1.0) * W + B) / n_d; + return std::sqrt(var_hat / W); +} + +double rhat(const double* chains, std::size_t m, std::size_t n) { + return rhat_core(chains, m, n); +} + +double rhat_split(const double* chains, std::size_t m, std::size_t n) { + if (chains == nullptr || m < 1 || n < 4) return 1.0; + std::size_t half = n / 2; + std::size_t m2 = m * 2; + // Reorganizar a (2m, half) row-major. Copia explicita: la segunda mitad + // de cada cadena no es contigua a la primera. + std::vector split(m2 * half); + for (std::size_t j = 0; j < m; ++j) { + const double* c = chains + j * n; + double* a = split.data() + (2 * j) * half; + double* b = split.data() + (2 * j + 1) * half; + for (std::size_t i = 0; i < half; ++i) a[i] = c[i]; + for (std::size_t i = 0; i < half; ++i) b[i] = c[half + i]; + } + return rhat_core(split.data(), m2, half); +} + +double ess_basic(const double* chains, std::size_t m, std::size_t n, + std::size_t max_lag, double cutoff) { + if (chains == nullptr || m == 0 || n < 2) return 0.0; + double total = 0.0; + for (std::size_t j = 0; j < m; ++j) { + const double* c = chains + j * n; + double tau = autocorr_tau(c, n, max_lag, cutoff); + if (tau < 1.0) tau = 1.0; + total += static_cast(n) / tau; + } + return total; +} + +} // namespace fn::ds diff --git a/cpp/functions/datascience/rhat_ess.h b/cpp/functions/datascience/rhat_ess.h new file mode 100644 index 00000000..d3295d99 --- /dev/null +++ b/cpp/functions/datascience/rhat_ess.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +namespace fn::ds { + +// Diagnosticos multi-chain para MCMC. Las cadenas estan dispuestas como +// chains[m * n + i] (row-major: m chains, n samples cada una). Todas las +// cadenas deben tener la misma longitud n. +// +// rhat: Gelman-Rubin estandar. R_hat = sqrt(((n-1)*W + B) / (n*W)) donde +// W = within-chain variance promedio, B = between-chain variance. +// Convergencia tipica: R_hat < 1.01. +double rhat(const double* chains, std::size_t m, std::size_t n); + +// rhat_split: variante moderna (Stan / pymc) que parte cada cadena en dos +// mitades antes de computar R_hat sobre 2m sub-cadenas. Detecta no-mixing +// en cadenas que parecen estables pero estan stuck en distintos modos. +double rhat_split(const double* chains, std::size_t m, std::size_t n); + +// ess_basic: Effective Sample Size por cadena, suma sobre cadenas. +// ESS_chain = n / tau_int(chain), ESS_total = sum_chains. +// Conservador para multi-chain (no usa la varianza between). +double ess_basic(const double* chains, std::size_t m, std::size_t n, + std::size_t max_lag = 200, double cutoff = 0.05); + +} // namespace fn::ds diff --git a/cpp/functions/datascience/rhat_ess.md b/cpp/functions/datascience/rhat_ess.md new file mode 100644 index 00000000..aaca6f25 --- /dev/null +++ b/cpp/functions/datascience/rhat_ess.md @@ -0,0 +1,65 @@ +--- +name: rhat_ess +kind: function +lang: cpp +domain: datascience +version: "1.0.0" +purity: pure +signature: "double rhat(const double* chains, size_t m, size_t n); double rhat_split(const double* chains, size_t m, size_t n); double ess_basic(const double* chains, size_t m, size_t n, size_t max_lag, double cutoff)" +description: "Diagnosticos multi-chain MCMC: Gelman-Rubin R-hat (clasico y split), y Effective Sample Size basico. Cadenas en layout row-major chains[j*n + i]. Convergencia tipica R_hat < 1.01." +tags: [mcmc, rhat, ess, gelman_rubin, convergence, datascience] +uses_functions: ["autocorr_cpp_datascience"] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [cstddef, cmath, vector] +tested: false +tests: [] +test_file_path: "" +file_path: "cpp/functions/datascience/rhat_ess.cpp" +params: + - name: chains + desc: "Buffer row-major chains[j * n + i] con m cadenas de n samples cada una. Layout estable de cualquier MCMC sampler que produzca arrays apilados." + - name: m + desc: "Numero de cadenas (>=2 para rhat clasico)." + - name: n + desc: "Samples por cadena (>=2 clasico, >=4 split)." + - name: max_lag + desc: "(ess_basic) lag maximo de la ACF para tau_int. Default 200." + - name: cutoff + desc: "(ess_basic) umbral |r(k)| para truncar la suma. Default 0.05." +output: "rhat / rhat_split: escalar >= 1; valores < 1.01 indican convergencia razonable. ess_basic: suma de ESS por cadena (no corregido por between-chain variance)." +--- + +# rhat_ess + +Diagnosticos estandar de convergencia para MCMC, esenciales en `mcmc-lab` (que precisamente computa R-hat sobre multi-chain). + +## R-hat clasico vs split + +`rhat` es la formula original de Gelman-Rubin (1992). `rhat_split` (Stan / pymc moderno) parte cada cadena en dos mitades antes del calculo — detecta cadenas que parecen estables pero estan stuck en modos distintos al inicio vs fin. **Recomendado: usar rhat_split por defecto.** + +## Patron tipico + +```cpp +constexpr int M = 8, N = 10000; +std::vector chains(M * N); +// ... rellenar con tu sampler ... + +double r = fn::ds::rhat_split(chains.data(), M, N); +double ess = fn::ds::ess_basic(chains.data(), M, N); + +if (r > 1.01) { + // No convergido — correr mas iteraciones o reseed. +} +``` + +## Layout + +`chains[j * n + i]` = sample `i` de la cadena `j`. Asi cada cadena es un slice contiguo, accesible con `chains + j * n`. Esto es lo que produce naturalmente un sampler que persiste un SSBO de samples (cada chain ocupa un bloque contiguo). + +## Notas + +- ESS basico no usa la formula multi-chain de Stan (que combina B y W). Es conservador (subestima ESS cuando las cadenas estan bien mezcladas) pero suficiente como primer indicador. +- Para inferencia formal en produccion, considerar pymc/arviz que tienen las versiones bias-corrected y rank-normalized R-hat. Aqui buscamos diagnostico interactivo, no certificacion estadistica. diff --git a/cpp/functions/datascience/rng.cpp b/cpp/functions/datascience/rng.cpp new file mode 100644 index 00000000..aa2d8d8d --- /dev/null +++ b/cpp/functions/datascience/rng.cpp @@ -0,0 +1,94 @@ +#include "datascience/rng.h" + +#include + +namespace fn::ds { + +static inline std::uint64_t rotl(std::uint64_t x, int k) { + return (x << k) | (x >> (64 - k)); +} + +// SplitMix64 step — usado solo para seedear los 4 lanes de xoshiro256++. +static inline std::uint64_t splitmix64(std::uint64_t& state) { + state += 0x9E3779B97F4A7C15ULL; + std::uint64_t z = state; + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL; + return z ^ (z >> 31); +} + +void rng_seed(Rng& r, std::uint64_t seed) { + if (seed == 0ULL) seed = 0x9E3779B97F4A7C15ULL; + std::uint64_t s = seed; + r.s[0] = splitmix64(s); + r.s[1] = splitmix64(s); + r.s[2] = splitmix64(s); + r.s[3] = splitmix64(s); +} + +// xoshiro256++ (Vigna). 1.24 ns/u64 en x86, supera PractRand 32 TB. +std::uint64_t rng_u64(Rng& r) { + const std::uint64_t result = rotl(r.s[0] + r.s[3], 23) + r.s[0]; + const std::uint64_t t = r.s[1] << 17; + r.s[2] ^= r.s[0]; + r.s[3] ^= r.s[1]; + r.s[1] ^= r.s[2]; + r.s[0] ^= r.s[3]; + r.s[2] ^= t; + r.s[3] = rotl(r.s[3], 45); + return result; +} + +double rng_uniform(Rng& r) { + // 53 bits superiores -> double en [0, 1). + return (rng_u64(r) >> 11) * (1.0 / 9007199254740992.0); +} + +double rng_normal(Rng& r) { + // Box-Muller. Descarta una de las dos normales (suficientemente rapido + // para la mayoria de usos; si hace falta cachear la otra, anadir un + // flag al Rng). + double u1 = rng_uniform(r); + if (u1 < 1e-300) u1 = 1e-300; + double u2 = rng_uniform(r); + return std::sqrt(-2.0 * std::log(u1)) * + std::cos(6.28318530717958647692 * u2); +} + +std::uint64_t rng_below(Rng& r, std::uint64_t n) { + if (n == 0ULL) return 0ULL; + // Lemire's method (2019): rejection-sampling sin division en el caso + // comun. Sesgo nulo. + std::uint64_t x = rng_u64(r); + __uint128_t m = static_cast<__uint128_t>(x) * static_cast<__uint128_t>(n); + std::uint64_t l = static_cast(m); + if (l < n) { + std::uint64_t t = (~n + 1ULL) % n; // (-n) mod n + while (l < t) { + x = rng_u64(r); + m = static_cast<__uint128_t>(x) * static_cast<__uint128_t>(n); + l = static_cast(m); + } + } + return static_cast(m >> 64); +} + +int rng_categorical(Rng& r, const double* weights, int n) { + if (n <= 0 || weights == nullptr) return 0; + double total = 0.0; + for (int i = 0; i < n; ++i) { + if (weights[i] > 0.0) total += weights[i]; + } + if (total <= 0.0) return n - 1; + double u = rng_uniform(r) * total; + double acc = 0.0; + for (int i = 0; i < n; ++i) { + if (weights[i] > 0.0) { + acc += weights[i]; + if (u < acc) return i; + } + } + return n - 1; +} + +} // namespace fn::ds diff --git a/cpp/functions/datascience/rng.h b/cpp/functions/datascience/rng.h new file mode 100644 index 00000000..97debc40 --- /dev/null +++ b/cpp/functions/datascience/rng.h @@ -0,0 +1,35 @@ +#pragma once + +#include + +namespace fn::ds { + +// Estado RNG opaco (xoshiro256++). Periodo 2^256 - 1, paso muy rapido, +// excelente calidad estadistica. No criptografico. +struct Rng { + std::uint64_t s[4] = {0, 0, 0, 0}; +}; + +// Inicializa el state de una Rng a partir de una semilla maestra usando +// SplitMix64. seed=0 se sustituye por la constante de Knuth para evitar +// arranque degenerado. Funcion pura (modifica r in-place a partir de seed). +void rng_seed(Rng& r, std::uint64_t seed); + +// Avanza el RNG y devuelve un uint64 uniformemente distribuido. +std::uint64_t rng_u64(Rng& r); + +// Float [0, 1). 53 bits de mantisa. +double rng_uniform(Rng& r); + +// N(0, 1) Box-Muller polar. +double rng_normal(Rng& r); + +// Entero uniforme en [0, n). Rejection-sampling para evitar sesgo modular. +std::uint64_t rng_below(Rng& r, std::uint64_t n); + +// Sample categorico: dado un array de probabilidades NO necesariamente +// normalizadas (positivas), devuelve el indice [0, n) con probabilidad +// proporcional. O(n). Si la suma es 0 devuelve n-1. +int rng_categorical(Rng& r, const double* weights, int n); + +} // namespace fn::ds diff --git a/cpp/functions/datascience/rng.md b/cpp/functions/datascience/rng.md new file mode 100644 index 00000000..d6dec24b --- /dev/null +++ b/cpp/functions/datascience/rng.md @@ -0,0 +1,63 @@ +--- +name: rng +kind: function +lang: cpp +domain: datascience +version: "1.0.0" +purity: pure +signature: "void rng_seed(Rng&, uint64 seed); uint64 rng_u64(Rng&); double rng_uniform(Rng&); double rng_normal(Rng&); uint64 rng_below(Rng&, uint64 n); int rng_categorical(Rng&, const double* weights, int n)" +description: "Generador pseudoaleatorio xoshiro256++ con state inout (struct Rng). Helpers: uniform, normal (Box-Muller), below (Lemire sin sesgo), categorical (O(n) cumulative). Determinista dado seed — pareja CPU del gpu_rng_glsl." +tags: [rng, xoshiro, uniform, normal, categorical, montecarlo, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [cstdint, cmath] +tested: false +tests: [] +test_file_path: "" +file_path: "cpp/functions/datascience/rng.cpp" +params: + - name: seed + desc: "Semilla maestra. 0 se sustituye por la constante de Knuth para evitar arranque degenerado." + - name: r + desc: "Estado RNG (struct con 4 uint64). Se muta in-place en cada llamada." + - name: n + desc: "(rng_below) limite superior exclusivo. (rng_categorical) numero de pesos." + - name: weights + desc: "(rng_categorical) array de pesos NO necesariamente normalizados. Pesos negativos o cero se ignoran." +output: "Numeros pseudoaleatorios deterministas dado el state inicial. rng_seed inicializa los 4 lanes via SplitMix64. Mismo patron que glsl_rng_preamble (gpu) para reutilizar GLSL kernels en CPU." +--- + +# rng + +Pareja CPU del `glsl_rng_preamble` (de `gpu_rng_glsl`). Misma semantica, mismas garantias. + +## Calidad + +xoshiro256++ (Vigna 2018): periodo 2^256 - 1, 1.2 ns/u64 en x86, supera PractRand 32 TB. Reemplaza al `std::mt19937_64` (mas lento y mayor estado, sin ventaja para Monte Carlo). + +`rng_normal` usa Box-Muller polar (~30 ns por sample). Para volumen extremo (>10^9 normals CPU) considerar Ziggurat — no incluido aqui por complejidad. + +`rng_below` usa el metodo de Lemire (2019): rejection sampling sin division en el caso comun, sesgo cero. Mejor que `rng_u64() % n` que tiene sesgo modular. + +## Uso + +```cpp +fn::ds::Rng r; +fn::ds::rng_seed(r, 0xC0FFEE); + +double x = fn::ds::rng_normal(r); // ~ N(0, 1) +double u = fn::ds::rng_uniform(r); // [0, 1) +int k = fn::ds::rng_below(r, 100); // 0..99 sin sesgo + +double weights[] = {0.5, 0.3, 0.2}; +int idx = fn::ds::rng_categorical(r, weights, 3); +``` + +## Notas + +- Determinista bit-exacto dado el seed. Util para tests numericos y para comparar contra el GLSL kernel (que usa PCG32 — distinto stream pero misma semantica de uniform/normal). +- No thread-safe en una unica Rng. Para Monte Carlo paralelo: una Rng por thread, sembrada con seeds derivados de un master via `rng_seed` con seeds distintos. +- Pure: la mutacion del state es referencialmente transparente (la misma secuencia de llamadas produce los mismos resultados). Sin I/O, sin globals. diff --git a/cpp/functions/datascience/samples_to_grid_2d.cpp b/cpp/functions/datascience/samples_to_grid_2d.cpp new file mode 100644 index 00000000..fcbe646d --- /dev/null +++ b/cpp/functions/datascience/samples_to_grid_2d.cpp @@ -0,0 +1,71 @@ +#include "datascience/samples_to_grid_2d.h" + +#include +#include + +namespace fn::ds { + +void samples_to_grid_2d_counts(const double* samples_x, + const double* samples_y, + std::size_t n, + double xmin, double xmax, + double ymin, double ymax, + int nx, int ny, + unsigned int* out_counts) { + if (out_counts == nullptr || nx <= 0 || ny <= 0) return; + if (samples_x == nullptr || samples_y == nullptr) return; + double xr = xmax - xmin; + double yr = ymax - ymin; + if (xr <= 0.0 || yr <= 0.0) return; + double inv_x = 1.0 / xr; + double inv_y = 1.0 / yr; + + for (std::size_t i = 0; i < n; ++i) { + double tx = (samples_x[i] - xmin) * inv_x; + double ty = (samples_y[i] - ymin) * inv_y; + if (tx < 0.0 || tx >= 1.0 || ty < 0.0 || ty >= 1.0) continue; + int bx = static_cast(tx * static_cast(nx)); + int by = static_cast(ty * static_cast(ny)); + if (bx >= nx) bx = nx - 1; + if (by >= ny) by = ny - 1; + ++out_counts[by * nx + bx]; + } +} + +void counts_to_density(const unsigned int* counts, int nx, int ny, + float* out_density) { + if (counts == nullptr || out_density == nullptr || nx <= 0 || ny <= 0) return; + std::size_t total = static_cast(nx) * + static_cast(ny); + unsigned int max_c = 0u; + for (std::size_t i = 0; i < total; ++i) { + if (counts[i] > max_c) max_c = counts[i]; + } + if (max_c == 0u) { + for (std::size_t i = 0; i < total; ++i) out_density[i] = 0.0f; + return; + } + float inv = 1.0f / static_cast(max_c); + for (std::size_t i = 0; i < total; ++i) { + out_density[i] = static_cast(counts[i]) * inv; + } +} + +void samples_to_grid_2d_density(const double* samples_x, + const double* samples_y, + std::size_t n, + double xmin, double xmax, + double ymin, double ymax, + int nx, int ny, + float* out_density) { + if (out_density == nullptr || nx <= 0 || ny <= 0) return; + std::size_t total = static_cast(nx) * + static_cast(ny); + std::vector counts(total, 0u); + samples_to_grid_2d_counts(samples_x, samples_y, n, + xmin, xmax, ymin, ymax, + nx, ny, counts.data()); + counts_to_density(counts.data(), nx, ny, out_density); +} + +} // namespace fn::ds diff --git a/cpp/functions/datascience/samples_to_grid_2d.h b/cpp/functions/datascience/samples_to_grid_2d.h new file mode 100644 index 00000000..56ce27e5 --- /dev/null +++ b/cpp/functions/datascience/samples_to_grid_2d.h @@ -0,0 +1,37 @@ +#pragma once + +#include + +namespace fn::ds { + +// Binning 2D puro CPU. samples_x[i], samples_y[i] son los pares (x, y) a +// binnear. nx*ny celdas en row-major (idx = row * nx + col), donde row +// crece con y. Samples fuera del rango se descartan. +// +// out_counts[nx*ny] se llena con conteos uint32. (No se hace clear interno — +// el caller decide si acumular sobre llamadas previas o no.) +void samples_to_grid_2d_counts(const double* samples_x, + const double* samples_y, + std::size_t n, + double xmin, double xmax, + double ymin, double ymax, + int nx, int ny, + unsigned int* out_counts); + +// Variante que normaliza los counts a float[0,1] sobre el max (densidad +// relativa). Adecuado para alimentar heatmap_cpp_viz / contour_cpp_viz / surface_plot_3d. +// Si todos los counts son 0 se llena out_density con 0. +void samples_to_grid_2d_density(const double* samples_x, + const double* samples_y, + std::size_t n, + double xmin, double xmax, + double ymin, double ymax, + int nx, int ny, + float* out_density); + +// Helper: convierte counts uint -> density float in-place sobre out. +// max_count se calcula y se normaliza out[i] = counts[i]/max_count. +void counts_to_density(const unsigned int* counts, int nx, int ny, + float* out_density); + +} // namespace fn::ds diff --git a/cpp/functions/datascience/samples_to_grid_2d.md b/cpp/functions/datascience/samples_to_grid_2d.md new file mode 100644 index 00000000..fabab6cc --- /dev/null +++ b/cpp/functions/datascience/samples_to_grid_2d.md @@ -0,0 +1,93 @@ +--- +name: samples_to_grid_2d +kind: function +lang: cpp +domain: datascience +version: "1.0.0" +purity: pure +signature: "void samples_to_grid_2d_counts(const double* x, const double* y, size_t n, double xmin, double xmax, double ymin, double ymax, int nx, int ny, unsigned int* out_counts); void samples_to_grid_2d_density(...float* out_density); void counts_to_density(const unsigned int* counts, int nx, int ny, float* out_density)" +description: "Binning 2D CPU para alimentar heatmap_cpp_viz / contour_cpp_viz / surface_plot_3d desde un set de samples (x[], y[]). Variante counts (uint, acumulable) y density (float [0,1] normalizado a max). Pareja CPU del gpu_histogram_2d." +tags: [binning, histogram_2d, density, heatmap, contour, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [cstddef, cmath, vector] +tested: false +tests: [] +test_file_path: "" +file_path: "cpp/functions/datascience/samples_to_grid_2d.cpp" +params: + - name: samples_x + desc: "Array de coordenadas X de los samples." + - name: samples_y + desc: "Array de coordenadas Y de los samples." + - name: n + desc: "Numero de samples (longitud de samples_x y samples_y)." + - name: xmin + desc: "Limite inferior X. Samples con x fuera de [xmin, xmax) se descartan." + - name: xmax + desc: "Limite superior X." + - name: ymin + desc: "Limite inferior Y." + - name: ymax + desc: "Limite superior Y." + - name: nx + desc: "Bins en X." + - name: ny + desc: "Bins en Y." + - name: out_counts + desc: "Buffer destino unsigned int[nx*ny] row-major (idx = y*nx + x). NO se inicializa: el caller decide si poner a cero (acumular sobre llamadas previas)." + - name: out_density + desc: "Buffer destino float[nx*ny] normalizado a max=1.0. Variante density auto-rellena." + - name: counts + desc: "(counts_to_density) input uint[nx*ny]." +output: "out_counts incrementado con los conteos; out_density normalizado [0, 1]. Si todos los counts son 0, density se llena con 0." +--- + +# samples_to_grid_2d + +Binning 2D para densidades de muestras (joint posteriors, walks 2D, scatter density). Versión CPU canónica — usar GPU `gpu_histogram_2d` cuando los samples ya viven en GPU. + +## Patron tipico (mcmc-visualizer / mcmc-full) + +```cpp +constexpr int NX = 128, NY = 128; +std::vector density(NX * NY); + +fn::ds::samples_to_grid_2d_density( + chain_x.data(), chain_y.data(), chain_x.size(), + -5.0, 5.0, -5.0, 5.0, + NX, NY, + density.data() +); + +fn::viz::heatmap(density.data(), NX, NY, /*...*/); +``` + +## Acumulado a lo largo de iteraciones + +Para visualizar la cadena creciendo en vivo sin recomputar todo: + +```cpp +std::vector counts(NX * NY, 0u); + +// Cada iteracion del MCMC: +fn::ds::samples_to_grid_2d_counts( + new_x.data(), new_y.data(), new_x.size(), + -5.0, 5.0, -5.0, 5.0, + NX, NY, + counts.data() // sigue acumulando +); + +std::vector density(NX * NY); +fn::ds::counts_to_density(counts.data(), NX, NY, density.data()); +fn::viz::heatmap(density.data(), NX, NY, /*...*/); +``` + +## Notas + +- Samples fuera del rango `[xmin, xmax) x [ymin, ymax)` se descartan, no se clampean. Si quieres clamp, pre-clamp los samples antes de llamar. +- O(n) sobre samples + O(nx*ny) para to_density. Para n = 10^7, nx=ny=256: ~70 ms CPU. Para refresh interactivo a 60 FPS, downsample a n=10^5 o usar la version GPU. +- Layout row-major idx = y*nx + x — coincide con lo que esperan `heatmap_cpp_viz` y `contour_cpp_viz`. diff --git a/cpp/functions/datascience/stats_summary.cpp b/cpp/functions/datascience/stats_summary.cpp new file mode 100644 index 00000000..b7481062 --- /dev/null +++ b/cpp/functions/datascience/stats_summary.cpp @@ -0,0 +1,96 @@ +#include "datascience/stats_summary.h" + +#include +#include +#include +#include + +namespace fn::ds { + +double stats_sum(const double* data, std::size_t n) { + if (n == 0 || data == nullptr) return 0.0; + // Kahan summation — coste despreciable, evita drift en sumas grandes. + double s = 0.0, c = 0.0; + for (std::size_t i = 0; i < n; ++i) { + double y = data[i] - c; + double t = s + y; + c = (t - s) - y; + s = t; + } + return s; +} + +double stats_mean(const double* data, std::size_t n) { + if (n == 0) return 0.0; + return stats_sum(data, n) / static_cast(n); +} + +double stats_min(const double* data, std::size_t n) { + if (n == 0 || data == nullptr) return 0.0; + double m = data[0]; + for (std::size_t i = 1; i < n; ++i) if (data[i] < m) m = data[i]; + return m; +} + +double stats_max(const double* data, std::size_t n) { + if (n == 0 || data == nullptr) return 0.0; + double m = data[0]; + for (std::size_t i = 1; i < n; ++i) if (data[i] > m) m = data[i]; + return m; +} + +double stats_variance(const double* data, std::size_t n, bool sample) { + if (n == 0 || data == nullptr) return 0.0; + if (sample && n < 2) return 0.0; + // Welford one-pass. + double mean = 0.0; + double M2 = 0.0; + for (std::size_t i = 0; i < n; ++i) { + double x = data[i]; + double delta = x - mean; + mean += delta / static_cast(i + 1); + double delta2 = x - mean; + M2 += delta * delta2; + } + double denom = sample ? static_cast(n - 1) : static_cast(n); + return M2 / denom; +} + +double stats_std(const double* data, std::size_t n, bool sample) { + return std::sqrt(stats_variance(data, n, sample)); +} + +void stats_sort(const double* data, std::size_t n, double* out) { + if (n == 0 || out == nullptr) return; + if (out != data && data != nullptr) { + std::memcpy(out, data, n * sizeof(double)); + } + std::sort(out, out + n); +} + +double stats_quantile_sorted(const double* sorted, std::size_t n, double p) { + if (n == 0 || sorted == nullptr) return 0.0; + if (p <= 0.0) return sorted[0]; + if (p >= 1.0) return sorted[n - 1]; + // R type-7: h = (n-1) * p; result = sorted[floor(h)] + (h - floor(h)) * + // (sorted[floor(h)+1] - sorted[floor(h)]) + double h = (static_cast(n) - 1.0) * p; + std::size_t lo = static_cast(std::floor(h)); + std::size_t hi = lo + 1; + if (hi >= n) hi = n - 1; + double frac = h - static_cast(lo); + return sorted[lo] + frac * (sorted[hi] - sorted[lo]); +} + +double stats_quantile(const double* data, std::size_t n, double p) { + if (n == 0) return 0.0; + std::vector tmp(n); + stats_sort(data, n, tmp.data()); + return stats_quantile_sorted(tmp.data(), n, p); +} + +double stats_percentile(const double* data, std::size_t n, double pct) { + return stats_quantile(data, n, pct * 0.01); +} + +} // namespace fn::ds diff --git a/cpp/functions/datascience/stats_summary.h b/cpp/functions/datascience/stats_summary.h new file mode 100644 index 00000000..024b10c8 --- /dev/null +++ b/cpp/functions/datascience/stats_summary.h @@ -0,0 +1,35 @@ +#pragma once + +#include + +namespace fn::ds { + +// Estadistica descriptiva pura sobre arrays de double. +// Todas las funciones aceptan ptr + n; n=0 devuelve identidades sensatas. + +double stats_sum (const double* data, std::size_t n); +double stats_mean (const double* data, std::size_t n); +double stats_min (const double* data, std::size_t n); +double stats_max (const double* data, std::size_t n); + +// Welford one-pass (numericamente estable). Variance muestral si sample=true +// (denominador n-1), poblacional si sample=false (denominador n). +double stats_variance(const double* data, std::size_t n, bool sample = true); +double stats_std (const double* data, std::size_t n, bool sample = true); + +// Quantile lineal (R type-7). p en [0, 1]. NO modifica data — internamente +// hace copia + sort. Para multiples quantiles del mismo dataset, ordenar +// una vez con stats_sort y usar stats_quantile_sorted. +double stats_quantile(const double* data, std::size_t n, double p); + +// Variante para dataset ya ordenado ascendente. +double stats_quantile_sorted(const double* sorted, std::size_t n, double p); + +// Percentile = quantile(p/100). +double stats_percentile(const double* data, std::size_t n, double pct); + +// Helper: ordena ascendente in-place a un buffer destino dado. +// out debe tener al menos n elementos. Si out == data, ordena in-place. +void stats_sort(const double* data, std::size_t n, double* out); + +} // namespace fn::ds diff --git a/cpp/functions/datascience/stats_summary.md b/cpp/functions/datascience/stats_summary.md new file mode 100644 index 00000000..a12440d8 --- /dev/null +++ b/cpp/functions/datascience/stats_summary.md @@ -0,0 +1,69 @@ +--- +name: stats_summary +kind: function +lang: cpp +domain: datascience +version: "1.0.0" +purity: pure +signature: "double stats_sum(const double*, size_t); double stats_mean(const double*, size_t); double stats_min(const double*, size_t); double stats_max(const double*, size_t); double stats_variance(const double*, size_t, bool sample=true); double stats_std(const double*, size_t, bool sample=true); double stats_quantile(const double*, size_t, double p); double stats_quantile_sorted(const double*, size_t, double p); double stats_percentile(const double*, size_t, double pct); void stats_sort(const double*, size_t, double* out)" +description: "Estadistica descriptiva pura sobre arrays double: sum (Kahan), mean, min, max, variance/std (Welford one-pass, sample/poblacional), quantile (R type-7) y percentile. stats_sort externalizable para evitar copias en queries multiples." +tags: [stats, mean, variance, std, quantile, percentile, welford, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [cstddef, cmath, algorithm, vector, cstring] +tested: false +tests: [] +test_file_path: "" +file_path: "cpp/functions/datascience/stats_summary.cpp" +params: + - name: data + desc: "Array de doubles (no necesariamente ordenado salvo *_sorted)." + - name: n + desc: "Tamano del array. n=0 devuelve identidades sensatas (0 para sum/mean/min/max/var/std)." + - name: sample + desc: "Variance/std: true = muestral (n-1), false = poblacional (n). Default true." + - name: p + desc: "Quantile en [0, 1]. Valores fuera se clampean." + - name: pct + desc: "Percentile en [0, 100]. Internamente p = pct/100." + - name: out + desc: "(stats_sort) buffer destino. Si out == data, ordena in-place." +output: "Escalar (double) con la estadistica solicitada. stats_sort modifica out in-place; el resto no muta data." +--- + +# stats_summary + +Pack de estadisticas basicas sobre arrays raw. Diseñado para post-proceso de samples MC, sesiones de simulacion, cadenas MCMC. + +## Performance + +- `stats_sum`: Kahan summation (O(n), ~5% mas lento que sum naive pero sin drift en sumas de millones de fp64). +- `stats_variance`: Welford one-pass (O(n), una sola pasada). No hay las cancelaciones catastroficas del E[X^2] - E[X]^2 naive. +- `stats_quantile`: O(n log n) por copia + sort. Para multiples queries del mismo dataset, llamar `stats_sort` una vez y `stats_quantile_sorted` despues — O(n log n + Q). + +## Patron tipico + +Resumen de un session simulator (vr_tiered_lab): + +```cpp +std::vector pnls(N); +// ... rellenar pnls ... + +double mean = fn::ds::stats_mean(pnls.data(), N); +double std = fn::ds::stats_std (pnls.data(), N); + +// CI 95% via percentiles 2.5 / 97.5 +std::vector sorted(N); +fn::ds::stats_sort(pnls.data(), N, sorted.data()); +double p025 = fn::ds::stats_quantile_sorted(sorted.data(), N, 0.025); +double p975 = fn::ds::stats_quantile_sorted(sorted.data(), N, 0.975); +``` + +## Notas + +- El convenio R type-7 para quantiles es el mismo que numpy default (`linear`) y matplotlib. Pasar tests numericos contra numpy debe matchear bit-exacto. +- `sample=true` (default) coincide con `np.var(x, ddof=1)` y `pd.DataFrame.var()`. +- Para datasets enormes que no caben en RAM, usar `gpu_reduce` (GPU) — esta libreria es CPU-side.