feat(cpp/datascience): GPU Monte Carlo kernels (K1-K3)

Tres kernels Monte Carlo intensivos sobre las primitivas G1-G7 + las puras
CPU como oraculo de tests numericos. Apuntados a hyper-paralelizar los
calculadores de sources/calculadoras (vr_tiered_lab, mcmc-bayes / full / lab,
mcmc-visualizer) en RTX-class GPUs.

- mc_session_sim_gpu (K1): N sesiones independientes de K spins en paralelo
  (1 thread = 1 sesion). Modelo variable-ratio escalonado con tiers (q, m),
  modes Pure/Pity/Streak, miss_streak, drawdown. SSBOs summary[N*8] y
  tier_counts[N*max_tiers]. Portea vr_tiered_lab.
- mc_metropolis_hastings_gpu (K2): M cadenas independientes 1D. Target
  log-pdf inyectable como string GLSL (mismo patron de gl_shader). u_user[16]
  para cambiar parametros desde sliders sin recompilar. Output compatible
  con rhat_split / ess_basic.
- mc_random_walk_2d_gpu (K3): walkers 2D MH con trace_xy xy-interleaved en
  SSBO; pasable directamente a gpu_histogram_2d sin readback intermedio.
  Pipeline GPU-only para mcmc-visualizer.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 11:52:41 +02:00
parent d76c831247
commit 9d69953110
9 changed files with 1123 additions and 0 deletions
@@ -0,0 +1,246 @@
#include "datascience/mc_session_sim_gpu.h"
#include "gfx/gl_loader.h"
#include "gfx/gpu_compute_program.h"
#include "gfx/gpu_dispatch.h"
#include "gfx/gpu_rng_glsl.h"
#include <cstdio>
#include <vector>
namespace fn::ds {
constexpr int kMaxTiersHard = 8;
constexpr int kSeedBinding = 9;
// 1 thread = 1 sesion completa. Loops internamente sobre los spins. Como
// cada sesion es independiente y cada thread tiene su seed, no hace falta
// ningun atomic ni barrier dentro del workgroup. La unica restriccion es
// que cada thread procesa hasta MAX_TIERS_HARD = 8 contadores locales en
// registros (uint local_counts[8]).
static const char* k_body = R"glsl(
#define MAX_TIERS 8
layout(std430, binding = 0) buffer Summaries { float summary[]; };
layout(std430, binding = 1) buffer TierCounts { uint tier_counts[]; };
layout(std430, binding = 2) buffer Tiers { vec2 tiers[]; };
uniform uint u_n_sessions;
uniform uint u_n_spins;
uniform uint u_n_tiers;
uniform float u_p_base;
uniform float u_cost;
uniform float u_start_balance;
uniform int u_mode;
uniform float u_mode_p1;
uniform float u_mode_p2;
void main() {
uint sid = gl_GlobalInvocationID.x;
if (sid >= u_n_sessions) return;
uint s = rng_seeds[sid];
float balance = u_start_balance;
float peak = balance;
float trough = balance;
float max_dd = 0.0;
uint spins_done = 0u;
uint wins = 0u;
uint miss_streak = 0u;
uint longest_miss = 0u;
uint local_counts[MAX_TIERS];
for (int k = 0; k < MAX_TIERS; ++k) local_counts[k] = 0u;
float total_q = 0.0;
for (uint k = 0u; k < u_n_tiers; ++k) total_q += tiers[k].x;
float inv_total = (total_q > 0.0) ? (1.0 / total_q) : 0.0;
for (uint i = 0u; i < u_n_spins; ++i) {
if (balance < u_cost) break;
// Effective probability (modes Pure / Pity / Streak)
float p_eff = u_p_base;
if (u_mode == 1) {
float ms = float(miss_streak);
float soft = u_mode_p1;
float hard = u_mode_p2;
if (ms < soft) {
p_eff = u_p_base;
} else if (ms >= hard) {
p_eff = 1.0;
} else {
p_eff = u_p_base + (1.0 - u_p_base) *
((ms - soft) / max(hard - soft, 1.0));
}
} else if (u_mode == 2) {
p_eff = min(1.0, u_p_base * (1.0 + u_mode_p1 * float(miss_streak)));
}
balance -= u_cost;
spins_done = i + 1u;
float r1 = rng_uniform(s);
if (r1 < p_eff) {
float u = rng_uniform(s) * total_q;
float acc = 0.0;
uint chosen = u_n_tiers - 1u;
for (uint k = 0u; k < u_n_tiers; ++k) {
acc += tiers[k].x;
if (u < acc) { chosen = k; break; }
}
float mult = tiers[chosen].y;
balance += mult * u_cost;
wins += 1u;
if (chosen < uint(MAX_TIERS)) local_counts[chosen] += 1u;
miss_streak = 0u;
} else {
miss_streak += 1u;
if (miss_streak > longest_miss) longest_miss = miss_streak;
}
if (balance > peak) peak = balance;
if (balance < trough) trough = balance;
float dd = peak - balance;
if (dd > max_dd) max_dd = dd;
}
rng_seeds[sid] = s;
uint base = sid * 8u;
summary[base + 0u] = balance;
summary[base + 1u] = balance - u_start_balance; // pnl
summary[base + 2u] = max_dd;
summary[base + 3u] = peak;
summary[base + 4u] = trough;
summary[base + 5u] = float(spins_done);
summary[base + 6u] = float(wins);
summary[base + 7u] = float(longest_miss);
uint cbase = sid * uint(MAX_TIERS);
for (int k = 0; k < MAX_TIERS; ++k) {
tier_counts[cbase + uint(k)] = local_counts[k];
}
// Avoid unused-but-set warnings on inv_total if a future variant uses it.
(void)(inv_total);
}
)glsl";
McSessionSim mc_session_sim_create(int n_sessions, int max_tiers) {
McSessionSim s{};
if (n_sessions <= 0) return s;
if (max_tiers <= 0 || max_tiers > kMaxTiersHard) max_tiers = kMaxTiersHard;
auto rng = fn::gfx::glsl_rng_preamble(kSeedBinding);
auto r = fn::gfx::compile_compute(k_body, 64, rng);
if (!r.ok) {
std::fprintf(stderr, "[mc_session_sim_gpu] compile error: %s\n",
r.err_msg.c_str());
return s;
}
s.program = r.program;
s.loc_n_sessions = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_sessions"));
s.loc_n_spins = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_spins"));
s.loc_n_tiers = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_tiers"));
s.loc_p_base = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_p_base"));
s.loc_cost = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_cost"));
s.loc_start_bal = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_start_balance"));
s.loc_mode = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_mode"));
s.loc_mode_p1 = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_mode_p1"));
s.loc_mode_p2 = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_mode_p2"));
s.n_sessions = n_sessions;
s.max_tiers = max_tiers;
s.summary = fn::gfx::ssbo_create(
static_cast<std::size_t>(n_sessions) * 8u * sizeof(float),
nullptr, GL_DYNAMIC_COPY);
s.tier_counts = fn::gfx::ssbo_create(
static_cast<std::size_t>(n_sessions) *
static_cast<std::size_t>(max_tiers) * sizeof(unsigned int),
nullptr, GL_DYNAMIC_COPY);
s.tiers = fn::gfx::ssbo_create(
static_cast<std::size_t>(max_tiers) * 2u * sizeof(float),
nullptr, GL_DYNAMIC_DRAW);
s.rng_seeds = fn::gfx::ssbo_create(
static_cast<std::size_t>(n_sessions) * sizeof(unsigned int),
nullptr, GL_DYNAMIC_COPY);
return s;
}
void mc_session_sim_reseed(McSessionSim& s, unsigned long long master_seed) {
if (s.rng_seeds.id == 0 || s.n_sessions <= 0) return;
std::vector<unsigned int> seeds(s.n_sessions);
fn::gfx::seed_walkers_init(master_seed, seeds.data(), s.n_sessions);
fn::gfx::ssbo_upload(s.rng_seeds, 0,
static_cast<std::size_t>(s.n_sessions) * sizeof(unsigned int),
seeds.data());
}
void mc_session_sim_run(McSessionSim& s, const McSessionParams& p) {
if (s.program == 0 || s.n_sessions <= 0) return;
if (p.n_tiers <= 0 || p.tiers_q == nullptr || p.tiers_m == nullptr) return;
int nt = p.n_tiers;
if (nt > s.max_tiers) nt = s.max_tiers;
// Pack tiers como vec2 (q, m) — std430 vec2 = 8 bytes alineado.
std::vector<float> packed(static_cast<std::size_t>(s.max_tiers) * 2u, 0.0f);
for (int i = 0; i < nt; ++i) {
packed[2 * i + 0] = p.tiers_q[i];
packed[2 * i + 1] = p.tiers_m[i];
}
fn::gfx::ssbo_upload(s.tiers, 0,
packed.size() * sizeof(float),
packed.data());
glUseProgram(s.program);
fn::gfx::ssbo_bind(s.summary, 0);
fn::gfx::ssbo_bind(s.tier_counts, 1);
fn::gfx::ssbo_bind(s.tiers, 2);
fn::gfx::ssbo_bind(s.rng_seeds, kSeedBinding);
glUniform1ui(static_cast<GLint>(s.loc_n_sessions), static_cast<GLuint>(s.n_sessions));
glUniform1ui(static_cast<GLint>(s.loc_n_spins), static_cast<GLuint>(p.n_spins));
glUniform1ui(static_cast<GLint>(s.loc_n_tiers), static_cast<GLuint>(nt));
glUniform1f (static_cast<GLint>(s.loc_p_base), p.p_base);
glUniform1f (static_cast<GLint>(s.loc_cost), p.cost);
glUniform1f (static_cast<GLint>(s.loc_start_bal), p.start_balance);
glUniform1i (static_cast<GLint>(s.loc_mode), static_cast<int>(p.mode));
glUniform1f (static_cast<GLint>(s.loc_mode_p1), p.mode_p1);
glUniform1f (static_cast<GLint>(s.loc_mode_p2), p.mode_p2);
fn::gfx::dispatch_1d(s.n_sessions, 64);
fn::gfx::barrier_buffer_update();
}
void mc_session_sim_readback_summary(const McSessionSim& s, float* out) {
if (s.summary.id == 0 || s.n_sessions <= 0 || out == nullptr) return;
fn::gfx::ssbo_readback(
s.summary, 0,
static_cast<std::size_t>(s.n_sessions) * 8u * sizeof(float),
out);
}
void mc_session_sim_readback_tier_counts(const McSessionSim& s,
unsigned int* out) {
if (s.tier_counts.id == 0 || s.n_sessions <= 0 || out == nullptr) return;
fn::gfx::ssbo_readback(
s.tier_counts, 0,
static_cast<std::size_t>(s.n_sessions) *
static_cast<std::size_t>(s.max_tiers) * sizeof(unsigned int),
out);
}
void mc_session_sim_destroy(McSessionSim& s) {
fn::gfx::delete_compute_program(s.program);
s.program = 0;
fn::gfx::ssbo_destroy(s.summary);
fn::gfx::ssbo_destroy(s.tier_counts);
fn::gfx::ssbo_destroy(s.tiers);
fn::gfx::ssbo_destroy(s.rng_seeds);
s.n_sessions = 0;
}
} // namespace fn::ds