7b0384c804
Tres kernels Monte Carlo intensivos sobre las primitivas G1-G7 + las puras CPU como oraculo de tests numericos. Apuntados a hyper-paralelizar los calculadores de sources/calculadoras (vr_tiered_lab, mcmc-bayes / full / lab, mcmc-visualizer) en RTX-class GPUs. - mc_session_sim_gpu (K1): N sesiones independientes de K spins en paralelo (1 thread = 1 sesion). Modelo variable-ratio escalonado con tiers (q, m), modes Pure/Pity/Streak, miss_streak, drawdown. SSBOs summary[N*8] y tier_counts[N*max_tiers]. Portea vr_tiered_lab. - mc_metropolis_hastings_gpu (K2): M cadenas independientes 1D. Target log-pdf inyectable como string GLSL (mismo patron de gl_shader). u_user[16] para cambiar parametros desde sliders sin recompilar. Output compatible con rhat_split / ess_basic. - mc_random_walk_2d_gpu (K3): walkers 2D MH con trace_xy xy-interleaved en SSBO; pasable directamente a gpu_histogram_2d sin readback intermedio. Pipeline GPU-only para mcmc-visualizer. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
247 lines
8.9 KiB
C++
247 lines
8.9 KiB
C++
#include "datascience/mc_session_sim_gpu.h"
|
|
#include "gfx/gl_loader.h"
|
|
#include "gfx/gpu_compute_program.h"
|
|
#include "gfx/gpu_dispatch.h"
|
|
#include "gfx/gpu_rng_glsl.h"
|
|
|
|
#include <cstdio>
|
|
#include <vector>
|
|
|
|
namespace fn::ds {
|
|
|
|
constexpr int kMaxTiersHard = 8;
|
|
constexpr int kSeedBinding = 9;
|
|
|
|
// 1 thread = 1 sesion completa. Loops internamente sobre los spins. Como
|
|
// cada sesion es independiente y cada thread tiene su seed, no hace falta
|
|
// ningun atomic ni barrier dentro del workgroup. La unica restriccion es
|
|
// que cada thread procesa hasta MAX_TIERS_HARD = 8 contadores locales en
|
|
// registros (uint local_counts[8]).
|
|
static const char* k_body = R"glsl(
|
|
#define MAX_TIERS 8
|
|
|
|
layout(std430, binding = 0) buffer Summaries { float summary[]; };
|
|
layout(std430, binding = 1) buffer TierCounts { uint tier_counts[]; };
|
|
layout(std430, binding = 2) buffer Tiers { vec2 tiers[]; };
|
|
|
|
uniform uint u_n_sessions;
|
|
uniform uint u_n_spins;
|
|
uniform uint u_n_tiers;
|
|
uniform float u_p_base;
|
|
uniform float u_cost;
|
|
uniform float u_start_balance;
|
|
uniform int u_mode;
|
|
uniform float u_mode_p1;
|
|
uniform float u_mode_p2;
|
|
|
|
void main() {
|
|
uint sid = gl_GlobalInvocationID.x;
|
|
if (sid >= u_n_sessions) return;
|
|
|
|
uint s = rng_seeds[sid];
|
|
float balance = u_start_balance;
|
|
float peak = balance;
|
|
float trough = balance;
|
|
float max_dd = 0.0;
|
|
uint spins_done = 0u;
|
|
uint wins = 0u;
|
|
uint miss_streak = 0u;
|
|
uint longest_miss = 0u;
|
|
|
|
uint local_counts[MAX_TIERS];
|
|
for (int k = 0; k < MAX_TIERS; ++k) local_counts[k] = 0u;
|
|
|
|
float total_q = 0.0;
|
|
for (uint k = 0u; k < u_n_tiers; ++k) total_q += tiers[k].x;
|
|
float inv_total = (total_q > 0.0) ? (1.0 / total_q) : 0.0;
|
|
|
|
for (uint i = 0u; i < u_n_spins; ++i) {
|
|
if (balance < u_cost) break;
|
|
|
|
// Effective probability (modes Pure / Pity / Streak)
|
|
float p_eff = u_p_base;
|
|
if (u_mode == 1) {
|
|
float ms = float(miss_streak);
|
|
float soft = u_mode_p1;
|
|
float hard = u_mode_p2;
|
|
if (ms < soft) {
|
|
p_eff = u_p_base;
|
|
} else if (ms >= hard) {
|
|
p_eff = 1.0;
|
|
} else {
|
|
p_eff = u_p_base + (1.0 - u_p_base) *
|
|
((ms - soft) / max(hard - soft, 1.0));
|
|
}
|
|
} else if (u_mode == 2) {
|
|
p_eff = min(1.0, u_p_base * (1.0 + u_mode_p1 * float(miss_streak)));
|
|
}
|
|
|
|
balance -= u_cost;
|
|
spins_done = i + 1u;
|
|
|
|
float r1 = rng_uniform(s);
|
|
if (r1 < p_eff) {
|
|
float u = rng_uniform(s) * total_q;
|
|
float acc = 0.0;
|
|
uint chosen = u_n_tiers - 1u;
|
|
for (uint k = 0u; k < u_n_tiers; ++k) {
|
|
acc += tiers[k].x;
|
|
if (u < acc) { chosen = k; break; }
|
|
}
|
|
float mult = tiers[chosen].y;
|
|
balance += mult * u_cost;
|
|
wins += 1u;
|
|
if (chosen < uint(MAX_TIERS)) local_counts[chosen] += 1u;
|
|
miss_streak = 0u;
|
|
} else {
|
|
miss_streak += 1u;
|
|
if (miss_streak > longest_miss) longest_miss = miss_streak;
|
|
}
|
|
|
|
if (balance > peak) peak = balance;
|
|
if (balance < trough) trough = balance;
|
|
float dd = peak - balance;
|
|
if (dd > max_dd) max_dd = dd;
|
|
}
|
|
|
|
rng_seeds[sid] = s;
|
|
|
|
uint base = sid * 8u;
|
|
summary[base + 0u] = balance;
|
|
summary[base + 1u] = balance - u_start_balance; // pnl
|
|
summary[base + 2u] = max_dd;
|
|
summary[base + 3u] = peak;
|
|
summary[base + 4u] = trough;
|
|
summary[base + 5u] = float(spins_done);
|
|
summary[base + 6u] = float(wins);
|
|
summary[base + 7u] = float(longest_miss);
|
|
|
|
uint cbase = sid * uint(MAX_TIERS);
|
|
for (int k = 0; k < MAX_TIERS; ++k) {
|
|
tier_counts[cbase + uint(k)] = local_counts[k];
|
|
}
|
|
|
|
// Avoid unused-but-set warnings on inv_total if a future variant uses it.
|
|
(void)(inv_total);
|
|
}
|
|
)glsl";
|
|
|
|
McSessionSim mc_session_sim_create(int n_sessions, int max_tiers) {
|
|
McSessionSim s{};
|
|
if (n_sessions <= 0) return s;
|
|
if (max_tiers <= 0 || max_tiers > kMaxTiersHard) max_tiers = kMaxTiersHard;
|
|
|
|
auto rng = fn::gfx::glsl_rng_preamble(kSeedBinding);
|
|
auto r = fn::gfx::compile_compute(k_body, 64, rng);
|
|
if (!r.ok) {
|
|
std::fprintf(stderr, "[mc_session_sim_gpu] compile error: %s\n",
|
|
r.err_msg.c_str());
|
|
return s;
|
|
}
|
|
s.program = r.program;
|
|
|
|
s.loc_n_sessions = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_sessions"));
|
|
s.loc_n_spins = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_spins"));
|
|
s.loc_n_tiers = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_tiers"));
|
|
s.loc_p_base = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_p_base"));
|
|
s.loc_cost = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_cost"));
|
|
s.loc_start_bal = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_start_balance"));
|
|
s.loc_mode = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_mode"));
|
|
s.loc_mode_p1 = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_mode_p1"));
|
|
s.loc_mode_p2 = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_mode_p2"));
|
|
|
|
s.n_sessions = n_sessions;
|
|
s.max_tiers = max_tiers;
|
|
|
|
s.summary = fn::gfx::ssbo_create(
|
|
static_cast<std::size_t>(n_sessions) * 8u * sizeof(float),
|
|
nullptr, GL_DYNAMIC_COPY);
|
|
s.tier_counts = fn::gfx::ssbo_create(
|
|
static_cast<std::size_t>(n_sessions) *
|
|
static_cast<std::size_t>(max_tiers) * sizeof(unsigned int),
|
|
nullptr, GL_DYNAMIC_COPY);
|
|
s.tiers = fn::gfx::ssbo_create(
|
|
static_cast<std::size_t>(max_tiers) * 2u * sizeof(float),
|
|
nullptr, GL_DYNAMIC_DRAW);
|
|
s.rng_seeds = fn::gfx::ssbo_create(
|
|
static_cast<std::size_t>(n_sessions) * sizeof(unsigned int),
|
|
nullptr, GL_DYNAMIC_COPY);
|
|
return s;
|
|
}
|
|
|
|
void mc_session_sim_reseed(McSessionSim& s, unsigned long long master_seed) {
|
|
if (s.rng_seeds.id == 0 || s.n_sessions <= 0) return;
|
|
std::vector<unsigned int> seeds(s.n_sessions);
|
|
fn::gfx::seed_walkers_init(master_seed, seeds.data(), s.n_sessions);
|
|
fn::gfx::ssbo_upload(s.rng_seeds, 0,
|
|
static_cast<std::size_t>(s.n_sessions) * sizeof(unsigned int),
|
|
seeds.data());
|
|
}
|
|
|
|
void mc_session_sim_run(McSessionSim& s, const McSessionParams& p) {
|
|
if (s.program == 0 || s.n_sessions <= 0) return;
|
|
if (p.n_tiers <= 0 || p.tiers_q == nullptr || p.tiers_m == nullptr) return;
|
|
|
|
int nt = p.n_tiers;
|
|
if (nt > s.max_tiers) nt = s.max_tiers;
|
|
|
|
// Pack tiers como vec2 (q, m) — std430 vec2 = 8 bytes alineado.
|
|
std::vector<float> packed(static_cast<std::size_t>(s.max_tiers) * 2u, 0.0f);
|
|
for (int i = 0; i < nt; ++i) {
|
|
packed[2 * i + 0] = p.tiers_q[i];
|
|
packed[2 * i + 1] = p.tiers_m[i];
|
|
}
|
|
fn::gfx::ssbo_upload(s.tiers, 0,
|
|
packed.size() * sizeof(float),
|
|
packed.data());
|
|
|
|
glUseProgram(s.program);
|
|
fn::gfx::ssbo_bind(s.summary, 0);
|
|
fn::gfx::ssbo_bind(s.tier_counts, 1);
|
|
fn::gfx::ssbo_bind(s.tiers, 2);
|
|
fn::gfx::ssbo_bind(s.rng_seeds, kSeedBinding);
|
|
|
|
glUniform1ui(static_cast<GLint>(s.loc_n_sessions), static_cast<GLuint>(s.n_sessions));
|
|
glUniform1ui(static_cast<GLint>(s.loc_n_spins), static_cast<GLuint>(p.n_spins));
|
|
glUniform1ui(static_cast<GLint>(s.loc_n_tiers), static_cast<GLuint>(nt));
|
|
glUniform1f (static_cast<GLint>(s.loc_p_base), p.p_base);
|
|
glUniform1f (static_cast<GLint>(s.loc_cost), p.cost);
|
|
glUniform1f (static_cast<GLint>(s.loc_start_bal), p.start_balance);
|
|
glUniform1i (static_cast<GLint>(s.loc_mode), static_cast<int>(p.mode));
|
|
glUniform1f (static_cast<GLint>(s.loc_mode_p1), p.mode_p1);
|
|
glUniform1f (static_cast<GLint>(s.loc_mode_p2), p.mode_p2);
|
|
|
|
fn::gfx::dispatch_1d(s.n_sessions, 64);
|
|
fn::gfx::barrier_buffer_update();
|
|
}
|
|
|
|
void mc_session_sim_readback_summary(const McSessionSim& s, float* out) {
|
|
if (s.summary.id == 0 || s.n_sessions <= 0 || out == nullptr) return;
|
|
fn::gfx::ssbo_readback(
|
|
s.summary, 0,
|
|
static_cast<std::size_t>(s.n_sessions) * 8u * sizeof(float),
|
|
out);
|
|
}
|
|
|
|
void mc_session_sim_readback_tier_counts(const McSessionSim& s,
|
|
unsigned int* out) {
|
|
if (s.tier_counts.id == 0 || s.n_sessions <= 0 || out == nullptr) return;
|
|
fn::gfx::ssbo_readback(
|
|
s.tier_counts, 0,
|
|
static_cast<std::size_t>(s.n_sessions) *
|
|
static_cast<std::size_t>(s.max_tiers) * sizeof(unsigned int),
|
|
out);
|
|
}
|
|
|
|
void mc_session_sim_destroy(McSessionSim& s) {
|
|
fn::gfx::delete_compute_program(s.program);
|
|
s.program = 0;
|
|
fn::gfx::ssbo_destroy(s.summary);
|
|
fn::gfx::ssbo_destroy(s.tier_counts);
|
|
fn::gfx::ssbo_destroy(s.tiers);
|
|
fn::gfx::ssbo_destroy(s.rng_seeds);
|
|
s.n_sessions = 0;
|
|
}
|
|
|
|
} // namespace fn::ds
|