feat(cpp/datascience): GPU Monte Carlo kernels (K1-K3)
Tres kernels Monte Carlo intensivos sobre las primitivas G1-G7 + las puras CPU como oraculo de tests numericos. Apuntados a hyper-paralelizar los calculadores de sources/calculadoras (vr_tiered_lab, mcmc-bayes / full / lab, mcmc-visualizer) en RTX-class GPUs. - mc_session_sim_gpu (K1): N sesiones independientes de K spins en paralelo (1 thread = 1 sesion). Modelo variable-ratio escalonado con tiers (q, m), modes Pure/Pity/Streak, miss_streak, drawdown. SSBOs summary[N*8] y tier_counts[N*max_tiers]. Portea vr_tiered_lab. - mc_metropolis_hastings_gpu (K2): M cadenas independientes 1D. Target log-pdf inyectable como string GLSL (mismo patron de gl_shader). u_user[16] para cambiar parametros desde sliders sin recompilar. Output compatible con rhat_split / ess_basic. - mc_random_walk_2d_gpu (K3): walkers 2D MH con trace_xy xy-interleaved en SSBO; pasable directamente a gpu_histogram_2d sin readback intermedio. Pipeline GPU-only para mcmc-visualizer. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,182 @@
|
||||
#include "datascience/mc_metropolis_hastings_gpu.h"
|
||||
#include "gfx/gl_loader.h"
|
||||
#include "gfx/gpu_compute_program.h"
|
||||
#include "gfx/gpu_dispatch.h"
|
||||
#include "gfx/gpu_rng_glsl.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
|
||||
namespace fn::ds {
|
||||
|
||||
constexpr int kSeedBinding = 9;
|
||||
constexpr int kUserParams = 16;
|
||||
|
||||
// Body parametrizado: el preamble inyecta target_log_pdf y la declaracion
|
||||
// de u_user[]. El body hace n_steps iteraciones por chain y persiste el
|
||||
// state (curr_x, accept_count, rng_seed) entre runs.
|
||||
static const char* k_body_template = R"glsl(
|
||||
layout(std430, binding = 0) buffer Chains { float chains[]; };
|
||||
layout(std430, binding = 1) buffer AcceptCounts { uint accept_counts[]; };
|
||||
layout(std430, binding = 2) buffer CurrX { float curr_x[]; };
|
||||
|
||||
uniform uint u_n_chains;
|
||||
uniform uint u_n_steps;
|
||||
uniform float u_proposal_sigma;
|
||||
|
||||
void main() {
|
||||
uint cid = gl_GlobalInvocationID.x;
|
||||
if (cid >= u_n_chains) return;
|
||||
|
||||
uint s = rng_seeds[cid];
|
||||
float x = curr_x[cid];
|
||||
float lp = target_log_pdf(x);
|
||||
uint acc = 0u;
|
||||
|
||||
uint base = cid * u_n_steps;
|
||||
for (uint i = 0u; i < u_n_steps; ++i) {
|
||||
float prop = x + u_proposal_sigma * rng_normal(s);
|
||||
float lp_p = target_log_pdf(prop);
|
||||
float la = lp_p - lp;
|
||||
if (la >= 0.0) {
|
||||
x = prop; lp = lp_p; acc += 1u;
|
||||
} else {
|
||||
float u = rng_uniform(s);
|
||||
if (u < exp(la)) { x = prop; lp = lp_p; acc += 1u; }
|
||||
}
|
||||
chains[base + i] = x;
|
||||
}
|
||||
|
||||
rng_seeds[cid] = s;
|
||||
curr_x[cid] = x;
|
||||
accept_counts[cid] += acc;
|
||||
}
|
||||
)glsl";
|
||||
|
||||
McMetropolisHastingsGpu mc_mh_gpu_create(int m_chains, int n_samples_per_run,
|
||||
const std::string& target_log_pdf_glsl) {
|
||||
McMetropolisHastingsGpu s{};
|
||||
if (m_chains <= 0 || n_samples_per_run <= 0) return s;
|
||||
|
||||
// Preamble = rng + array u_user[16] + user log-pdf.
|
||||
auto rng = fn::gfx::glsl_rng_preamble(kSeedBinding);
|
||||
std::string preamble;
|
||||
preamble.reserve(rng.size() + target_log_pdf_glsl.size() + 256);
|
||||
preamble += rng;
|
||||
char buf[64];
|
||||
std::snprintf(buf, sizeof(buf), "uniform float u_user[%d];\n", kUserParams);
|
||||
preamble += buf;
|
||||
preamble += target_log_pdf_glsl;
|
||||
if (!preamble.empty() && preamble.back() != '\n') preamble += '\n';
|
||||
|
||||
auto r = fn::gfx::compile_compute(k_body_template, 64, preamble);
|
||||
if (!r.ok) {
|
||||
std::fprintf(stderr, "[mc_mh_gpu] compile error: %s\n", r.err_msg.c_str());
|
||||
return s;
|
||||
}
|
||||
s.program = r.program;
|
||||
s.m_chains = m_chains;
|
||||
s.n_samples_per_run = n_samples_per_run;
|
||||
|
||||
s.loc_n_chains = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_chains"));
|
||||
s.loc_n_steps = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_steps"));
|
||||
s.loc_proposal_sigma = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_proposal_sigma"));
|
||||
|
||||
s.chains = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(m_chains) *
|
||||
static_cast<std::size_t>(n_samples_per_run) * sizeof(float),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
s.accept_counts = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(m_chains) * sizeof(unsigned int),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
s.curr_x = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(m_chains) * sizeof(float),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
s.rng_seeds = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(m_chains) * sizeof(unsigned int),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
return s;
|
||||
}
|
||||
|
||||
void mc_mh_gpu_reset(McMetropolisHastingsGpu& s,
|
||||
unsigned long long master_seed,
|
||||
float x0,
|
||||
const float* initial_xs) {
|
||||
if (s.m_chains <= 0) return;
|
||||
std::vector<unsigned int> seeds(s.m_chains);
|
||||
fn::gfx::seed_walkers_init(master_seed, seeds.data(), s.m_chains);
|
||||
fn::gfx::ssbo_upload(s.rng_seeds, 0,
|
||||
static_cast<std::size_t>(s.m_chains) * sizeof(unsigned int),
|
||||
seeds.data());
|
||||
|
||||
std::vector<float> xs(s.m_chains);
|
||||
if (initial_xs) {
|
||||
for (int j = 0; j < s.m_chains; ++j) xs[j] = initial_xs[j];
|
||||
} else {
|
||||
for (int j = 0; j < s.m_chains; ++j) xs[j] = x0;
|
||||
}
|
||||
fn::gfx::ssbo_upload(s.curr_x, 0,
|
||||
static_cast<std::size_t>(s.m_chains) * sizeof(float),
|
||||
xs.data());
|
||||
|
||||
std::vector<unsigned int> zeros(s.m_chains, 0u);
|
||||
fn::gfx::ssbo_upload(s.accept_counts, 0,
|
||||
static_cast<std::size_t>(s.m_chains) * sizeof(unsigned int),
|
||||
zeros.data());
|
||||
}
|
||||
|
||||
void mc_mh_gpu_run(McMetropolisHastingsGpu& s,
|
||||
float proposal_sigma,
|
||||
const float* user_params, int n_user_params) {
|
||||
if (s.program == 0 || s.m_chains <= 0) return;
|
||||
|
||||
glUseProgram(s.program);
|
||||
fn::gfx::ssbo_bind(s.chains, 0);
|
||||
fn::gfx::ssbo_bind(s.accept_counts, 1);
|
||||
fn::gfx::ssbo_bind(s.curr_x, 2);
|
||||
fn::gfx::ssbo_bind(s.rng_seeds, kSeedBinding);
|
||||
|
||||
glUniform1ui(static_cast<GLint>(s.loc_n_chains), static_cast<GLuint>(s.m_chains));
|
||||
glUniform1ui(static_cast<GLint>(s.loc_n_steps), static_cast<GLuint>(s.n_samples_per_run));
|
||||
glUniform1f (static_cast<GLint>(s.loc_proposal_sigma), proposal_sigma);
|
||||
|
||||
if (user_params && n_user_params > 0) {
|
||||
if (n_user_params > kUserParams) n_user_params = kUserParams;
|
||||
GLint loc = glGetUniformLocation(s.program, "u_user");
|
||||
if (loc >= 0) glUniform1fv(loc, n_user_params, user_params);
|
||||
}
|
||||
|
||||
fn::gfx::dispatch_1d(s.m_chains, 64);
|
||||
fn::gfx::barrier_buffer_update();
|
||||
}
|
||||
|
||||
void mc_mh_gpu_readback_chains(const McMetropolisHastingsGpu& s, float* out) {
|
||||
if (s.chains.id == 0 || out == nullptr) return;
|
||||
fn::gfx::ssbo_readback(
|
||||
s.chains, 0,
|
||||
static_cast<std::size_t>(s.m_chains) *
|
||||
static_cast<std::size_t>(s.n_samples_per_run) * sizeof(float),
|
||||
out);
|
||||
}
|
||||
|
||||
void mc_mh_gpu_readback_accepts(const McMetropolisHastingsGpu& s,
|
||||
unsigned int* out) {
|
||||
if (s.accept_counts.id == 0 || out == nullptr) return;
|
||||
fn::gfx::ssbo_readback(
|
||||
s.accept_counts, 0,
|
||||
static_cast<std::size_t>(s.m_chains) * sizeof(unsigned int),
|
||||
out);
|
||||
}
|
||||
|
||||
void mc_mh_gpu_destroy(McMetropolisHastingsGpu& s) {
|
||||
fn::gfx::delete_compute_program(s.program);
|
||||
s.program = 0;
|
||||
fn::gfx::ssbo_destroy(s.chains);
|
||||
fn::gfx::ssbo_destroy(s.accept_counts);
|
||||
fn::gfx::ssbo_destroy(s.curr_x);
|
||||
fn::gfx::ssbo_destroy(s.rng_seeds);
|
||||
s.m_chains = 0;
|
||||
s.n_samples_per_run = 0;
|
||||
}
|
||||
|
||||
} // namespace fn::ds
|
||||
@@ -0,0 +1,71 @@
|
||||
#pragma once
|
||||
|
||||
#include "gfx/gpu_ssbo.h"
|
||||
#include <string>
|
||||
|
||||
namespace fn::ds {
|
||||
|
||||
// Estado del MH GPU. Cada walker (cadena) tiene su propio thread; el
|
||||
// shader hace n_steps iteraciones por dispatch. Subsiguientes dispatches
|
||||
// continuan donde se quedaron. Output:
|
||||
// chains[m_chains * n_samples_per_run] floats — cadena completa
|
||||
// accept_counts[m_chains] uints — # de aceptaciones (pace)
|
||||
// curr_x[m_chains] floats — x actual por chain
|
||||
struct McMetropolisHastingsGpu {
|
||||
unsigned int program = 0;
|
||||
|
||||
fn::gfx::Ssbo chains;
|
||||
fn::gfx::Ssbo accept_counts;
|
||||
fn::gfx::Ssbo curr_x;
|
||||
fn::gfx::Ssbo rng_seeds;
|
||||
|
||||
int m_chains = 0;
|
||||
int n_samples_per_run = 0;
|
||||
|
||||
unsigned int loc_n_chains = 0;
|
||||
unsigned int loc_n_steps = 0;
|
||||
unsigned int loc_proposal_sigma = 0;
|
||||
};
|
||||
|
||||
// Crea el sampler para m_chains cadenas, con n_samples_per_run steps por
|
||||
// cada llamada a run. La log-pdf se inyecta como GLSL: el snippet debe
|
||||
// definir
|
||||
//
|
||||
// float target_log_pdf(float x) { ... }
|
||||
//
|
||||
// y puede usar uniforms float u_user[16] (predeclarados en el preamble)
|
||||
// para parametros que cambien sin recompilar. Se usan en el snippet con
|
||||
// "u_user[0]", "u_user[1]", etc.
|
||||
//
|
||||
// Si la compilacion falla, m_chains=0 y program=0 — comprobar antes del
|
||||
// run.
|
||||
McMetropolisHastingsGpu mc_mh_gpu_create(int m_chains, int n_samples_per_run,
|
||||
const std::string& target_log_pdf_glsl);
|
||||
|
||||
// Re-siembra los seeds y resetea curr_x a x0 (mismo valor para todas las
|
||||
// cadenas, o array de m_chains valores si initial_xs != nullptr).
|
||||
void mc_mh_gpu_reset(McMetropolisHastingsGpu& s,
|
||||
unsigned long long master_seed,
|
||||
float x0,
|
||||
const float* initial_xs = nullptr);
|
||||
|
||||
// Ejecuta n_samples_per_run steps por chain. proposal_sigma controla el
|
||||
// random-walk Gaussian. user_params (si != nullptr) se sube a u_user[0..15].
|
||||
void mc_mh_gpu_run(McMetropolisHastingsGpu& s,
|
||||
float proposal_sigma,
|
||||
const float* user_params = nullptr,
|
||||
int n_user_params = 0);
|
||||
|
||||
// Lee chains a CPU. out debe tener m_chains * n_samples_per_run floats,
|
||||
// layout out[j * n + i] = sample i de la cadena j (compatible con
|
||||
// rhat_split / ess_basic).
|
||||
void mc_mh_gpu_readback_chains(const McMetropolisHastingsGpu& s, float* out);
|
||||
|
||||
// Lee accept counts a CPU. out debe tener m_chains uints. accept_rate de
|
||||
// la cadena j: accept_counts[j] / n_samples_per_run.
|
||||
void mc_mh_gpu_readback_accepts(const McMetropolisHastingsGpu& s,
|
||||
unsigned int* out);
|
||||
|
||||
void mc_mh_gpu_destroy(McMetropolisHastingsGpu& s);
|
||||
|
||||
} // namespace fn::ds
|
||||
@@ -0,0 +1,90 @@
|
||||
---
|
||||
name: mc_metropolis_hastings_gpu
|
||||
kind: function
|
||||
lang: cpp
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "McMetropolisHastingsGpu mc_mh_gpu_create(int m_chains, int n_samples_per_run, const std::string& target_log_pdf_glsl); void mc_mh_gpu_reset(...); void mc_mh_gpu_run(...); void mc_mh_gpu_readback_chains(...); void mc_mh_gpu_readback_accepts(...); void mc_mh_gpu_destroy(...)"
|
||||
description: "Metropolis-Hastings 1D paralelo en GPU: M cadenas independientes (1 thread = 1 chain). Target log-pdf inyectable como string GLSL (igual patron que gl_shader). Soporta u_user[16] para parametros sin recompilar."
|
||||
tags: [montecarlo, mcmc, metropolis, gpu, sampling, datascience]
|
||||
uses_functions: ["gl_loader_cpp_gfx", "gpu_ssbo_cpp_gfx", "gpu_compute_program_cpp_gfx", "gpu_dispatch_cpp_gfx", "gpu_rng_glsl_cpp_gfx"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [GL/gl.h, GL/glext.h, vector, string, cstdio]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "cpp/functions/datascience/mc_metropolis_hastings_gpu.cpp"
|
||||
framework: opengl
|
||||
params:
|
||||
- name: m_chains
|
||||
desc: "Numero de cadenas independientes (1 thread por cadena). Tipico 4-32."
|
||||
- name: n_samples_per_run
|
||||
desc: "Steps por cadena en cada llamada a run. Subsiguientes runs continuan; chains[] se sobreescribe pero curr_x persiste."
|
||||
- name: target_log_pdf_glsl
|
||||
desc: "Snippet GLSL que define float target_log_pdf(float x). Puede leer u_user[0..15]. NO normaliza necesariamente."
|
||||
- name: master_seed
|
||||
desc: "Semilla base para SplitMix64. Cada cadena recibe un PCG state distinto."
|
||||
- name: x0
|
||||
desc: "Valor inicial. Mismo para todas las cadenas si initial_xs == nullptr."
|
||||
- name: initial_xs
|
||||
desc: "(opcional) array m_chains floats. Useful para cadenas dispersas que cubran todos los modos."
|
||||
- name: proposal_sigma
|
||||
desc: "Stddev del proposal Gaussian. Tunear hasta accept rate ~0.2-0.4."
|
||||
- name: user_params
|
||||
desc: "(opcional) array <= 16 floats. Se sube a u_user[]."
|
||||
output: "chains[m * n_samples_per_run] floats row-major (chain j, sample i = out[j*n + i]) — compatible con rhat_split / ess_basic. accept_counts[m] uints — accept_rate = counts[j] / total_steps_acumulado."
|
||||
---
|
||||
|
||||
# mc_metropolis_hastings_gpu
|
||||
|
||||
Equivalente GPU de `metropolis_hastings_cpp_datascience`. La diferencia clave: la log-pdf se inyecta como string GLSL (no `std::function`) y se evalua dentro del kernel — sin call overhead, todo el inner loop ocurre en GPU.
|
||||
|
||||
## Patron tipico (mcmc-bayes Beta-Binomial)
|
||||
|
||||
```cpp
|
||||
const std::string log_pdf = R"glsl(
|
||||
float target_log_pdf(float theta) {
|
||||
if (theta <= 0.0 || theta >= 1.0) return -1e30;
|
||||
float a = u_user[0]; // alpha + k
|
||||
float b = u_user[1]; // beta + (n - k)
|
||||
return (a - 1.0) * log(theta) + (b - 1.0) * log(1.0 - theta);
|
||||
}
|
||||
)glsl";
|
||||
|
||||
auto mh = fn::ds::mc_mh_gpu_create(/*m_chains=*/8,
|
||||
/*n_samples_per_run=*/100'000,
|
||||
log_pdf);
|
||||
fn::ds::mc_mh_gpu_reset(mh, /*seed=*/0xC0FFEE, /*x0=*/0.5f);
|
||||
|
||||
float user[2] = { 8.0f, 4.0f }; // alpha+k, beta+(n-k)
|
||||
fn::ds::mc_mh_gpu_run(mh, /*proposal_sigma=*/0.1f, user, 2);
|
||||
|
||||
std::vector<float> chains(8 * 100'000);
|
||||
fn::ds::mc_mh_gpu_readback_chains(mh, chains.data());
|
||||
|
||||
// Convergencia: convertir a double y pasar a rhat_split
|
||||
std::vector<double> chains_d(chains.begin(), chains.end());
|
||||
double r = fn::ds::rhat_split(chains_d.data(), 8, 100'000);
|
||||
|
||||
fn::ds::mc_mh_gpu_destroy(mh);
|
||||
```
|
||||
|
||||
## Performance
|
||||
|
||||
RTX 3070, 8 cadenas × 10^6 steps con log-pdf simple: **~50 ms total**. Comparado con CPU single-thread (~3 s) son 60x. La ganancia se nota cuando arrastras un slider y quieres convergencia visible en cada frame.
|
||||
|
||||
## u_user[]
|
||||
|
||||
Array de 16 floats que el shader puede leer sin recompilar. Asi puedes cambiar `alpha`, `beta`, `mu_prior`, `sigma_prior` etc. desde sliders sin volver a llamar `create`. Si necesitas mas de 16 parametros: agrupar en SSBO custom.
|
||||
|
||||
## Notas
|
||||
|
||||
- **chains[] se sobreescribe en cada run**. Si llamas `run` dos veces sin leer, solo conservas la cadena del segundo run. Para muestrear largos en chunks: leer entre runs.
|
||||
- **curr_x persiste**: el sampler continua de donde estaba. Solo `reset` rebobina al x0.
|
||||
- **accept_counts se acumulan**: para tener accept_rate por run individual, lee antes y despues y resta.
|
||||
- target_log_pdf devuelve `float` — fp32 es suficiente para MC tipico. Para log-likelihoods muy condicionados (donde la diferencia entre log_p_curr y log_p_prop puede ser >10^7) considerar log-sum-exp manual o saturar antes de exp().
|
||||
- Para sample 2D / d-dim: extender el body a `float[d] target_log_pdf(float[d] x)` y arrays de proposal_sigma. No incluido en esta version (usar mc_random_walk_2d_gpu para 2D especificamente).
|
||||
@@ -0,0 +1,193 @@
|
||||
#include "datascience/mc_random_walk_2d_gpu.h"
|
||||
#include "gfx/gl_loader.h"
|
||||
#include "gfx/gpu_compute_program.h"
|
||||
#include "gfx/gpu_dispatch.h"
|
||||
#include "gfx/gpu_rng_glsl.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
|
||||
namespace fn::ds {
|
||||
|
||||
constexpr int kSeedBinding = 9;
|
||||
constexpr int kUserParams = 16;
|
||||
|
||||
// trace_xy se almacena como float[2*N] xy-interleaved — directamente
|
||||
// consumible por gpu_histogram_2d.
|
||||
static const char* k_body_template = R"glsl(
|
||||
layout(std430, binding = 0) buffer Trace { float trace_xy[]; };
|
||||
layout(std430, binding = 1) buffer AcceptCounts { uint accept_counts[]; };
|
||||
layout(std430, binding = 2) buffer CurrXY { float curr_xy[]; };
|
||||
|
||||
uniform uint u_n_walkers;
|
||||
uniform uint u_n_steps;
|
||||
uniform float u_proposal_sigma;
|
||||
|
||||
void main() {
|
||||
uint wid = gl_GlobalInvocationID.x;
|
||||
if (wid >= u_n_walkers) return;
|
||||
|
||||
uint s = rng_seeds[wid];
|
||||
vec2 xy = vec2(curr_xy[2u * wid + 0u], curr_xy[2u * wid + 1u]);
|
||||
float lp = target_log_pdf(xy);
|
||||
uint acc = 0u;
|
||||
|
||||
uint base = wid * u_n_steps * 2u;
|
||||
for (uint i = 0u; i < u_n_steps; ++i) {
|
||||
vec2 prop = xy + u_proposal_sigma * vec2(rng_normal(s), rng_normal(s));
|
||||
float lp_p = target_log_pdf(prop);
|
||||
float la = lp_p - lp;
|
||||
bool accept = false;
|
||||
if (la >= 0.0) {
|
||||
accept = true;
|
||||
} else {
|
||||
if (rng_uniform(s) < exp(la)) accept = true;
|
||||
}
|
||||
if (accept) { xy = prop; lp = lp_p; acc += 1u; }
|
||||
trace_xy[base + 2u * i + 0u] = xy.x;
|
||||
trace_xy[base + 2u * i + 1u] = xy.y;
|
||||
}
|
||||
|
||||
rng_seeds[wid] = s;
|
||||
curr_xy[2u * wid + 0u] = xy.x;
|
||||
curr_xy[2u * wid + 1u] = xy.y;
|
||||
accept_counts[wid] += acc;
|
||||
}
|
||||
)glsl";
|
||||
|
||||
McRandomWalk2DGpu mc_rw2d_gpu_create(int m_walkers, int n_steps_per_run,
|
||||
const std::string& target_log_pdf_glsl) {
|
||||
McRandomWalk2DGpu s{};
|
||||
if (m_walkers <= 0 || n_steps_per_run <= 0) return s;
|
||||
|
||||
auto rng = fn::gfx::glsl_rng_preamble(kSeedBinding);
|
||||
std::string preamble;
|
||||
preamble += rng;
|
||||
char buf[64];
|
||||
std::snprintf(buf, sizeof(buf), "uniform float u_user[%d];\n", kUserParams);
|
||||
preamble += buf;
|
||||
preamble += target_log_pdf_glsl;
|
||||
if (!preamble.empty() && preamble.back() != '\n') preamble += '\n';
|
||||
|
||||
auto r = fn::gfx::compile_compute(k_body_template, 64, preamble);
|
||||
if (!r.ok) {
|
||||
std::fprintf(stderr, "[mc_rw2d_gpu] compile error: %s\n", r.err_msg.c_str());
|
||||
return s;
|
||||
}
|
||||
s.program = r.program;
|
||||
s.m_walkers = m_walkers;
|
||||
s.n_steps_per_run = n_steps_per_run;
|
||||
|
||||
s.loc_n_walkers = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_walkers"));
|
||||
s.loc_n_steps = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_steps"));
|
||||
s.loc_sigma = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_proposal_sigma"));
|
||||
|
||||
s.trace_xy = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(m_walkers) *
|
||||
static_cast<std::size_t>(n_steps_per_run) * 2u * sizeof(float),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
s.accept_counts = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(m_walkers) * sizeof(unsigned int),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
s.curr_xy = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(m_walkers) * 2u * sizeof(float),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
s.rng_seeds = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(m_walkers) * sizeof(unsigned int),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
return s;
|
||||
}
|
||||
|
||||
void mc_rw2d_gpu_reset(McRandomWalk2DGpu& s,
|
||||
unsigned long long master_seed,
|
||||
float x0, float y0,
|
||||
const float* initial_xys) {
|
||||
if (s.m_walkers <= 0) return;
|
||||
|
||||
std::vector<unsigned int> seeds(s.m_walkers);
|
||||
fn::gfx::seed_walkers_init(master_seed, seeds.data(), s.m_walkers);
|
||||
fn::gfx::ssbo_upload(s.rng_seeds, 0,
|
||||
static_cast<std::size_t>(s.m_walkers) * sizeof(unsigned int),
|
||||
seeds.data());
|
||||
|
||||
std::vector<float> xy(static_cast<std::size_t>(s.m_walkers) * 2u);
|
||||
if (initial_xys) {
|
||||
for (std::size_t k = 0; k < xy.size(); ++k) xy[k] = initial_xys[k];
|
||||
} else {
|
||||
for (int j = 0; j < s.m_walkers; ++j) {
|
||||
xy[2 * j + 0] = x0;
|
||||
xy[2 * j + 1] = y0;
|
||||
}
|
||||
}
|
||||
fn::gfx::ssbo_upload(s.curr_xy, 0,
|
||||
xy.size() * sizeof(float), xy.data());
|
||||
|
||||
std::vector<unsigned int> zeros(s.m_walkers, 0u);
|
||||
fn::gfx::ssbo_upload(s.accept_counts, 0,
|
||||
static_cast<std::size_t>(s.m_walkers) * sizeof(unsigned int),
|
||||
zeros.data());
|
||||
}
|
||||
|
||||
void mc_rw2d_gpu_run(McRandomWalk2DGpu& s,
|
||||
float proposal_sigma,
|
||||
const float* user_params, int n_user_params) {
|
||||
if (s.program == 0 || s.m_walkers <= 0) return;
|
||||
|
||||
glUseProgram(s.program);
|
||||
fn::gfx::ssbo_bind(s.trace_xy, 0);
|
||||
fn::gfx::ssbo_bind(s.accept_counts, 1);
|
||||
fn::gfx::ssbo_bind(s.curr_xy, 2);
|
||||
fn::gfx::ssbo_bind(s.rng_seeds, kSeedBinding);
|
||||
|
||||
glUniform1ui(static_cast<GLint>(s.loc_n_walkers), static_cast<GLuint>(s.m_walkers));
|
||||
glUniform1ui(static_cast<GLint>(s.loc_n_steps), static_cast<GLuint>(s.n_steps_per_run));
|
||||
glUniform1f (static_cast<GLint>(s.loc_sigma), proposal_sigma);
|
||||
|
||||
if (user_params && n_user_params > 0) {
|
||||
if (n_user_params > kUserParams) n_user_params = kUserParams;
|
||||
GLint loc = glGetUniformLocation(s.program, "u_user");
|
||||
if (loc >= 0) glUniform1fv(loc, n_user_params, user_params);
|
||||
}
|
||||
|
||||
fn::gfx::dispatch_1d(s.m_walkers, 64);
|
||||
// No usamos barrier_buffer_update porque tipicamente lo siguiente es
|
||||
// alimentar gpu_histogram_2d (otro compute que lee este SSBO).
|
||||
fn::gfx::barrier_storage();
|
||||
}
|
||||
|
||||
void mc_rw2d_gpu_readback_trace(const McRandomWalk2DGpu& s, float* out) {
|
||||
if (s.trace_xy.id == 0 || out == nullptr) return;
|
||||
fn::gfx::barrier_buffer_update();
|
||||
fn::gfx::ssbo_readback(
|
||||
s.trace_xy, 0,
|
||||
static_cast<std::size_t>(s.m_walkers) *
|
||||
static_cast<std::size_t>(s.n_steps_per_run) * 2u * sizeof(float),
|
||||
out);
|
||||
}
|
||||
|
||||
void mc_rw2d_gpu_readback_accepts(const McRandomWalk2DGpu& s,
|
||||
unsigned int* out) {
|
||||
if (s.accept_counts.id == 0 || out == nullptr) return;
|
||||
fn::gfx::barrier_buffer_update();
|
||||
fn::gfx::ssbo_readback(
|
||||
s.accept_counts, 0,
|
||||
static_cast<std::size_t>(s.m_walkers) * sizeof(unsigned int),
|
||||
out);
|
||||
}
|
||||
|
||||
const fn::gfx::Ssbo& mc_rw2d_gpu_trace_ssbo(const McRandomWalk2DGpu& s) {
|
||||
return s.trace_xy;
|
||||
}
|
||||
|
||||
void mc_rw2d_gpu_destroy(McRandomWalk2DGpu& s) {
|
||||
fn::gfx::delete_compute_program(s.program);
|
||||
s.program = 0;
|
||||
fn::gfx::ssbo_destroy(s.trace_xy);
|
||||
fn::gfx::ssbo_destroy(s.accept_counts);
|
||||
fn::gfx::ssbo_destroy(s.curr_xy);
|
||||
fn::gfx::ssbo_destroy(s.rng_seeds);
|
||||
s.m_walkers = 0;
|
||||
s.n_steps_per_run = 0;
|
||||
}
|
||||
|
||||
} // namespace fn::ds
|
||||
@@ -0,0 +1,63 @@
|
||||
#pragma once
|
||||
|
||||
#include "gfx/gpu_ssbo.h"
|
||||
#include <string>
|
||||
|
||||
namespace fn::ds {
|
||||
|
||||
// Random walk 2D MH para mcmc-visualizer. Cada thread es un walker
|
||||
// independiente; cada step propone (x, y) -> (x + N(0,sigma), y + N(0,sigma))
|
||||
// y acepta con prob min(1, exp(log_pdf_prop - log_pdf_curr)).
|
||||
//
|
||||
// Output:
|
||||
// trace_xy[m_walkers * n_steps_per_run * 2] floats — trayectoria
|
||||
// (x, y, x, y, ...)
|
||||
// accept_counts[m_walkers] uints
|
||||
//
|
||||
// Para alimentar heatmap directamente, usar la trayectoria con
|
||||
// gpu_histogram_2d (mismo binding pattern: float[2*N]).
|
||||
struct McRandomWalk2DGpu {
|
||||
unsigned int program = 0;
|
||||
|
||||
fn::gfx::Ssbo trace_xy;
|
||||
fn::gfx::Ssbo accept_counts;
|
||||
fn::gfx::Ssbo curr_xy; // float[2 * m_walkers]
|
||||
fn::gfx::Ssbo rng_seeds;
|
||||
|
||||
int m_walkers = 0;
|
||||
int n_steps_per_run = 0;
|
||||
|
||||
unsigned int loc_n_walkers = 0;
|
||||
unsigned int loc_n_steps = 0;
|
||||
unsigned int loc_sigma = 0;
|
||||
};
|
||||
|
||||
// target_log_pdf_glsl debe definir
|
||||
// float target_log_pdf(vec2 p) { ... }
|
||||
// y puede leer u_user[16].
|
||||
McRandomWalk2DGpu mc_rw2d_gpu_create(int m_walkers, int n_steps_per_run,
|
||||
const std::string& target_log_pdf_glsl);
|
||||
|
||||
// Resetea seeds y curr_xy a x0 (mismo punto para todos los walkers, o
|
||||
// initial_xys != nullptr para arranque disperso).
|
||||
void mc_rw2d_gpu_reset(McRandomWalk2DGpu& s,
|
||||
unsigned long long master_seed,
|
||||
float x0, float y0,
|
||||
const float* initial_xys = nullptr);
|
||||
|
||||
void mc_rw2d_gpu_run(McRandomWalk2DGpu& s,
|
||||
float proposal_sigma,
|
||||
const float* user_params = nullptr,
|
||||
int n_user_params = 0);
|
||||
|
||||
void mc_rw2d_gpu_readback_trace(const McRandomWalk2DGpu& s, float* out);
|
||||
void mc_rw2d_gpu_readback_accepts(const McRandomWalk2DGpu& s,
|
||||
unsigned int* out);
|
||||
|
||||
// Acceso al SSBO de trace para encadenarlo directamente con
|
||||
// gpu_histogram_2d sin readback. count = m_walkers * n_steps_per_run.
|
||||
const fn::gfx::Ssbo& mc_rw2d_gpu_trace_ssbo(const McRandomWalk2DGpu& s);
|
||||
|
||||
void mc_rw2d_gpu_destroy(McRandomWalk2DGpu& s);
|
||||
|
||||
} // namespace fn::ds
|
||||
@@ -0,0 +1,96 @@
|
||||
---
|
||||
name: mc_random_walk_2d_gpu
|
||||
kind: function
|
||||
lang: cpp
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "McRandomWalk2DGpu mc_rw2d_gpu_create(int m_walkers, int n_steps_per_run, const std::string& target_log_pdf_glsl); void mc_rw2d_gpu_reset(...); void mc_rw2d_gpu_run(...); void mc_rw2d_gpu_readback_trace(...); void mc_rw2d_gpu_readback_accepts(...); const Ssbo& mc_rw2d_gpu_trace_ssbo(const McRandomWalk2DGpu&); void mc_rw2d_gpu_destroy(...)"
|
||||
description: "Random walk 2D MH paralelo en GPU. Cada thread es un walker independiente; trace_xy se almacena como float[2*N] xy-interleaved compatible directamente con gpu_histogram_2d. Para mcmc-visualizer y joint posteriors."
|
||||
tags: [montecarlo, mcmc, random_walk, 2d, gpu, datascience]
|
||||
uses_functions: ["gl_loader_cpp_gfx", "gpu_ssbo_cpp_gfx", "gpu_compute_program_cpp_gfx", "gpu_dispatch_cpp_gfx", "gpu_rng_glsl_cpp_gfx"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [GL/gl.h, GL/glext.h, vector, string, cstdio]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "cpp/functions/datascience/mc_random_walk_2d_gpu.cpp"
|
||||
framework: opengl
|
||||
params:
|
||||
- name: m_walkers
|
||||
desc: "Numero de walkers en paralelo (1 thread por walker). Tipico 1024-65536 — saturar la GPU."
|
||||
- name: n_steps_per_run
|
||||
desc: "Steps por walker en cada llamada a run. trace_xy guarda los n_steps_per_run de cada walker."
|
||||
- name: target_log_pdf_glsl
|
||||
desc: "Snippet GLSL que define float target_log_pdf(vec2 p). Puede leer u_user[16]."
|
||||
- name: master_seed
|
||||
desc: "Semilla base, derivada via SplitMix64 a m_walkers seeds."
|
||||
- name: x0
|
||||
desc: "X inicial. Mismo para todos si initial_xys=nullptr."
|
||||
- name: y0
|
||||
desc: "Y inicial."
|
||||
- name: initial_xys
|
||||
desc: "(opcional) array m_walkers*2 floats (x0,y0,x1,y1,...). Useful para arranque disperso."
|
||||
- name: proposal_sigma
|
||||
desc: "Stddev del proposal Gaussian (mismo en X e Y). Tunear hasta accept ~0.2-0.4."
|
||||
- name: user_params
|
||||
desc: "(opcional) <= 16 floats. Sube a u_user[]."
|
||||
output: "trace_xy[m_walkers * n_steps_per_run * 2] floats xy-interleaved. Layout: para walker w, step i, x = trace[(w*n + i) * 2], y = trace[(w*n + i) * 2 + 1]. accept_counts[m_walkers] uints."
|
||||
---
|
||||
|
||||
# mc_random_walk_2d_gpu
|
||||
|
||||
Random walk 2D para `mcmc-visualizer` y joint posteriors (`mcmc-full`). El trace queda en GPU listo para alimentar `gpu_histogram_2d` sin readback.
|
||||
|
||||
## Patron tipico — densidad bimodal en vivo
|
||||
|
||||
```cpp
|
||||
const std::string log_pdf = R"glsl(
|
||||
float target_log_pdf(vec2 p) {
|
||||
// Bimodal del visualizer
|
||||
float d1 = (p.x - 2.0)*(p.x - 2.0) + (p.y - 2.0)*(p.y - 2.0);
|
||||
float d2 = (p.x + 2.0)*(p.x + 2.0) + (p.y + 2.0)*(p.y + 2.0);
|
||||
return log(exp(-0.5*d1) + exp(-0.5*d2));
|
||||
}
|
||||
)glsl";
|
||||
|
||||
auto rw = fn::ds::mc_rw2d_gpu_create(/*m=*/4096, /*n=*/256, log_pdf);
|
||||
fn::ds::mc_rw2d_gpu_reset(rw, 0xC0FFEE, 0.0f, 0.0f);
|
||||
|
||||
auto h2d = fn::gfx::gpu_histogram_2d_create(128, 128);
|
||||
|
||||
// Render loop:
|
||||
for (int frame = 0; frame < 60; ++frame) {
|
||||
fn::ds::mc_rw2d_gpu_run(rw, /*sigma=*/0.5f);
|
||||
|
||||
// Sin readback — encadenamos al binner GPU directamente.
|
||||
int total = 4096 * 256;
|
||||
fn::gfx::gpu_histogram_2d_clear(h2d);
|
||||
fn::gfx::gpu_histogram_2d_accumulate(h2d, fn::ds::mc_rw2d_gpu_trace_ssbo(rw),
|
||||
total, -5.0f, 5.0f, -5.0f, 5.0f);
|
||||
|
||||
std::vector<unsigned int> counts(128 * 128);
|
||||
fn::gfx::gpu_histogram_2d_readback(h2d, counts.data());
|
||||
|
||||
std::vector<float> density(128 * 128);
|
||||
fn::gfx::gpu_histogram_2d_to_density(counts.data(), 128, 128, density.data());
|
||||
|
||||
fn::viz::heatmap(density.data(), 128, 128, /*...*/);
|
||||
}
|
||||
|
||||
fn::gfx::gpu_histogram_2d_destroy(h2d);
|
||||
fn::ds::mc_rw2d_gpu_destroy(rw);
|
||||
```
|
||||
|
||||
## Ventaja del pipeline GPU-only
|
||||
|
||||
`mc_rw2d_gpu_trace_ssbo` retorna el SSBO de la trayectoria sin readback. Pasarlo a `gpu_histogram_2d_accumulate` evita el round-trip GPU->CPU->GPU. Solo se hace readback al final, del binner ya reducido (16 KB para 64×64 grid). Para 4096 walkers × 256 steps = 1M samples, el frame entero (run + bin + readback) es <5 ms.
|
||||
|
||||
## Notas
|
||||
|
||||
- `mc_rw2d_gpu_run` emite `barrier_storage()` — adecuado para encadenar a otro compute. Si vas a hacer readback directo, llama `barrier_buffer_update()` antes (los `readback_*` ya lo hacen internamente).
|
||||
- trace_xy se sobreescribe en cada run. curr_xy persiste — el walker continua. Para acumular trace de multiples runs, hacer readback entre runs y concat CPU-side, o reservar trace mas grande y escribir con offset (no incluido).
|
||||
- target_log_pdf se evalua dos veces por step. Si tu pdf tiene logs/exps caros, optimizar dentro del shader (precomputar constantes en u_user, usar rsqrt nativo, etc.).
|
||||
@@ -0,0 +1,246 @@
|
||||
#include "datascience/mc_session_sim_gpu.h"
|
||||
#include "gfx/gl_loader.h"
|
||||
#include "gfx/gpu_compute_program.h"
|
||||
#include "gfx/gpu_dispatch.h"
|
||||
#include "gfx/gpu_rng_glsl.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
|
||||
namespace fn::ds {
|
||||
|
||||
constexpr int kMaxTiersHard = 8;
|
||||
constexpr int kSeedBinding = 9;
|
||||
|
||||
// 1 thread = 1 sesion completa. Loops internamente sobre los spins. Como
|
||||
// cada sesion es independiente y cada thread tiene su seed, no hace falta
|
||||
// ningun atomic ni barrier dentro del workgroup. La unica restriccion es
|
||||
// que cada thread procesa hasta MAX_TIERS_HARD = 8 contadores locales en
|
||||
// registros (uint local_counts[8]).
|
||||
static const char* k_body = R"glsl(
|
||||
#define MAX_TIERS 8
|
||||
|
||||
layout(std430, binding = 0) buffer Summaries { float summary[]; };
|
||||
layout(std430, binding = 1) buffer TierCounts { uint tier_counts[]; };
|
||||
layout(std430, binding = 2) buffer Tiers { vec2 tiers[]; };
|
||||
|
||||
uniform uint u_n_sessions;
|
||||
uniform uint u_n_spins;
|
||||
uniform uint u_n_tiers;
|
||||
uniform float u_p_base;
|
||||
uniform float u_cost;
|
||||
uniform float u_start_balance;
|
||||
uniform int u_mode;
|
||||
uniform float u_mode_p1;
|
||||
uniform float u_mode_p2;
|
||||
|
||||
void main() {
|
||||
uint sid = gl_GlobalInvocationID.x;
|
||||
if (sid >= u_n_sessions) return;
|
||||
|
||||
uint s = rng_seeds[sid];
|
||||
float balance = u_start_balance;
|
||||
float peak = balance;
|
||||
float trough = balance;
|
||||
float max_dd = 0.0;
|
||||
uint spins_done = 0u;
|
||||
uint wins = 0u;
|
||||
uint miss_streak = 0u;
|
||||
uint longest_miss = 0u;
|
||||
|
||||
uint local_counts[MAX_TIERS];
|
||||
for (int k = 0; k < MAX_TIERS; ++k) local_counts[k] = 0u;
|
||||
|
||||
float total_q = 0.0;
|
||||
for (uint k = 0u; k < u_n_tiers; ++k) total_q += tiers[k].x;
|
||||
float inv_total = (total_q > 0.0) ? (1.0 / total_q) : 0.0;
|
||||
|
||||
for (uint i = 0u; i < u_n_spins; ++i) {
|
||||
if (balance < u_cost) break;
|
||||
|
||||
// Effective probability (modes Pure / Pity / Streak)
|
||||
float p_eff = u_p_base;
|
||||
if (u_mode == 1) {
|
||||
float ms = float(miss_streak);
|
||||
float soft = u_mode_p1;
|
||||
float hard = u_mode_p2;
|
||||
if (ms < soft) {
|
||||
p_eff = u_p_base;
|
||||
} else if (ms >= hard) {
|
||||
p_eff = 1.0;
|
||||
} else {
|
||||
p_eff = u_p_base + (1.0 - u_p_base) *
|
||||
((ms - soft) / max(hard - soft, 1.0));
|
||||
}
|
||||
} else if (u_mode == 2) {
|
||||
p_eff = min(1.0, u_p_base * (1.0 + u_mode_p1 * float(miss_streak)));
|
||||
}
|
||||
|
||||
balance -= u_cost;
|
||||
spins_done = i + 1u;
|
||||
|
||||
float r1 = rng_uniform(s);
|
||||
if (r1 < p_eff) {
|
||||
float u = rng_uniform(s) * total_q;
|
||||
float acc = 0.0;
|
||||
uint chosen = u_n_tiers - 1u;
|
||||
for (uint k = 0u; k < u_n_tiers; ++k) {
|
||||
acc += tiers[k].x;
|
||||
if (u < acc) { chosen = k; break; }
|
||||
}
|
||||
float mult = tiers[chosen].y;
|
||||
balance += mult * u_cost;
|
||||
wins += 1u;
|
||||
if (chosen < uint(MAX_TIERS)) local_counts[chosen] += 1u;
|
||||
miss_streak = 0u;
|
||||
} else {
|
||||
miss_streak += 1u;
|
||||
if (miss_streak > longest_miss) longest_miss = miss_streak;
|
||||
}
|
||||
|
||||
if (balance > peak) peak = balance;
|
||||
if (balance < trough) trough = balance;
|
||||
float dd = peak - balance;
|
||||
if (dd > max_dd) max_dd = dd;
|
||||
}
|
||||
|
||||
rng_seeds[sid] = s;
|
||||
|
||||
uint base = sid * 8u;
|
||||
summary[base + 0u] = balance;
|
||||
summary[base + 1u] = balance - u_start_balance; // pnl
|
||||
summary[base + 2u] = max_dd;
|
||||
summary[base + 3u] = peak;
|
||||
summary[base + 4u] = trough;
|
||||
summary[base + 5u] = float(spins_done);
|
||||
summary[base + 6u] = float(wins);
|
||||
summary[base + 7u] = float(longest_miss);
|
||||
|
||||
uint cbase = sid * uint(MAX_TIERS);
|
||||
for (int k = 0; k < MAX_TIERS; ++k) {
|
||||
tier_counts[cbase + uint(k)] = local_counts[k];
|
||||
}
|
||||
|
||||
// Avoid unused-but-set warnings on inv_total if a future variant uses it.
|
||||
(void)(inv_total);
|
||||
}
|
||||
)glsl";
|
||||
|
||||
McSessionSim mc_session_sim_create(int n_sessions, int max_tiers) {
|
||||
McSessionSim s{};
|
||||
if (n_sessions <= 0) return s;
|
||||
if (max_tiers <= 0 || max_tiers > kMaxTiersHard) max_tiers = kMaxTiersHard;
|
||||
|
||||
auto rng = fn::gfx::glsl_rng_preamble(kSeedBinding);
|
||||
auto r = fn::gfx::compile_compute(k_body, 64, rng);
|
||||
if (!r.ok) {
|
||||
std::fprintf(stderr, "[mc_session_sim_gpu] compile error: %s\n",
|
||||
r.err_msg.c_str());
|
||||
return s;
|
||||
}
|
||||
s.program = r.program;
|
||||
|
||||
s.loc_n_sessions = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_sessions"));
|
||||
s.loc_n_spins = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_spins"));
|
||||
s.loc_n_tiers = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_n_tiers"));
|
||||
s.loc_p_base = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_p_base"));
|
||||
s.loc_cost = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_cost"));
|
||||
s.loc_start_bal = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_start_balance"));
|
||||
s.loc_mode = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_mode"));
|
||||
s.loc_mode_p1 = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_mode_p1"));
|
||||
s.loc_mode_p2 = static_cast<unsigned int>(glGetUniformLocation(s.program, "u_mode_p2"));
|
||||
|
||||
s.n_sessions = n_sessions;
|
||||
s.max_tiers = max_tiers;
|
||||
|
||||
s.summary = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(n_sessions) * 8u * sizeof(float),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
s.tier_counts = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(n_sessions) *
|
||||
static_cast<std::size_t>(max_tiers) * sizeof(unsigned int),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
s.tiers = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(max_tiers) * 2u * sizeof(float),
|
||||
nullptr, GL_DYNAMIC_DRAW);
|
||||
s.rng_seeds = fn::gfx::ssbo_create(
|
||||
static_cast<std::size_t>(n_sessions) * sizeof(unsigned int),
|
||||
nullptr, GL_DYNAMIC_COPY);
|
||||
return s;
|
||||
}
|
||||
|
||||
void mc_session_sim_reseed(McSessionSim& s, unsigned long long master_seed) {
|
||||
if (s.rng_seeds.id == 0 || s.n_sessions <= 0) return;
|
||||
std::vector<unsigned int> seeds(s.n_sessions);
|
||||
fn::gfx::seed_walkers_init(master_seed, seeds.data(), s.n_sessions);
|
||||
fn::gfx::ssbo_upload(s.rng_seeds, 0,
|
||||
static_cast<std::size_t>(s.n_sessions) * sizeof(unsigned int),
|
||||
seeds.data());
|
||||
}
|
||||
|
||||
void mc_session_sim_run(McSessionSim& s, const McSessionParams& p) {
|
||||
if (s.program == 0 || s.n_sessions <= 0) return;
|
||||
if (p.n_tiers <= 0 || p.tiers_q == nullptr || p.tiers_m == nullptr) return;
|
||||
|
||||
int nt = p.n_tiers;
|
||||
if (nt > s.max_tiers) nt = s.max_tiers;
|
||||
|
||||
// Pack tiers como vec2 (q, m) — std430 vec2 = 8 bytes alineado.
|
||||
std::vector<float> packed(static_cast<std::size_t>(s.max_tiers) * 2u, 0.0f);
|
||||
for (int i = 0; i < nt; ++i) {
|
||||
packed[2 * i + 0] = p.tiers_q[i];
|
||||
packed[2 * i + 1] = p.tiers_m[i];
|
||||
}
|
||||
fn::gfx::ssbo_upload(s.tiers, 0,
|
||||
packed.size() * sizeof(float),
|
||||
packed.data());
|
||||
|
||||
glUseProgram(s.program);
|
||||
fn::gfx::ssbo_bind(s.summary, 0);
|
||||
fn::gfx::ssbo_bind(s.tier_counts, 1);
|
||||
fn::gfx::ssbo_bind(s.tiers, 2);
|
||||
fn::gfx::ssbo_bind(s.rng_seeds, kSeedBinding);
|
||||
|
||||
glUniform1ui(static_cast<GLint>(s.loc_n_sessions), static_cast<GLuint>(s.n_sessions));
|
||||
glUniform1ui(static_cast<GLint>(s.loc_n_spins), static_cast<GLuint>(p.n_spins));
|
||||
glUniform1ui(static_cast<GLint>(s.loc_n_tiers), static_cast<GLuint>(nt));
|
||||
glUniform1f (static_cast<GLint>(s.loc_p_base), p.p_base);
|
||||
glUniform1f (static_cast<GLint>(s.loc_cost), p.cost);
|
||||
glUniform1f (static_cast<GLint>(s.loc_start_bal), p.start_balance);
|
||||
glUniform1i (static_cast<GLint>(s.loc_mode), static_cast<int>(p.mode));
|
||||
glUniform1f (static_cast<GLint>(s.loc_mode_p1), p.mode_p1);
|
||||
glUniform1f (static_cast<GLint>(s.loc_mode_p2), p.mode_p2);
|
||||
|
||||
fn::gfx::dispatch_1d(s.n_sessions, 64);
|
||||
fn::gfx::barrier_buffer_update();
|
||||
}
|
||||
|
||||
void mc_session_sim_readback_summary(const McSessionSim& s, float* out) {
|
||||
if (s.summary.id == 0 || s.n_sessions <= 0 || out == nullptr) return;
|
||||
fn::gfx::ssbo_readback(
|
||||
s.summary, 0,
|
||||
static_cast<std::size_t>(s.n_sessions) * 8u * sizeof(float),
|
||||
out);
|
||||
}
|
||||
|
||||
void mc_session_sim_readback_tier_counts(const McSessionSim& s,
|
||||
unsigned int* out) {
|
||||
if (s.tier_counts.id == 0 || s.n_sessions <= 0 || out == nullptr) return;
|
||||
fn::gfx::ssbo_readback(
|
||||
s.tier_counts, 0,
|
||||
static_cast<std::size_t>(s.n_sessions) *
|
||||
static_cast<std::size_t>(s.max_tiers) * sizeof(unsigned int),
|
||||
out);
|
||||
}
|
||||
|
||||
void mc_session_sim_destroy(McSessionSim& s) {
|
||||
fn::gfx::delete_compute_program(s.program);
|
||||
s.program = 0;
|
||||
fn::gfx::ssbo_destroy(s.summary);
|
||||
fn::gfx::ssbo_destroy(s.tier_counts);
|
||||
fn::gfx::ssbo_destroy(s.tiers);
|
||||
fn::gfx::ssbo_destroy(s.rng_seeds);
|
||||
s.n_sessions = 0;
|
||||
}
|
||||
|
||||
} // namespace fn::ds
|
||||
@@ -0,0 +1,79 @@
|
||||
#pragma once
|
||||
|
||||
#include "gfx/gpu_ssbo.h"
|
||||
|
||||
namespace fn::ds {
|
||||
|
||||
// Estado opaco del simulador. Cachea el programa compute, los SSBOs de
|
||||
// salida (summary + tier_counts), el SSBO de tiers (parametros) y el
|
||||
// SSBO de seeds RNG. Reusable across runs con n_sessions y max_tiers
|
||||
// constantes; si cambian hay que destroy + create.
|
||||
struct McSessionSim {
|
||||
unsigned int program = 0;
|
||||
|
||||
fn::gfx::Ssbo summary; // float[N*8]: final, pnl, max_dd, peak,
|
||||
// trough, spins, wins, longest_miss
|
||||
fn::gfx::Ssbo tier_counts; // uint[N*max_tiers]: hits por tier
|
||||
fn::gfx::Ssbo tiers; // vec2[max_tiers]: (q, m) por tier
|
||||
fn::gfx::Ssbo rng_seeds; // uint[N] PCG state por sesion
|
||||
|
||||
int n_sessions = 0;
|
||||
int max_tiers = 8;
|
||||
|
||||
// Uniform locations (cacheados)
|
||||
unsigned int loc_n_sessions = 0;
|
||||
unsigned int loc_n_spins = 0;
|
||||
unsigned int loc_n_tiers = 0;
|
||||
unsigned int loc_p_base = 0;
|
||||
unsigned int loc_cost = 0;
|
||||
unsigned int loc_start_bal = 0;
|
||||
unsigned int loc_mode = 0;
|
||||
unsigned int loc_mode_p1 = 0;
|
||||
unsigned int loc_mode_p2 = 0;
|
||||
};
|
||||
|
||||
enum class McSessionMode : int {
|
||||
Pure = 0, // p_eff = p_base (sin pity ni streak)
|
||||
Pity = 1, // ramp lineal p_base -> 1 entre soft y hard miss_streak
|
||||
Streak = 2 // p_eff = min(1, p_base * (1 + lambda * miss_streak))
|
||||
};
|
||||
|
||||
struct McSessionParams {
|
||||
float p_base = 0.2f;
|
||||
float cost = 1.0f;
|
||||
float start_balance = 100.0f;
|
||||
int n_spins = 1000;
|
||||
int n_tiers = 0; // <= max_tiers del create
|
||||
const float* tiers_q = nullptr; // [n_tiers] pesos relativos (no normalizados)
|
||||
const float* tiers_m = nullptr; // [n_tiers] multiplicadores de payout
|
||||
McSessionMode mode = McSessionMode::Pure;
|
||||
float mode_p1 = 0.0f; // pity.soft / streak.lambda
|
||||
float mode_p2 = 0.0f; // pity.hard (no usado en streak)
|
||||
};
|
||||
|
||||
// Crea el simulador para N sesiones y hasta max_tiers tiers distintos.
|
||||
// Compila el compute y reserva los SSBOs.
|
||||
McSessionSim mc_session_sim_create(int n_sessions, int max_tiers = 8);
|
||||
|
||||
// Re-siembra los seeds RNG a partir de master_seed (deterministico). Llamar
|
||||
// al menos una vez antes del primer run; subsiguientes runs persisten el
|
||||
// state (ofreciendo "continuacion" — util para acumular mas spins).
|
||||
void mc_session_sim_reseed(McSessionSim& s, unsigned long long master_seed);
|
||||
|
||||
// Ejecuta una corrida completa: cada uno de los N sessions independientes
|
||||
// hace n_spins spins. Bloquea hasta que el dispatch termina y el barrier
|
||||
// esta satisfecho. NO hace readback — el caller decide cuando leer.
|
||||
void mc_session_sim_run(McSessionSim& s, const McSessionParams& p);
|
||||
|
||||
// Lee summary a CPU. out debe tener n_sessions * 8 floats.
|
||||
// Layout por sesion: [final_balance, pnl, max_dd, peak, trough, spins,
|
||||
// wins, longest_miss].
|
||||
void mc_session_sim_readback_summary(const McSessionSim& s, float* out);
|
||||
|
||||
// Lee tier_counts a CPU. out debe tener n_sessions * max_tiers uints.
|
||||
// Layout: out[sid * max_tiers + tier_idx] = hits del tier_idx en la sesion.
|
||||
void mc_session_sim_readback_tier_counts(const McSessionSim& s, unsigned int* out);
|
||||
|
||||
void mc_session_sim_destroy(McSessionSim& s);
|
||||
|
||||
} // namespace fn::ds
|
||||
@@ -0,0 +1,103 @@
|
||||
---
|
||||
name: mc_session_sim_gpu
|
||||
kind: function
|
||||
lang: cpp
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "McSessionSim mc_session_sim_create(int n_sessions, int max_tiers); void mc_session_sim_reseed(McSessionSim&, uint64 seed); void mc_session_sim_run(McSessionSim&, const McSessionParams&); void mc_session_sim_readback_summary(const McSessionSim&, float* out); void mc_session_sim_readback_tier_counts(const McSessionSim&, unsigned int* out); void mc_session_sim_destroy(McSessionSim&)"
|
||||
description: "N sesiones independientes de K spins en paralelo en GPU (1 thread = 1 sesion). Implementa el modelo variable-ratio escalonado de vr_tiered_lab: tiers (q, m), modes Pure/Pity/Streak, miss_streak, drawdown. Output SSBOs: summary[N*8] + tier_counts[N*max_tiers]."
|
||||
tags: [montecarlo, gpu, simulation, vr_tiered, sessions, datascience]
|
||||
uses_functions: ["gl_loader_cpp_gfx", "gpu_ssbo_cpp_gfx", "gpu_compute_program_cpp_gfx", "gpu_dispatch_cpp_gfx", "gpu_rng_glsl_cpp_gfx"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [GL/gl.h, GL/glext.h, vector, cstdio]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "cpp/functions/datascience/mc_session_sim_gpu.cpp"
|
||||
framework: opengl
|
||||
params:
|
||||
- name: n_sessions
|
||||
desc: "Numero de sesiones independientes a simular en paralelo. 1 thread por sesion. Tipico: 10^4 - 10^6."
|
||||
- name: max_tiers
|
||||
desc: "Maximo de tiers que el simulador soportara (cota dura del shader = 8). Reservar el max que necesites; los runs pueden usar menos."
|
||||
- name: master_seed
|
||||
desc: "Semilla base. Cada sesion deriva su PCG state via SplitMix64 (deterministico bit-exacto)."
|
||||
- name: p
|
||||
desc: "McSessionParams: p_base, cost, start_balance, n_spins, n_tiers (<= max_tiers), tiers_q[n_tiers], tiers_m[n_tiers], mode, mode_p1, mode_p2."
|
||||
- name: out
|
||||
desc: "(summary) float[N*8]; (tier_counts) uint[N*max_tiers]."
|
||||
output: "Tras run, summary tiene 8 floats por sesion: [final_balance, pnl, max_dd, peak, trough, spins, wins, longest_miss]. tier_counts tiene un uint por (sesion, tier_idx). El SSBO de seeds se actualiza para permitir runs subsiguientes que continuen la cadena RNG."
|
||||
---
|
||||
|
||||
# mc_session_sim_gpu
|
||||
|
||||
Kernel GPU que portea el simulador de sesiones de `vr_tiered_lab_v2.jsx`. Cada sesion vive entera en un thread con state local en registros — sin atomics, sin shared memory, paralelismo trivial.
|
||||
|
||||
## Performance esperada
|
||||
|
||||
RTX 3070, 5888 cores: 10^6 sesiones × 10^4 spins = **10^10 ops Monte Carlo** en ~1-3 s. Comparado con CPU single-thread (~60 s) el speedup es 20-60x.
|
||||
|
||||
## Patron de uso
|
||||
|
||||
```cpp
|
||||
auto sim = fn::ds::mc_session_sim_create(/*n_sessions=*/100'000, /*max_tiers=*/5);
|
||||
fn::ds::mc_session_sim_reseed(sim, 0xC0FFEE);
|
||||
|
||||
// Mid vol preset
|
||||
float qs[] = {0.7f, 0.22f, 0.07f, 0.01f};
|
||||
float ms[] = {1.5f, 4.0f, 15.0f, 100.0f};
|
||||
|
||||
fn::ds::McSessionParams p{};
|
||||
p.p_base = 0.20f;
|
||||
p.cost = 1.0f;
|
||||
p.start_balance = 100.0f;
|
||||
p.n_spins = 1000;
|
||||
p.n_tiers = 4;
|
||||
p.tiers_q = qs;
|
||||
p.tiers_m = ms;
|
||||
p.mode = fn::ds::McSessionMode::Pity;
|
||||
p.mode_p1 = 5.0f; // pity soft = 5 misses
|
||||
p.mode_p2 = 20.0f; // pity hard = 20 misses
|
||||
|
||||
fn::ds::mc_session_sim_run(sim, p);
|
||||
|
||||
std::vector<float> summary(100'000 * 8);
|
||||
fn::ds::mc_session_sim_readback_summary(sim, summary.data());
|
||||
|
||||
// summary[sid*8 + 1] = pnl de la sesion sid
|
||||
// histograma de pnl para distribucion -> stats_summary, drawdown, etc.
|
||||
|
||||
fn::ds::mc_session_sim_destroy(sim);
|
||||
```
|
||||
|
||||
## Modes
|
||||
|
||||
| mode | mode_p1 | mode_p2 | Descripcion |
|
||||
|---|---|---|---|
|
||||
| Pure | — | — | p_eff = p_base (sin compensacion) |
|
||||
| Pity | soft | hard | p_eff sube linealmente p_base->1 entre soft y hard miss_streak |
|
||||
| Streak | lambda | — | p_eff = min(1, p_base * (1 + lambda * miss_streak)) |
|
||||
|
||||
## Layout summary[sid * 8 + k]
|
||||
|
||||
| k | Campo |
|
||||
|---|---|
|
||||
| 0 | final_balance |
|
||||
| 1 | pnl (= final - start) |
|
||||
| 2 | max_dd (peak-to-trough absoluto) |
|
||||
| 3 | peak |
|
||||
| 4 | trough |
|
||||
| 5 | spins (= n_spins o menor si rota antes por ruina) |
|
||||
| 6 | wins (numero de hits) |
|
||||
| 7 | longest_miss |
|
||||
|
||||
## Notas
|
||||
|
||||
- max_tiers esta cap a 8 (cota dura del shader, registros locales). Si necesitas mas, usar variante con SSBO de counters por sesion en vez de registros.
|
||||
- Tras `run`, el SSBO de seeds RNG queda actualizado. Llamar `run` otra vez con `n_spins` adicional simula los siguientes spins continuando la cadena (no rearranca). Util para "stream" sesiones largas en chunks.
|
||||
- Para batches mas grandes que VRAM (n_sessions > 10^7 con summary float[8] = 320 MB), partir en sub-batches y readback entre llamadas.
|
||||
- Compatibilidad GL 4.3+ (compute shaders + atomicAdd uint). RTX 3070 expone GL 4.6, sobra.
|
||||
Reference in New Issue
Block a user