feat(cpp/gfx): GPU compute primitives for Monte Carlo (G1-G7)
Stack base de compute shaders OpenGL 4.3 para cargas Monte Carlo intensivas en GPU. Reutiliza el patron de graph_force_layout_gpu (SSBO + compute) y se integra con el resto del registry sin nuevos simbolos en gl_loader (todo lo que se necesita ya estaba expuesto). - gpu_ssbo: lifecycle de Shader Storage Buffer Objects. - gpu_compute_program: compila compute GLSL 4.3 con preamble inyectable (mismo pattern de gl_shader::compile_fragment). - gpu_dispatch: dispatch_1d/2d/3d con ceil(N/local) automatico + barrier helpers (storage, uniform, image, buffer_update, all). - gpu_rng_glsl: PCG32 GLSL (uniform/normal/below) + SplitMix64 seed walkers para sembrar deterministicamente N walkers desde un master seed. - gpu_histogram_1d: SSBO float[N] -> uint[nbins] via atomicAdd. - gpu_histogram_2d: SSBO float[2N] xy-interleaved -> uint[nx*ny] + to_density helper para alimentar heatmap_cpp_viz. - gpu_reduce: workgroup-shared sum/min/max/mean (local 256, partials CPU). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,119 @@
|
||||
#include "gfx/gpu_histogram_1d.h"
|
||||
#include "gfx/gl_loader.h"
|
||||
#include "gfx/gpu_compute_program.h"
|
||||
#include "gfx/gpu_dispatch.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
|
||||
namespace fn::gfx {
|
||||
|
||||
// Pass 1 (GLSL compute body): zeroes the histogram, one invocation per bin.
// Invocations whose global index is at or beyond u_nbins write nothing.
// Nothing in this file dispatches this source at present:
// gpu_histogram_1d_create only mentions it to silence the unused-variable
// warning, and clearing is done by uploading zeros from the host.
static const char* k_clear_body = R"glsl(
layout(std430, binding = 1) buffer Bins { uint bins[]; };
uniform uint u_nbins;
void main() {
    uint i = gl_GlobalInvocationID.x;
    if (i < u_nbins) bins[i] = 0u;
}
)glsl";
|
||||
|
||||
// Pass 2 (GLSL compute body): one invocation per sample.
//
// The normalized position is t = (x - u_min) * u_inv_range and the bin index
// is uint(t * u_nbins), i.e. floor for the non-negative t that reach it.
// Samples outside the range (t < 0 or t >= 1) are discarded, and so are NaN
// samples: every ordered comparison with NaN is false, so without the explicit
// isnan() test a NaN would slip past the range check and be converted to an
// unspecified bin index.
//
// Each accepted sample performs one atomicAdd on its bin. Contention stays low
// unless the distribution is concentrated in very few bins (unlikely for real
// workloads); if that ever matters, per-workgroup shared-memory bins are the
// intended future optimization.
static const char* k_accum_body = R"glsl(
layout(std430, binding = 0) readonly buffer Samples { float samples[]; };
layout(std430, binding = 1) coherent buffer Bins { uint bins[]; };
uniform uint u_count;
uniform uint u_nbins;
uniform float u_min;
uniform float u_inv_range;
void main() {
    uint i = gl_GlobalInvocationID.x;
    if (i >= u_count) return;
    float x = samples[i];
    float t = (x - u_min) * u_inv_range;  // [0, 1) when inside the range
    // Drop NaN explicitly: NaN fails both range comparisons below.
    if (isnan(t) || t < 0.0 || t >= 1.0) return;
    uint b = uint(t * float(u_nbins));
    // Guard against float rounding pushing t * nbins up to exactly nbins.
    if (b >= u_nbins) b = u_nbins - 1u;
    atomicAdd(bins[b], 1u);
}
)glsl";
|
||||
|
||||
/// Creates a 1D GPU histogram with `nbins` bins.
///
/// Compiles the accumulate compute program, caches its uniform locations,
/// allocates the bins SSBO (one unsigned int per bin) and zeroes it.
///
/// @param nbins Number of bins; values <= 0 yield an empty handle.
/// @return A ready-to-use histogram, or a value-initialized handle when
///         `nbins` is invalid, the shader fails to compile, or the bins
///         buffer cannot be allocated. Failures are reported on stderr.
GpuHistogram1D gpu_histogram_1d_create(int nbins) {
    GpuHistogram1D h{};
    if (nbins <= 0) return h;

    // Only the "accumulate" program is built. Clearing is done by re-uploading
    // zeros (see gpu_histogram_1d_clear), which is simpler than a second
    // program and has comparable throughput for moderate bin counts
    // (<= 65536). The local size 64 must match the one passed to dispatch_1d
    // in gpu_histogram_1d_accumulate.
    auto r = compile_compute(k_accum_body, 64, "");
    if (!r.ok) {
        std::fprintf(stderr, "[gpu_histogram_1d] compile error: %s\n",
                     r.err_msg.c_str());
        return h;
    }

    h.program = r.program;
    // Locations are stored as unsigned in the handle and cast back to GLint
    // at the point of use.
    h.loc_count = static_cast<unsigned int>(glGetUniformLocation(h.program, "u_count"));
    h.loc_nbins = static_cast<unsigned int>(glGetUniformLocation(h.program, "u_nbins"));
    h.loc_min = static_cast<unsigned int>(glGetUniformLocation(h.program, "u_min"));
    h.loc_inv_range = static_cast<unsigned int>(glGetUniformLocation(h.program, "u_inv_range"));

    h.bins = ssbo_create(static_cast<std::size_t>(nbins) * sizeof(unsigned int),
                         nullptr, GL_DYNAMIC_COPY);
    // An id of 0 is what the rest of this file treats as "no buffer".
    // NOTE(review): assumes ssbo_create reports failure that way - confirm.
    // Without this check the caller would receive a handle with a live program
    // but no bins buffer.
    if (h.bins.id == 0) {
        std::fprintf(stderr,
                     "[gpu_histogram_1d] failed to allocate bins buffer (%d bins)\n",
                     nbins);
        delete_compute_program(h.program);
        return GpuHistogram1D{};
    }
    h.nbins = nbins;

    // Start from all-zero bins.
    gpu_histogram_1d_clear(h);
    (void)k_clear_body;  // reserved for a future shared-memory optimization
    return h;
}
|
||||
|
||||
/// Resets every bin of `h` to zero by uploading a zero-filled host buffer.
///
/// Does nothing when the histogram has no bins buffer or no bins.
///
/// @param h Histogram whose bins are cleared.
void gpu_histogram_1d_clear(GpuHistogram1D& h) {
    if (h.bins.id == 0 || h.nbins <= 0) return;

    // Order the upload after any shader writes (atomicAdd from a previous
    // accumulate dispatch) that may still be in flight; otherwise late
    // increments could land on top of the freshly written zeros. This mirrors
    // what gpu_histogram_1d_readback does before touching the buffer.
    // NOTE(review): presumably barrier_buffer_update issues
    // glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT) - confirm in gpu_dispatch.
    barrier_buffer_update();

    const std::size_t bin_count = static_cast<std::size_t>(h.nbins);
    const std::vector<unsigned int> zeros(bin_count, 0u);
    ssbo_upload(h.bins, 0, bin_count * sizeof(unsigned int), zeros.data());
}
|
||||
|
||||
/// Adds `count` samples from `samples` into the bins of `h`.
///
/// Binds `samples` at SSBO binding 0 and the bins at binding 1, sets the
/// shader uniforms, dispatches one invocation per sample and then issues a
/// storage barrier. Counts are added on top of the current bin contents; call
/// gpu_histogram_1d_clear first for a fresh histogram.
///
/// The call is a no-op when the histogram is not fully created, the sample
/// buffer is invalid, `count` <= 0, or the range is empty, inverted, NaN or
/// infinite.
///
/// @param h         Histogram to accumulate into.
/// @param samples   Buffer of floats read by the shader (one per sample).
/// @param count     Number of samples to process.
/// @param range_min Lower bound of the histogram range (inclusive).
/// @param range_max Upper bound of the histogram range (exclusive).
void gpu_histogram_1d_accumulate(GpuHistogram1D& h,
                                 const Ssbo& samples,
                                 int count,
                                 float range_min,
                                 float range_max) {
    if (h.program == 0 || count <= 0) return;
    // Same validity checks as clear/readback: never dispatch a shader that
    // writes bins when there is no bins buffer, or reads from no buffer.
    if (h.bins.id == 0 || h.nbins <= 0 || samples.id == 0) return;

    const float range = range_max - range_min;
    // Negated '>' so that a NaN range is rejected together with empty and
    // inverted ranges (NaN <= 0 is false and would otherwise pass).
    if (!(range > 0.0f)) return;
    const float inv_range = 1.0f / range;
    // An infinite range makes the scale exactly zero, which would pile every
    // sample into bin 0; treat it as invalid.
    if (!(inv_range > 0.0f)) return;

    glUseProgram(h.program);
    ssbo_bind(samples, 0);
    ssbo_bind(h.bins, 1);

    // Locations are stored as unsigned in the handle; cast back to GLint.
    glUniform1ui(static_cast<GLint>(h.loc_count), static_cast<GLuint>(count));
    glUniform1ui(static_cast<GLint>(h.loc_nbins), static_cast<GLuint>(h.nbins));
    glUniform1f(static_cast<GLint>(h.loc_min), range_min);
    glUniform1f(static_cast<GLint>(h.loc_inv_range), inv_range);

    // Local size 64 must match the value given to compile_compute in
    // gpu_histogram_1d_create.
    dispatch_1d(count, 64);
    barrier_storage();
}
|
||||
|
||||
/// Copies the bin counters of `h` into `out`.
///
/// Issues a buffer-update barrier and then reads `h.nbins` unsigned ints from
/// offset 0 of the bins buffer. Does nothing when the histogram has no bins
/// buffer, has no bins, or `out` is null.
///
/// @param h   Histogram to read from.
/// @param out Destination array; must have room for `h.nbins` unsigned ints.
void gpu_histogram_1d_readback(const GpuHistogram1D& h, unsigned int* out) {
    const bool has_bins = (h.bins.id != 0) && (h.nbins > 0);
    if (!has_bins || out == nullptr) {
        return;
    }

    // Make prior shader writes visible to the buffer read that follows.
    barrier_buffer_update();

    const std::size_t byte_count =
        static_cast<std::size_t>(h.nbins) * sizeof(unsigned int);
    ssbo_readback(h.bins, 0, byte_count, out);
}
|
||||
|
||||
/// Releases the GPU resources held by `h`.
///
/// Deletes the compute program and the bins buffer, then zeroes `program` and
/// `nbins` so that later accumulate, clear and readback calls on the same
/// handle return early.
///
/// @param h Histogram to tear down.
void gpu_histogram_1d_destroy(GpuHistogram1D& h) {
    // Compute program first, then forget the stale handle.
    delete_compute_program(h.program);
    h.program = 0;

    // Bins buffer next; a bin count of zero marks the histogram as empty.
    ssbo_destroy(h.bins);
    h.nbins = 0;
}
|
||||
|
||||
} // namespace fn::gfx
|
||||
Reference in New Issue
Block a user