feat(cpp/gfx): GPU compute primitives for Monte Carlo (G1-G7)

Stack base de compute shaders OpenGL 4.3 para cargas Monte Carlo intensivas
en GPU. Reutiliza el patron de graph_force_layout_gpu (SSBO + compute) y se
integra con el resto del registry sin nuevos simbolos en gl_loader (todo lo
que se necesita ya estaba expuesto).

- gpu_ssbo: lifecycle de Shader Storage Buffer Objects.
- gpu_compute_program: compila compute GLSL 4.3 con preamble inyectable
  (mismo pattern de gl_shader::compile_fragment).
- gpu_dispatch: dispatch_1d/2d/3d con ceil(N/local) automatico + barrier
  helpers (storage, uniform, image, buffer_update, all).
- gpu_rng_glsl: PCG32 GLSL (uniform/normal/below) + SplitMix64 seed walkers
  para sembrar deterministicamente N walkers desde un master seed.
- gpu_histogram_1d: SSBO float[N] -> uint[nbins] via atomicAdd.
- gpu_histogram_2d: SSBO float[2N] xy-interleaved -> uint[nx*ny] +
  to_density helper para alimentar heatmap_cpp_viz.
- gpu_reduce: workgroup-shared sum/min/max/mean (local 256, partials CPU).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 11:52:08 +02:00
parent 92297e02c5
commit c74fd4ae0d
21 changed files with 1544 additions and 0 deletions
+137
View File
@@ -0,0 +1,137 @@
#include "gfx/gpu_reduce.h"
#include "gfx/gl_loader.h"
#include "gfx/gpu_compute_program.h"
#include "gfx/gpu_dispatch.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>
namespace fn::gfx {
constexpr int kLocal = 256;
// Workgroup-shared tree reduction. Identidad e operador se inyectan via
// preamble (#define IDENTITY / OP). Cada workgroup escribe 1 partial.
static const char* k_body = R"glsl(
layout(std430, binding = 0) readonly buffer In { float in_data[]; };
layout(std430, binding = 1) buffer Out { float partials[]; };
uniform uint u_count;
shared float sdata[256];
void main() {
uint tid = gl_LocalInvocationID.x;
uint i = gl_GlobalInvocationID.x;
sdata[tid] = (i < u_count) ? in_data[i] : IDENTITY;
barrier();
for (uint s = 128u; s > 0u; s >>= 1u) {
if (tid < s) sdata[tid] = OP(sdata[tid], sdata[tid + s]);
barrier();
}
if (tid == 0u) partials[gl_WorkGroupID.x] = sdata[0];
}
)glsl";
static unsigned int compile_op(const char* identity, const char* op) {
char preamble[256];
std::snprintf(preamble, sizeof(preamble),
"#define IDENTITY (%s)\n"
"#define OP(a,b) (%s)\n",
identity, op);
auto r = compile_compute(k_body, kLocal, preamble);
if (!r.ok) {
std::fprintf(stderr, "[gpu_reduce] compile error: %s\n", r.err_msg.c_str());
return 0;
}
return r.program;
}
GpuReduce gpu_reduce_create(int max_n_samples) {
GpuReduce r{};
if (max_n_samples <= 0) return r;
r.p_sum = compile_op("0.0", "(a) + (b)");
r.p_min = compile_op("3.4028235e38", "min((a),(b))");
r.p_max = compile_op("-3.4028235e38", "max((a),(b))");
if (r.p_sum) r.loc_count = static_cast<unsigned int>(glGetUniformLocation(r.p_sum, "u_count"));
if (r.p_min) r.loc_count_min = static_cast<unsigned int>(glGetUniformLocation(r.p_min, "u_count"));
if (r.p_max) r.loc_count_max = static_cast<unsigned int>(glGetUniformLocation(r.p_max, "u_count"));
r.max_groups = (max_n_samples + kLocal - 1) / kLocal;
r.partials = ssbo_create(static_cast<std::size_t>(r.max_groups) * sizeof(float),
nullptr, GL_DYNAMIC_COPY);
return r;
}
static float reduce_dispatch(unsigned int program, unsigned int loc_count,
const Ssbo& partials, int max_groups,
const Ssbo& samples, int count, ReduceOp op) {
if (program == 0 || count <= 0) {
return (op == ReduceOp::Min) ? std::numeric_limits<float>::infinity()
: (op == ReduceOp::Max) ? -std::numeric_limits<float>::infinity()
: 0.0f;
}
glUseProgram(program);
ssbo_bind(samples, 0);
ssbo_bind(partials, 1);
glUniform1ui(static_cast<GLint>(loc_count), static_cast<GLuint>(count));
dispatch_1d(count, kLocal);
barrier_buffer_update();
int groups = (count + kLocal - 1) / kLocal;
if (groups > max_groups) groups = max_groups;
std::vector<float> host(groups);
ssbo_readback(partials, 0, static_cast<std::size_t>(groups) * sizeof(float),
host.data());
if (groups == 0) {
return (op == ReduceOp::Min) ? std::numeric_limits<float>::infinity()
: (op == ReduceOp::Max) ? -std::numeric_limits<float>::infinity()
: 0.0f;
}
float acc = host[0];
for (int i = 1; i < groups; ++i) {
switch (op) {
case ReduceOp::Sum: acc += host[i]; break;
case ReduceOp::Min: acc = std::min(acc, host[i]); break;
case ReduceOp::Max: acc = std::max(acc, host[i]); break;
}
}
return acc;
}
float gpu_reduce_run(GpuReduce& r, ReduceOp op,
const Ssbo& samples, int count) {
switch (op) {
case ReduceOp::Sum:
return reduce_dispatch(r.p_sum, r.loc_count, r.partials,
r.max_groups, samples, count, op);
case ReduceOp::Min:
return reduce_dispatch(r.p_min, r.loc_count_min, r.partials,
r.max_groups, samples, count, op);
case ReduceOp::Max:
return reduce_dispatch(r.p_max, r.loc_count_max, r.partials,
r.max_groups, samples, count, op);
}
return 0.0f;
}
float gpu_reduce_mean(GpuReduce& r, const Ssbo& samples, int count) {
if (count <= 0) return 0.0f;
float s = gpu_reduce_run(r, ReduceOp::Sum, samples, count);
return s / static_cast<float>(count);
}
void gpu_reduce_destroy(GpuReduce& r) {
delete_compute_program(r.p_sum);
delete_compute_program(r.p_min);
delete_compute_program(r.p_max);
r.p_sum = r.p_min = r.p_max = 0;
ssbo_destroy(r.partials);
r.max_groups = 0;
}
} // namespace fn::gfx