feat(cpp/gfx): GPU compute primitives for Monte Carlo (G1-G7)

Stack base de compute shaders OpenGL 4.3 para cargas Monte Carlo intensivas en GPU. Reutiliza el patron de graph_force_layout_gpu (SSBO + compute) y se integra con el resto del registry sin nuevos simbolos en gl_loader (todo lo que se necesita ya estaba expuesto). - gpu_ssbo: lifecycle de Shader Storage Buffer Objects. - gpu_compute_program: compila compute GLSL 4.3 con preamble inyectable (mismo pattern de gl_shader::compile_fragment). - gpu_dispatch: dispatch_1d/2d/3d con ceil(N/local) automatico + barrier helpers (storage, uniform, image, buffer_update, all). - gpu_rng_glsl: PCG32 GLSL (uniform/normal/below) + SplitMix64 seed walkers para sembrar deterministicamente N walkers desde un master seed. - gpu_histogram_1d: SSBO float[N] -> uint[nbins] via atomicAdd. - gpu_histogram_2d: SSBO float[2N] xy-interleaved -> uint[nx*ny] + to_density helper para alimentar heatmap_cpp_viz. - gpu_reduce: workgroup-shared sum/min/max/mean (local 256, partials CPU). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 11:52:08 +02:00
parent 92297e02c5
commit c74fd4ae0d
21 changed files with 1544 additions and 0 deletions
@@ -0,0 +1,137 @@
+#include "gfx/gpu_reduce.h"
+#include "gfx/gl_loader.h"
+#include "gfx/gpu_compute_program.h"
+#include "gfx/gpu_dispatch.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <limits>
+#include <vector>
+
+namespace fn::gfx {
+
+constexpr int kLocal = 256;
+
+// Workgroup-shared tree reduction. Identidad e operador se inyectan via
+// preamble (#define IDENTITY / OP). Cada workgroup escribe 1 partial.
+static const char* k_body = R"glsl(
+layout(std430, binding = 0) readonly buffer In  { float in_data[];  };
+layout(std430, binding = 1)          buffer Out { float partials[]; };
+uniform uint u_count;
+shared float sdata[256];
+
+void main() {
+    uint tid = gl_LocalInvocationID.x;
+    uint i   = gl_GlobalInvocationID.x;
+    sdata[tid] = (i < u_count) ? in_data[i] : IDENTITY;
+    barrier();
+    for (uint s = 128u; s > 0u; s >>= 1u) {
+        if (tid < s) sdata[tid] = OP(sdata[tid], sdata[tid + s]);
+        barrier();
+    }
+    if (tid == 0u) partials[gl_WorkGroupID.x] = sdata[0];
+}
+)glsl";
+
+static unsigned int compile_op(const char* identity, const char* op) {
+    char preamble[256];
+    std::snprintf(preamble, sizeof(preamble),
+                  "#define IDENTITY (%s)\n"
+                  "#define OP(a,b) (%s)\n",
+                  identity, op);
+    auto r = compile_compute(k_body, kLocal, preamble);
+    if (!r.ok) {
+        std::fprintf(stderr, "[gpu_reduce] compile error: %s\n", r.err_msg.c_str());
+        return 0;
+    }
+    return r.program;
+}
+
+GpuReduce gpu_reduce_create(int max_n_samples) {
+    GpuReduce r{};
+    if (max_n_samples <= 0) return r;
+
+    r.p_sum = compile_op("0.0", "(a) + (b)");
+    r.p_min = compile_op("3.4028235e38",  "min((a),(b))");
+    r.p_max = compile_op("-3.4028235e38", "max((a),(b))");
+
+    if (r.p_sum) r.loc_count     = static_cast<unsigned int>(glGetUniformLocation(r.p_sum, "u_count"));
+    if (r.p_min) r.loc_count_min = static_cast<unsigned int>(glGetUniformLocation(r.p_min, "u_count"));
+    if (r.p_max) r.loc_count_max = static_cast<unsigned int>(glGetUniformLocation(r.p_max, "u_count"));
+
+    r.max_groups = (max_n_samples + kLocal - 1) / kLocal;
+    r.partials = ssbo_create(static_cast<std::size_t>(r.max_groups) * sizeof(float),
+                             nullptr, GL_DYNAMIC_COPY);
+    return r;
+}
+
+static float reduce_dispatch(unsigned int program, unsigned int loc_count,
+                             const Ssbo& partials, int max_groups,
+                             const Ssbo& samples, int count, ReduceOp op) {
+    if (program == 0 || count <= 0) {
+        return (op == ReduceOp::Min) ?  std::numeric_limits<float>::infinity()
+             : (op == ReduceOp::Max) ? -std::numeric_limits<float>::infinity()
+             : 0.0f;
+    }
+    glUseProgram(program);
+    ssbo_bind(samples,  0);
+    ssbo_bind(partials, 1);
+    glUniform1ui(static_cast<GLint>(loc_count), static_cast<GLuint>(count));
+    dispatch_1d(count, kLocal);
+    barrier_buffer_update();
+
+    int groups = (count + kLocal - 1) / kLocal;
+    if (groups > max_groups) groups = max_groups;
+    std::vector<float> host(groups);
+    ssbo_readback(partials, 0, static_cast<std::size_t>(groups) * sizeof(float),
+                  host.data());
+
+    if (groups == 0) {
+        return (op == ReduceOp::Min) ?  std::numeric_limits<float>::infinity()
+             : (op == ReduceOp::Max) ? -std::numeric_limits<float>::infinity()
+             : 0.0f;
+    }
+    float acc = host[0];
+    for (int i = 1; i < groups; ++i) {
+        switch (op) {
+            case ReduceOp::Sum: acc += host[i]; break;
+            case ReduceOp::Min: acc = std::min(acc, host[i]); break;
+            case ReduceOp::Max: acc = std::max(acc, host[i]); break;
+        }
+    }
+    return acc;
+}
+
+float gpu_reduce_run(GpuReduce& r, ReduceOp op,
+                     const Ssbo& samples, int count) {
+    switch (op) {
+        case ReduceOp::Sum:
+            return reduce_dispatch(r.p_sum, r.loc_count, r.partials,
+                                   r.max_groups, samples, count, op);
+        case ReduceOp::Min:
+            return reduce_dispatch(r.p_min, r.loc_count_min, r.partials,
+                                   r.max_groups, samples, count, op);
+        case ReduceOp::Max:
+            return reduce_dispatch(r.p_max, r.loc_count_max, r.partials,
+                                   r.max_groups, samples, count, op);
+    }
+    return 0.0f;
+}
+
+float gpu_reduce_mean(GpuReduce& r, const Ssbo& samples, int count) {
+    if (count <= 0) return 0.0f;
+    float s = gpu_reduce_run(r, ReduceOp::Sum, samples, count);
+    return s / static_cast<float>(count);
+}
+
+void gpu_reduce_destroy(GpuReduce& r) {
+    delete_compute_program(r.p_sum);
+    delete_compute_program(r.p_min);
+    delete_compute_program(r.p_max);
+    r.p_sum = r.p_min = r.p_max = 0;
+    ssbo_destroy(r.partials);
+    r.max_groups = 0;
+}
+
+} // namespace fn::gfx