c74fd4ae0d
Stack base de compute shaders OpenGL 4.3 para cargas Monte Carlo intensivas en GPU. Reutiliza el patron de graph_force_layout_gpu (SSBO + compute) y se integra con el resto del registry sin nuevos simbolos en gl_loader (todo lo que se necesita ya estaba expuesto). - gpu_ssbo: lifecycle de Shader Storage Buffer Objects. - gpu_compute_program: compila compute GLSL 4.3 con preamble inyectable (mismo pattern de gl_shader::compile_fragment). - gpu_dispatch: dispatch_1d/2d/3d con ceil(N/local) automatico + barrier helpers (storage, uniform, image, buffer_update, all). - gpu_rng_glsl: PCG32 GLSL (uniform/normal/below) + SplitMix64 seed walkers para sembrar deterministicamente N walkers desde un master seed. - gpu_histogram_1d: SSBO float[N] -> uint[nbins] via atomicAdd. - gpu_histogram_2d: SSBO float[2N] xy-interleaved -> uint[nx*ny] + to_density helper para alimentar heatmap_cpp_viz. - gpu_reduce: workgroup-shared sum/min/max/mean (local 256, partials CPU). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
132 lines
4.8 KiB
C++
132 lines
4.8 KiB
C++
#include "gfx/gpu_histogram_2d.h"
|
|
#include "gfx/gl_loader.h"
|
|
#include "gfx/gpu_compute_program.h"
|
|
#include "gfx/gpu_dispatch.h"
|
|
|
|
#include <cstdio>
|
|
#include <vector>
|
|
|
|
namespace fn::gfx {
|
|
|
|
// GLSL body of the accumulation kernel; it is handed to compile_compute() in
// gpu_histogram_2d_create().
//
// Samples are stored as float[2 * count], xy-interleaved, and read back as
// (samples[2*i], samples[2*i+1]). Using loose floats instead of a vec2 array
// avoids having to think about vec2 padding in std430 (in GL a vec2 is 8
// bytes, so it would work, but loose floats are more portable when the data
// is produced on the CPU).
//
// Each invocation maps one sample to a cell of the u_nx * u_ny grid and bumps
// that cell with atomicAdd. Samples outside [u_min, u_min + range) on either
// axis are skipped. The range test is written in its positive form so that a
// NaN coordinate is skipped as well: on hardware with IEEE comparison
// semantics every comparison against NaN is false, so the negated form
// (`tx < 0.0 || tx >= 1.0 ...`) would let NaN fall through to an undefined
// float -> uint conversion and count it in an arbitrary bin.
static const char* k_accum_body = R"glsl(
layout(std430, binding = 0) readonly buffer Samples { float samples[]; };
layout(std430, binding = 1) coherent buffer Bins    { uint  bins[];    };

uniform uint u_count;
uniform uint u_nx;
uniform uint u_ny;
uniform vec2 u_min;        // (xmin, ymin)
uniform vec2 u_inv_range;  // (1/xrange, 1/yrange)

void main() {
    uint i = gl_GlobalInvocationID.x;
    if (i >= u_count) return;

    float x = samples[2u * i + 0u];
    float y = samples[2u * i + 1u];
    float tx = (x - u_min.x) * u_inv_range.x;
    float ty = (y - u_min.y) * u_inv_range.y;

    // Positive-form test: NaN fails every comparison and is therefore skipped.
    bool inside = tx >= 0.0 && tx < 1.0 && ty >= 0.0 && ty < 1.0;
    if (!inside) return;

    uint bx = uint(tx * float(u_nx));
    uint by = uint(ty * float(u_ny));
    // tx * nx can round up to nx for tx just below 1.0; clamp to the last cell.
    if (bx >= u_nx) bx = u_nx - 1u;
    if (by >= u_ny) by = u_ny - 1u;

    atomicAdd(bins[by * u_nx + bx], 1u);
}
)glsl";
|
|
|
|
// Creates a 2D histogram with nx * ny bins: compiles the accumulation kernel,
// caches its uniform locations and allocates a zero-filled bin buffer.
//
// Returns a value-initialised GpuHistogram2D (program == 0, no bin buffer)
// when nx or ny is not positive, when the kernel fails to compile (the
// compiler message is printed to stderr), or when the bin buffer could not be
// created. In the last case the already-compiled program is released so the
// caller never receives a half-built handle.
GpuHistogram2D gpu_histogram_2d_create(int nx, int ny) {
    GpuHistogram2D h{};
    if (nx <= 0 || ny <= 0) return h;

    // NOTE(review): 64 is presumably the workgroup local size; it has to stay
    // in sync with the 64 passed to dispatch_1d in gpu_histogram_2d_accumulate.
    auto r = compile_compute(k_accum_body, 64, "");
    if (!r.ok) {
        std::fprintf(stderr, "[gpu_histogram_2d] compile error: %s\n",
                     r.err_msg.c_str());
        return h;
    }

    h.program = r.program;

    // Locations are stored as unsigned in the handle and cast back to GLint
    // when the uniforms are set in gpu_histogram_2d_accumulate.
    h.loc_count     = static_cast<unsigned int>(glGetUniformLocation(h.program, "u_count"));
    h.loc_nx        = static_cast<unsigned int>(glGetUniformLocation(h.program, "u_nx"));
    h.loc_ny        = static_cast<unsigned int>(glGetUniformLocation(h.program, "u_ny"));
    h.loc_min       = static_cast<unsigned int>(glGetUniformLocation(h.program, "u_min"));
    h.loc_inv_range = static_cast<unsigned int>(glGetUniformLocation(h.program, "u_inv_range"));

    h.nx = nx;
    h.ny = ny;

    // Widen before multiplying so nx * ny cannot overflow int.
    h.bins = ssbo_create(static_cast<std::size_t>(nx) *
                             static_cast<std::size_t>(ny) * sizeof(unsigned int),
                         nullptr, GL_DYNAMIC_COPY);

    // id == 0 is how the rest of this file recognises a missing buffer. Without
    // bins the handle is unusable, so release the program and report failure
    // instead of returning a handle that accumulate() would still dispatch on.
    if (h.bins.id == 0) {
        std::fprintf(stderr,
                     "[gpu_histogram_2d] failed to create %d x %d bin buffer\n",
                     nx, ny);
        delete_compute_program(h.program);
        return GpuHistogram2D{};
    }

    gpu_histogram_2d_clear(h);
    return h;
}
|
|
|
|
// Resets every bin of `h` to zero by uploading a zero-filled array of
// nx * ny unsigned ints. Does nothing when the histogram has no bin buffer.
void gpu_histogram_2d_clear(GpuHistogram2D& h) {
    if (h.bins.id == 0) return;

    std::size_t bin_count = static_cast<std::size_t>(h.nx) *
                            static_cast<std::size_t>(h.ny);
    std::vector<unsigned int> zeros(bin_count, 0u);

    // Same barrier gpu_histogram_2d_readback issues before touching the bin
    // buffer from the CPU side: it orders this upload after any shader writes
    // from an earlier accumulate, so a pending atomicAdd cannot land on top of
    // the freshly cleared bins.
    // NOTE(review): assumes barrier_buffer_update() wraps
    // glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT) - confirm in gpu_dispatch.
    barrier_buffer_update();

    ssbo_upload(h.bins, 0, bin_count * sizeof(unsigned int), zeros.data());
}
|
|
|
|
// Adds `count` samples to the histogram by running the accumulation kernel.
//
//   h           histogram created by gpu_histogram_2d_create
//   samples_xy  buffer of xy-interleaved floats; the kernel reads
//               samples[0 .. 2*count)
//   count       number of (x, y) pairs to process
//   xmin..ymax  binning range; the kernel skips samples outside
//               [xmin, xmax) x [ymin, ymax)
//
// Bins are not cleared first, so repeated calls keep accumulating.
//
// The call is a no-op when the histogram has no program or no bin buffer,
// when the sample buffer id is 0, when count is not positive, or when either
// range is empty, inverted or NaN.
void gpu_histogram_2d_accumulate(GpuHistogram2D& h,
                                 const Ssbo& samples_xy,
                                 int count,
                                 float xmin, float xmax,
                                 float ymin, float ymax) {
    if (h.program == 0 || h.bins.id == 0 || samples_xy.id == 0 || count <= 0) {
        return;
    }

    const float x_range = xmax - xmin;
    const float y_range = ymax - ymin;
    // Written as !(r > 0) rather than (r <= 0) so that a NaN range, for which
    // both comparisons are false, is rejected too.
    if (!(x_range > 0.0f) || !(y_range > 0.0f)) return;

    glUseProgram(h.program);
    ssbo_bind(samples_xy, 0);  // binding 0: Samples block of the kernel
    ssbo_bind(h.bins, 1);      // binding 1: Bins block of the kernel

    glUniform1ui(static_cast<GLint>(h.loc_count), static_cast<GLuint>(count));
    glUniform1ui(static_cast<GLint>(h.loc_nx), static_cast<GLuint>(h.nx));
    glUniform1ui(static_cast<GLint>(h.loc_ny), static_cast<GLuint>(h.ny));
    glUniform2f(static_cast<GLint>(h.loc_min), xmin, ymin);
    // The kernel multiplies by the reciprocal instead of dividing per sample.
    glUniform2f(static_cast<GLint>(h.loc_inv_range),
                1.0f / x_range, 1.0f / y_range);

    // 64 mirrors the value given to compile_compute in gpu_histogram_2d_create.
    dispatch_1d(count, 64);
    // NOTE(review): presumably a shader-storage memory barrier so later
    // dispatches see these bin updates - confirm in gpu_dispatch.
    barrier_storage();
}
|
|
|
|
void gpu_histogram_2d_readback(const GpuHistogram2D& h, unsigned int* out) {
|
|
if (h.bins.id == 0 || out == nullptr) return;
|
|
barrier_buffer_update();
|
|
std::size_t total = static_cast<std::size_t>(h.nx) *
|
|
static_cast<std::size_t>(h.ny);
|
|
ssbo_readback(h.bins, 0, total * sizeof(unsigned int), out);
|
|
}
|
|
|
|
// Converts raw bin counts into densities normalised by the largest bin.
//
//   counts       nx * ny bin counters
//   nx, ny       grid dimensions; both must be positive
//   out_density  receives nx * ny floats in [0, 1]
//
// The fullest bin maps to exactly 1.0f. If every bin is empty the output is
// all zeros. The call is a no-op (output untouched) when either pointer is
// null or a dimension is not positive.
void gpu_histogram_2d_to_density(const unsigned int* counts, int nx, int ny,
                                 float* out_density) {
    if (counts == nullptr || out_density == nullptr || nx <= 0 || ny <= 0) return;

    // Widen before multiplying so nx * ny cannot overflow int.
    const std::size_t total = static_cast<std::size_t>(nx) *
                              static_cast<std::size_t>(ny);

    unsigned int peak = 0u;
    for (std::size_t i = 0; i < total; ++i) {
        if (counts[i] > peak) peak = counts[i];
    }

    if (peak == 0u) {
        // Nothing was counted: avoid dividing by zero and emit a flat field.
        for (std::size_t i = 0; i < total; ++i) out_density[i] = 0.0f;
        return;
    }

    // Divide directly instead of multiplying by a pre-rounded reciprocal: the
    // quotient is rounded once, so the peak bin is exactly 1.0f and no value
    // can drift away from the true ratio by a second rounding step.
    const float peak_f = static_cast<float>(peak);
    for (std::size_t i = 0; i < total; ++i) {
        out_density[i] = static_cast<float>(counts[i]) / peak_f;
    }
}
|
|
|
|
// Releases the compute program and the bin buffer owned by `h`, then zeroes
// the program handle and the grid dimensions.
void gpu_histogram_2d_destroy(GpuHistogram2D& h) {
    // GPU objects first, in the same order as before: program, then bins.
    delete_compute_program(h.program);
    ssbo_destroy(h.bins);

    // Reset the plain fields.
    h.program = 0;
    h.nx = h.ny = 0;
}
|
|
|
|
} // namespace fn::gfx
|