#include "gfx/gpu_reduce.h"
#include "gfx/gl_loader.h"
#include "gfx/gpu_compute_program.h"
#include "gfx/gpu_dispatch.h"

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

namespace fn::gfx {

/// Workgroup size used both for compiling and for dispatching the shaders.
constexpr int kLocal = 256;

// The GLSL body below hard-codes `shared float sdata[256]` and starts its
// tree reduction at a stride of 128; both must change together with kLocal.
static_assert(kLocal == 256,
              "k_body hard-codes a 256-wide shared array and a 128 start stride");

// Workgroup-shared tree reduction. The identity element and the binary
// operator are injected through the preamble (#define IDENTITY / OP).
// Each workgroup writes exactly one partial result.
static const char* k_body = R"glsl(
layout(std430, binding = 0) readonly buffer In  { float in_data[]; };
layout(std430, binding = 1)          buffer Out { float partials[]; };
uniform uint u_count;
shared float sdata[256];

void main() {
    uint tid = gl_LocalInvocationID.x;
    uint i   = gl_GlobalInvocationID.x;
    sdata[tid] = (i < u_count) ? in_data[i] : IDENTITY;
    barrier();
    for (uint s = 128u; s > 0u; s >>= 1u) {
        if (tid < s) sdata[tid] = OP(sdata[tid], sdata[tid + s]);
        barrier();
    }
    if (tid == 0u) partials[gl_WorkGroupID.x] = sdata[0];
}
)glsl";

/// Number of kLocal-sized workgroups needed to cover `n` elements.
/// Written as division + remainder so that it cannot overflow for large `n`
/// (the usual `(n + kLocal - 1) / kLocal` form overflows near INT_MAX).
static int group_count(int n) {
    if (n <= 0) return 0;
    return n / kLocal + ((n % kLocal != 0) ? 1 : 0);
}

/// Number of samples that can actually be reduced given a partials buffer
/// sized for `max_groups` workgroups. Returns 0 for non-positive inputs.
static int clamp_to_capacity(int count, int max_groups) {
    if (count <= 0 || max_groups <= 0) return 0;
    const long long capacity = static_cast<long long>(max_groups) * kLocal;
    // When count > capacity, capacity < count <= INT_MAX, so the cast is safe.
    return (count > capacity) ? static_cast<int>(capacity) : count;
}

/// Value returned when there is nothing to reduce: +inf for Min, -inf for
/// Max, 0 otherwise.
static float empty_result(ReduceOp op) {
    if (op == ReduceOp::Min) return std::numeric_limits<float>::infinity();
    if (op == ReduceOp::Max) return -std::numeric_limits<float>::infinity();
    return 0.0f;
}

/// Compiles k_body with IDENTITY / OP(a,b) defined from the given GLSL
/// snippets.
/// @param identity GLSL expression used for out-of-range lanes.
/// @param op       GLSL expression in terms of `a` and `b`.
/// @return the program handle from compile_compute, or 0 on failure.
static unsigned int compile_op(const char* identity, const char* op) {
    char preamble[256];
    const int written = std::snprintf(preamble, sizeof(preamble),
                                      "#define IDENTITY (%s)\n"
                                      "#define OP(a,b) (%s)\n",
                                      identity, op);
    // Refuse to compile a silently truncated preamble.
    if (written < 0 || static_cast<std::size_t>(written) >= sizeof(preamble)) {
        std::fprintf(stderr, "[gpu_reduce] compile error: %s\n",
                     "preamble does not fit in buffer");
        return 0;
    }

    auto r = compile_compute(k_body, kLocal, preamble);
    if (!r.ok) {
        std::fprintf(stderr, "[gpu_reduce] compile error: %s\n", r.err_msg.c_str());
        return 0;
    }
    return r.program;
}

/// Creates the three reduction programs (sum / min / max) and a partials
/// buffer with one float per workgroup needed for `max_n_samples` samples.
/// Returns a value-initialized GpuReduce when `max_n_samples <= 0`.
/// A program that fails to compile is left as 0; reductions using it return
/// the empty result.
GpuReduce gpu_reduce_create(int max_n_samples) {
    GpuReduce r{};
    if (max_n_samples <= 0) return r;

    r.p_sum = compile_op("0.0", "(a) + (b)");
    r.p_min = compile_op("3.4028235e38", "min((a),(b))");
    r.p_max = compile_op("-3.4028235e38", "max((a),(b))");

    if (r.p_sum)
        r.loc_count = static_cast<unsigned int>(glGetUniformLocation(r.p_sum, "u_count"));
    if (r.p_min)
        r.loc_count_min = static_cast<unsigned int>(glGetUniformLocation(r.p_min, "u_count"));
    if (r.p_max)
        r.loc_count_max = static_cast<unsigned int>(glGetUniformLocation(r.p_max, "u_count"));

    r.max_groups = group_count(max_n_samples);
    r.partials = ssbo_create(static_cast<std::size_t>(r.max_groups) * sizeof(float),
                             nullptr, GL_DYNAMIC_COPY);
    return r;
}

/// Runs one reduction pass on the GPU and folds the per-workgroup partials
/// on the host.
///
/// @param program    compute program for `op` (0 means "not available").
/// @param loc_count  location of the `u_count` uniform in `program`.
/// @param partials   output buffer holding `max_groups` floats.
/// @param max_groups capacity of `partials`, in workgroups.
/// @param samples    input buffer of floats.
/// @param count      number of samples requested.
/// @param op         which reduction `program` implements.
/// @return the reduced value, or the empty result (+inf / -inf / 0) when
///         there is nothing to reduce.
///
/// If `count` exceeds `max_groups * kLocal`, only the first
/// `max_groups * kLocal` samples are reduced and a warning is logged.
/// Dispatching the full count would make the shader write past the end of
/// `partials`.
static float reduce_dispatch(unsigned int program, unsigned int loc_count,
                             const Ssbo& partials, int max_groups,
                             const Ssbo& samples, int count, ReduceOp op) {
    const int n = clamp_to_capacity(count, max_groups);
    if (program == 0 || n <= 0) return empty_result(op);

    if (n < count) {
        std::fprintf(stderr,
                     "[gpu_reduce] count %d exceeds capacity %d; reducing first %d samples only\n",
                     count, n, n);
    }

    glUseProgram(program);
    ssbo_bind(samples, 0);
    ssbo_bind(partials, 1);
    glUniform1ui(static_cast<int>(loc_count), static_cast<unsigned int>(n));
    dispatch_1d(n, kLocal);
    // Presumably makes the shader's writes visible to the readback below.
    barrier_buffer_update();

    // n >= 1 and n <= max_groups * kLocal, so 1 <= groups <= max_groups.
    const int groups = group_count(n);
    std::vector<float> host(static_cast<std::size_t>(groups));
    ssbo_readback(partials, 0, host.size() * sizeof(float), host.data());

    float acc = host[0];
    for (int i = 1; i < groups; ++i) {
        const float v = host[static_cast<std::size_t>(i)];
        switch (op) {
            case ReduceOp::Sum: acc += v; break;
            case ReduceOp::Min: acc = std::min(acc, v); break;
            case ReduceOp::Max: acc = std::max(acc, v); break;
        }
    }
    return acc;
}

/// Reduces the first `count` floats of `samples` with `op`.
/// Returns +inf (Min), -inf (Max) or 0 (Sum) when `count <= 0` or the
/// program for `op` is unavailable; returns 0 for an unrecognized `op`.
float gpu_reduce_run(GpuReduce& r, ReduceOp op, const Ssbo& samples, int count) {
    switch (op) {
        case ReduceOp::Sum:
            return reduce_dispatch(r.p_sum, r.loc_count, r.partials, r.max_groups,
                                   samples, count, op);
        case ReduceOp::Min:
            return reduce_dispatch(r.p_min, r.loc_count_min, r.partials, r.max_groups,
                                   samples, count, op);
        case ReduceOp::Max:
            return reduce_dispatch(r.p_max, r.loc_count_max, r.partials, r.max_groups,
                                   samples, count, op);
    }
    return 0.0f;
}

/// Arithmetic mean of the first `count` floats of `samples`.
/// Returns 0 when there is nothing to reduce. If `count` exceeds the
/// capacity `r` was created with, the mean is taken over the samples that
/// were actually summed, not over the requested `count`.
float gpu_reduce_mean(GpuReduce& r, const Ssbo& samples, int count) {
    const int n = clamp_to_capacity(count, r.max_groups);
    if (n <= 0) return 0.0f;
    const float sum = gpu_reduce_run(r, ReduceOp::Sum, samples, count);
    return sum / static_cast<float>(n);
}

/// Releases the three programs and the partials buffer and zeroes the
/// corresponding handles / capacity in `r`.
void gpu_reduce_destroy(GpuReduce& r) {
    delete_compute_program(r.p_sum);
    delete_compute_program(r.p_min);
    delete_compute_program(r.p_max);
    r.p_sum = r.p_min = r.p_max = 0;
    ssbo_destroy(r.partials);
    r.max_groups = 0;
}

}  // namespace fn::gfx