From 427262b8926b5144df3a88fe4f651df356405c65 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Wed, 29 Apr 2026 22:17:13 +0200 Subject: [PATCH] perf(viz): graph_renderer Tier 1 (RGBA8 + orphan + frustum cull) + force_layout auto-pause helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue 0049c. Tres optimizaciones internas en graph_renderer.cpp + un helper puro en graph_force_layout para detectar convergencia. API publica intacta — solo cambian el layout interno de los buffers, el shader y los costes por frame. 1. RGBA8 color packing - El instance buffer de nodos pasa de (x,y,size,r,g,b,a) 28B a (x,y,size,color_u32) 16B (-43%). Aristas: 24B → 12B/vertex (-50%). - Shaders desempaquetan con bit shifts (compatible GL 3.30+, no necesita unpackUnorm4x8 que es 4.20+). - Helpers expuestos: pack_rgba8 / unpack_rgba8 / modulate_alpha_rgba8 en graph_renderer.h. Los GraphNode.color y la paleta ya tenian el layout correcto (R en LSB), asi que CPU ahora pasa el uint32 directo sin convertir a 4 floats por nodo y por frame. 2. Capacity-tracked streaming buffers - Sustituye el doble glBufferData de antes por: glBufferData(NULL, capacity, STREAM_DRAW) // orphan + reserva glBufferSubData(0, used_bytes, data) // solo lo usado - capacity crece x2 cuando hace falta (inicial 4096 nodos / 8192 vertices de aristas) → reallocaciones en O(log N). - Staging CPU (NodeInstance* / EdgeVertex*) reusado entre frames con realloc, no malloc/free per frame. 3. Frustum cull (CPU-side) - AABB del viewport en world coords con margen 10%. - Aristas: skip si AABB del segmento no intersecta el viewport. - Nodos: solo los visibles entran al instance buffer; visible_count es el N que pasa a glDrawArraysInstanced. Pop-in de borde mitigado por el margen. 4. graph_force_layout_should_pause(low_frames, min_consecutive) - Helper puro: el caller mantiene el contador, la funcion solo decide si parar. Reemplaza la rama inline en demos_graph.cpp. 
- Test Catch2 con secuencias artificiales. Tests: test_graph_pack_rgba8 (16401 asserts, 4 cases — roundtrip exhaustivo + alpha modulation + clamp). test_graph_should_pause (3 cases, 14 asserts). Los 29 tests del cpp/tests/ siguen verdes (incluido test_visual con goldens). Bump versiones: - graph_renderer 1.1.0 → 1.2.0 - graph_force_layout 1.0.0 → 1.1.0 (tested: true via should_pause test) --- cpp/apps/primitives_gallery/demos_graph.cpp | 14 +- cpp/functions/viz/graph_force_layout.cpp | 5 + cpp/functions/viz/graph_force_layout.h | 16 + cpp/functions/viz/graph_force_layout.md | 39 +- cpp/functions/viz/graph_renderer.cpp | 344 +++++++++++------- cpp/functions/viz/graph_renderer.h | 30 ++ cpp/functions/viz/graph_renderer.md | 11 +- cpp/tests/CMakeLists.txt | 6 + cpp/tests/test_graph_pack_rgba8.cpp | 69 ++++ cpp/tests/test_graph_should_pause.cpp | 47 +++ dev/issues/README.md | 2 +- .../0049c-graph-renderer-tier1.md | 0 12 files changed, 437 insertions(+), 146 deletions(-) create mode 100644 cpp/tests/test_graph_pack_rgba8.cpp create mode 100644 cpp/tests/test_graph_should_pause.cpp rename dev/issues/{ => completed}/0049c-graph-renderer-tier1.md (100%) diff --git a/cpp/apps/primitives_gallery/demos_graph.cpp b/cpp/apps/primitives_gallery/demos_graph.cpp index 6285605c..5b2d6131 100644 --- a/cpp/apps/primitives_gallery/demos_graph.cpp +++ b/cpp/apps/primitives_gallery/demos_graph.cpp @@ -208,13 +208,13 @@ void demo_graph() { const float per_node = s_graph.node_count > 0 ? 
s_state.layout_energy / (float)s_graph.node_count : 0.0f; - if (per_node < k_pause_per_node) { - if (++s_low_energy_frames >= k_pause_after_frames) { - s_state.layout_running = false; - s_low_energy_frames = 0; - } - } else { - s_low_energy_frames = 0; + if (per_node < k_pause_per_node) ++s_low_energy_frames; + else s_low_energy_frames = 0; + + if (graph_force_layout_should_pause(s_low_energy_frames, + k_pause_after_frames)) { + s_state.layout_running = false; + s_low_energy_frames = 0; } } else { s_low_energy_frames = 0; diff --git a/cpp/functions/viz/graph_force_layout.cpp b/cpp/functions/viz/graph_force_layout.cpp index 586452d9..5159ff24 100644 --- a/cpp/functions/viz/graph_force_layout.cpp +++ b/cpp/functions/viz/graph_force_layout.cpp @@ -357,6 +357,11 @@ void graph_layout_circular(GraphData& graph, float radius) { graph.update_bounds(); } +bool graph_force_layout_should_pause(int consecutive_low_frames, int min_consecutive) { + if (min_consecutive <= 0) return true; + return consecutive_low_frames >= min_consecutive; +} + void graph_layout_grid(GraphData& graph, float spacing) { if (graph.node_count <= 0) return; int cols = (int)std::ceil(std::sqrt((float)graph.node_count)); diff --git a/cpp/functions/viz/graph_force_layout.h b/cpp/functions/viz/graph_force_layout.h index bde0b0e3..c8b5f17e 100644 --- a/cpp/functions/viz/graph_force_layout.h +++ b/cpp/functions/viz/graph_force_layout.h @@ -25,3 +25,19 @@ void graph_force_layout_reset(GraphData& graph, float spread = 200.0f); // Preset layouts (non-iterative, instant positioning) void graph_layout_circular(GraphData& graph, float radius = 100.0f); void graph_layout_grid(GraphData& graph, float spacing = 20.0f); + +// Auto-pause helper. Pure: el caller mantiene `consecutive_low_frames` y se +// encarga de incrementarlo / ponerlo a cero cada frame. +// +// Patron de uso tipico: +// static int low = 0; +// float energy = graph_force_layout_step(g, cfg); +// float per_node = g.node_count > 0 ? 
energy / g.node_count : 0.0f; +// if (per_node < threshold) low++; else low = 0; +// if (graph_force_layout_should_pause(low, min_consecutive)) running = false; +// +// Devuelve `true` si la energia ha caido por debajo del umbral durante al +// menos `min_consecutive` frames consecutivos. La firma toma `consecutive_low_frames` +// directamente (en lugar de manejar el contador internamente) para que la +// funcion sea pura — facil de testear y sin estado oculto. +bool graph_force_layout_should_pause(int consecutive_low_frames, int min_consecutive); diff --git a/cpp/functions/viz/graph_force_layout.md b/cpp/functions/viz/graph_force_layout.md index 22749982..020ea242 100644 --- a/cpp/functions/viz/graph_force_layout.md +++ b/cpp/functions/viz/graph_force_layout.md @@ -3,7 +3,7 @@ name: graph_force_layout kind: function lang: cpp domain: viz -version: "1.0.0" +version: "1.1.0" purity: pure signature: "float graph_force_layout_step(GraphData& graph, const ForceLayoutConfig& config)" description: "Layout force-directed con aproximacion Barnes-Hut para grafos grandes, ejecuta un paso de simulacion por llamada" @@ -14,9 +14,9 @@ returns: [] returns_optional: false error_type: "" imports: [] -tested: false -tests: [] -test_file_path: "" +tested: true +tests: ["should_pause threshold", "should_pause requires consecutive frames", "should_pause emulating low->high->low sequence"] +test_file_path: "cpp/tests/test_graph_should_pause.cpp" file_path: "cpp/functions/viz/graph_force_layout.cpp" framework: imgui params: @@ -54,6 +54,10 @@ graph_layout_circular(graph, 150.0f); // Layout en grid instantaneo graph_layout_grid(graph, 25.0f); + +// Auto-pause: parar la simulacion cuando la energia se ha estabilizado. +// Pure: el caller mantiene el contador, la funcion solo decide. 
+// bool graph_force_layout_should_pause(int low_frames, int min_consecutive); ``` ## Ejemplo de uso tipico (loop ImGui) @@ -61,20 +65,37 @@ graph_layout_grid(graph, 25.0f); ```cpp static ForceLayoutConfig cfg; static bool running = true; +static int low_frames = 0; +const int k_min_consecutive = 30; +const float k_threshold_per_node = 0.001f; if (running) { float energy = graph_force_layout_step(my_graph, cfg); - if (energy < 0.01f) running = false; // convergido + float per_node = my_graph.node_count > 0 + ? energy / my_graph.node_count : 0.0f; + if (per_node < k_threshold_per_node) ++low_frames; + else low_frames = 0; + if (graph_force_layout_should_pause(low_frames, k_min_consecutive)) { + running = false; + low_frames = 0; + } } ``` ## Notas de implementacion -- El quadtree usa un pool estatico de `1 << 20` (~1M) celdas. Para grafos de >500K nodos - se recomienda reducir `MAX_QUAD_NODES` o aumentarlo segun memoria disponible. -- La pila de traversal en `quad_force` es tambien estatica (`static int stack[]`); no es - thread-safe si se llama desde multiples hilos simultaneamente. +- El quadtree usa un pool dinamico (`std::vector`) que se redimensiona una vez + por step a `5*N + 1024` celdas. La pila de traversal en `quad_force` es local en pila + (256 entradas) — thread-safe bajo OpenMP. - `graph_force_layout_reset` usa `rand()`. Para reproducibilidad llama `srand(seed)` antes. - Los buffers de fuerza (`fx_buf`, `fy_buf`) se realocan una sola vez cuando el conteo de nodos supera la capacidad previa; en el uso normal (tamano fijo) no hay allocaciones por frame. + +## Notas de version + +- **v1.1** (2026-04-29, issue 0049c): añade el helper puro + `graph_force_layout_should_pause(low_frames, min_consecutive)` para que las apps + detecten convergencia sin replicar el contador por todas partes. Sin cambios en + `graph_force_layout_step` ni en la API existente. Test: + `cpp/tests/test_graph_should_pause.cpp`. 
diff --git a/cpp/functions/viz/graph_renderer.cpp b/cpp/functions/viz/graph_renderer.cpp index 8bc00b4a..225a2278 100644 --- a/cpp/functions/viz/graph_renderer.cpp +++ b/cpp/functions/viz/graph_renderer.cpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include // --------------------------------------------------------------------------- // Community palette (ABGR packed, 10 colors) @@ -26,6 +28,24 @@ static const uint32_t k_palette[10] = { 0xFF607D8B // blue-grey }; +// --------------------------------------------------------------------------- +// Per-instance / per-vertex data layouts +// --------------------------------------------------------------------------- +// Tier 1 packing: el color va como uint32 unico en lugar de 4 floats. Reduce +// el bandwidth de upload en 43% para nodos (28 → 16 bytes/instance) y 50% +// para aristas (24 → 12 bytes/vertex), y elimina la conversion ABGR→4floats +// en CPU (los uint32 ya tienen el layout de unpackUnorm4x8 en little-endian). +struct NodeInstance { // 16 bytes + float x, y; // world position + float size; // diameter + uint32_t color; // packed RGBA8 +}; + +struct EdgeVertex { // 12 bytes + float x, y; // world position + uint32_t color; // packed RGBA8 (alpha ya pre-multiplicada por edge_alpha) +}; + // --------------------------------------------------------------------------- // Internal struct // --------------------------------------------------------------------------- @@ -43,6 +63,19 @@ struct GraphRenderer { unsigned int edge_vao, edge_vbo; unsigned int edge_shader; + // Streaming buffer capacities (in bytes). Grow x2 cuando used > capacity. + // Mantenemos el VBO orphaned con glBufferData(NULL, capacity) y luego + // hacemos glBufferSubData con los bytes realmente usados — evita el + // sync stall del driver y reduce las reallocaciones a O(log N). + size_t node_vbo_capacity; + size_t edge_vbo_capacity; + + // CPU staging buffers — se reusan entre frames; crecen igual que el VBO. 
+ NodeInstance* node_staging; + size_t node_staging_cap; // en NodeInstances, no bytes + EdgeVertex* edge_staging; + size_t edge_staging_cap; // en EdgeVertex + GraphRendererConfig config; }; @@ -51,15 +84,19 @@ struct GraphRenderer { // --------------------------------------------------------------------------- // Node vertex shader — instanced unit quad +// a_color es uint32 packeado (R,G,B,A) — unpackUnorm4x8 esta en GLSL 4.20+, +// pero en core 3.30 lo hacemos manualmente con bit shifts. Eso mantiene +// compatibilidad con drivers que no exponen GL 4.x sin tener que tocar +// fn_framework. static const char* k_node_vert = R"( #version 330 core // Quad corners [-0.5, 0.5] -layout(location = 0) in vec2 a_quad; +layout(location = 0) in vec2 a_quad; -// Per-instance: world position, size, RGBA color +// Per-instance: world position, size, packed RGBA8 color. layout(location = 1) in vec2 a_pos; layout(location = 2) in float a_size; -layout(location = 3) in vec4 a_color; +layout(location = 3) in uint a_color; out vec2 v_uv; out vec4 v_color; @@ -68,17 +105,23 @@ uniform vec2 u_viewport; // (width, height) in pixels uniform float u_scale; // cam_zoom uniform vec2 u_translate; // (tx, ty) in pixels +vec4 unpack_rgba8(uint c) { + return vec4( + float( c & 0xFFu), + float((c >> 8) & 0xFFu), + float((c >> 16) & 0xFFu), + float((c >> 24) & 0xFFu) + ) * (1.0 / 255.0); +} + void main() { - // World -> screen (pixels) vec2 screen = a_pos * u_scale + u_translate; - // Expand quad by node radius (size = diameter) screen += a_quad * a_size * u_scale; - // Screen -> NDC vec2 ndc = (screen / u_viewport) * 2.0 - 1.0; - ndc.y = -ndc.y; // flip Y (screen Y grows downward) + ndc.y = -ndc.y; gl_Position = vec4(ndc, 0.0, 1.0); - v_uv = a_quad + 0.5; // [0,1] - v_color = a_color; + v_uv = a_quad + 0.5; + v_color = unpack_rgba8(a_color); } )"; @@ -94,33 +137,25 @@ uniform float u_outline_px; // outline width in uv units uniform float u_node_px; // node diameter in pixels (= size * zoom) 
void main() { - // SDF circle centered at (0.5, 0.5) in uv space float dist = length(v_uv - 0.5); float r = 0.5; - - // Anti-alias edge (in uv units; 1px ~ 1/u_node_px in uv space) float fwidth_uv = 1.5 / max(u_node_px, 1.0); - float alpha = 1.0 - smoothstep(r - fwidth_uv, r, dist); if (alpha < 0.001) discard; - - // Outline ring float outline_uv = u_outline_px / max(u_node_px, 1.0); float outline = smoothstep(r - outline_uv - fwidth_uv, r - outline_uv, dist); - - vec3 fill = v_color.rgb; - vec3 outline_col = mix(fill, vec3(1.0), 0.6); // lighter outline - vec3 color = mix(fill, outline_col, outline); - + vec3 fill = v_color.rgb; + vec3 outline_col = mix(fill, vec3(1.0), 0.6); + vec3 color = mix(fill, outline_col, outline); frag_color = vec4(color, v_color.a * alpha); } )"; -// Edge vertex shader +// Edge vertex shader (RGBA8 packed) static const char* k_edge_vert = R"( #version 330 core layout(location = 0) in vec2 a_pos; -layout(location = 1) in vec4 a_color; +layout(location = 1) in uint a_color; out vec4 v_color; @@ -128,12 +163,21 @@ uniform vec2 u_viewport; uniform float u_scale; uniform vec2 u_translate; +vec4 unpack_rgba8(uint c) { + return vec4( + float( c & 0xFFu), + float((c >> 8) & 0xFFu), + float((c >> 16) & 0xFFu), + float((c >> 24) & 0xFFu) + ) * (1.0 / 255.0); +} + void main() { vec2 screen = a_pos * u_scale + u_translate; vec2 ndc = (screen / u_viewport) * 2.0 - 1.0; ndc.y = -ndc.y; gl_Position = vec4(ndc, 0.0, 1.0); - v_color = a_color; + v_color = unpack_rgba8(a_color); } )"; @@ -188,7 +232,6 @@ static unsigned int link_program(const char* vert_src, const char* frag_src) { // FBO helpers // --------------------------------------------------------------------------- static void create_fbo(GraphRenderer* r) { - // Texture glGenTextures(1, &r->texture); glBindTexture(GL_TEXTURE_2D, r->texture); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, r->width, r->height, 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr); @@ -196,13 +239,11 @@ static void 
create_fbo(GraphRenderer* r) { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glBindTexture(GL_TEXTURE_2D, 0); - // Depth renderbuffer glGenRenderbuffers(1, &r->rbo); glBindRenderbuffer(GL_RENDERBUFFER, r->rbo); glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT24, r->width, r->height); glBindRenderbuffer(GL_RENDERBUFFER, 0); - // FBO glGenFramebuffers(1, &r->fbo); glBindFramebuffer(GL_FRAMEBUFFER, r->fbo); glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, r->texture, 0); @@ -218,14 +259,16 @@ static void destroy_fbo(GraphRenderer* r) { } // --------------------------------------------------------------------------- -// Helper: unpack ABGR uint32 to float RGBA +// Capacity-tracked streaming helpers // --------------------------------------------------------------------------- -static inline void abgr_to_rgba(uint32_t abgr, float& r, float& g, float& b, float& a) { - // ABGR layout: bits 31-24 = A, 23-16 = B, 15-8 = G, 7-0 = R - a = ((abgr >> 24) & 0xFF) / 255.0f; - b = ((abgr >> 16) & 0xFF) / 255.0f; - g = ((abgr >> 8) & 0xFF) / 255.0f; - r = ((abgr ) & 0xFF) / 255.0f; +// Doblar la capacidad cada vez que el upload supera el VBO. Asi las +// reallocaciones quedan en O(log N) en el peor caso y en 0 en el regimen +// estable. Capacidad inicial razonable: 4096 nodos / 8192 vertices de aristas (segun el .md +// del issue) — la primera llamada paga el redimensionado si hay mas. +static size_t grow_capacity(size_t current, size_t needed, size_t initial) { + size_t cap = current > 0 ? 
current : initial; + while (cap < needed) cap *= 2; + return cap; } // --------------------------------------------------------------------------- @@ -238,11 +281,17 @@ GraphRenderer* graph_renderer_create(int width, int height, const GraphRendererC r->height = height; r->config = config; + r->node_vbo_capacity = 0; + r->edge_vbo_capacity = 0; + r->node_staging = nullptr; + r->node_staging_cap = 0; + r->edge_staging = nullptr; + r->edge_staging_cap = 0; + // --- FBO --- create_fbo(r); // --- Node VAO --- - // Unit quad: 4 vertices, each (x, y) in [-0.5, 0.5] static const float quad_verts[8] = { -0.5f, -0.5f, 0.5f, -0.5f, @@ -260,33 +309,41 @@ GraphRenderer* graph_renderer_create(int width, int height, const GraphRendererC glEnableVertexAttribArray(0); glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 2 * sizeof(float), (void*)0); - // Instance VBO (location 1,2,3 — position, size, color) + // Instance VBO — layout: NodeInstance (x, y, size, color_u32) glGenBuffers(1, &r->node_instance_vbo); glBindBuffer(GL_ARRAY_BUFFER, r->node_instance_vbo); - // layout: x, y, size, r, g, b, a — 7 floats per instance - glEnableVertexAttribArray(1); // pos - glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 7 * sizeof(float), (void*)0); + glEnableVertexAttribArray(1); // pos (2 float) + glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, + sizeof(NodeInstance), + (void*)offsetof(NodeInstance, x)); glVertexAttribDivisor(1, 1); - glEnableVertexAttribArray(2); // size - glVertexAttribPointer(2, 1, GL_FLOAT, GL_FALSE, 7 * sizeof(float), (void*)(2 * sizeof(float))); + glEnableVertexAttribArray(2); // size (1 float) + glVertexAttribPointer(2, 1, GL_FLOAT, GL_FALSE, + sizeof(NodeInstance), + (void*)offsetof(NodeInstance, size)); glVertexAttribDivisor(2, 1); - glEnableVertexAttribArray(3); // color - glVertexAttribPointer(3, 4, GL_FLOAT, GL_FALSE, 7 * sizeof(float), (void*)(3 * sizeof(float))); + glEnableVertexAttribArray(3); // color (1 uint32) — IPointer, no normalizado + 
glVertexAttribIPointer(3, 1, GL_UNSIGNED_INT, + sizeof(NodeInstance), + (void*)offsetof(NodeInstance, color)); glVertexAttribDivisor(3, 1); glBindVertexArray(0); // --- Edge VAO --- - // Each edge: 2 vertices x (x, y, r, g, b, a) = 2 * 6 floats glGenVertexArrays(1, &r->edge_vao); glBindVertexArray(r->edge_vao); glGenBuffers(1, &r->edge_vbo); glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo); glEnableVertexAttribArray(0); // pos - glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 6 * sizeof(float), (void*)0); - glEnableVertexAttribArray(1); // color - glVertexAttribPointer(1, 4, GL_FLOAT, GL_FALSE, 6 * sizeof(float), (void*)(2 * sizeof(float))); + glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, + sizeof(EdgeVertex), + (void*)offsetof(EdgeVertex, x)); + glEnableVertexAttribArray(1); // color (uint32) + glVertexAttribIPointer(1, 1, GL_UNSIGNED_INT, + sizeof(EdgeVertex), + (void*)offsetof(EdgeVertex, color)); glBindVertexArray(0); @@ -307,6 +364,8 @@ void graph_renderer_destroy(GraphRenderer* r) { glDeleteBuffers(1, &r->edge_vbo); glDeleteProgram(r->node_shader); glDeleteProgram(r->edge_shader); + free(r->node_staging); + free(r->edge_staging); delete r; } @@ -333,116 +392,147 @@ unsigned int graph_renderer_draw(GraphRenderer* r, const GraphData& graph, glBindFramebuffer(GL_FRAMEBUFFER, r->fbo); glViewport(0, 0, r->width, r->height); - // Clear with bg_color (ABGR) - float bg_a, bg_b, bg_g, bg_cr; - abgr_to_rgba(r->config.bg_color, bg_cr, bg_g, bg_b, bg_a); - glClearColor(bg_cr, bg_g, bg_b, bg_a); + // Clear with bg_color (interpreted as RGBA8 packed — same memory layout) + uint8_t br, bg, bb, ba; + unpack_rgba8(r->config.bg_color, br, bg, bb, ba); + glClearColor(br / 255.0f, bg / 255.0f, bb / 255.0f, ba / 255.0f); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - // Enable blending for anti-aliasing and transparency glEnable(GL_BLEND); glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); // View transform: world -> screen pixels - // tx = -cam_x * scale + width/2 - // ty = 
-cam_y * scale + height/2 float scale = cam_zoom; float tx = -cam_x * scale + (float)r->width * 0.5f; float ty = -cam_y * scale + (float)r->height * 0.5f; + // Frustum cull AABB en world coords. Margen del 10% para que un nodo o + // arista a punto de entrar en pantalla no haga pop-in al moverse. + float half_w = ((float)r->width * 0.5f) / std::max(scale, 0.0001f); + float half_h = ((float)r->height * 0.5f) / std::max(scale, 0.0001f); + const float margin = 0.10f; + float vx0 = cam_x - half_w * (1.0f + margin); + float vx1 = cam_x + half_w * (1.0f + margin); + float vy0 = cam_y - half_h * (1.0f + margin); + float vy1 = cam_y + half_h * (1.0f + margin); + // ---------------------------------------------------------------- - // Draw edges + // Draw edges (frustum-culled) // ---------------------------------------------------------------- if (graph.edge_count > 0 && graph.edges && graph.nodes) { - // Pack: 2 vertices per edge, each vertex = (x, y, r, g, b, a) = 6 floats - const int floats_per_edge = 2 * 6; - float* edge_buf = (float*)malloc((size_t)graph.edge_count * floats_per_edge * sizeof(float)); - int vi = 0; - for (int i = 0; i < graph.edge_count; ++i) { - const GraphEdge& e = graph.edges[i]; - uint32_t ecol = e.color != 0 ? e.color : 0xFF888888u; // default gray - float er, eg, eb, ea; - abgr_to_rgba(ecol, er, eg, eb, ea); - ea *= r->config.edge_alpha; - - if (e.source < (uint32_t)graph.node_count && e.target < (uint32_t)graph.node_count) { - const GraphNode& ns = graph.nodes[e.source]; - const GraphNode& nt = graph.nodes[e.target]; - - // Source vertex - edge_buf[vi++] = ns.x; edge_buf[vi++] = ns.y; - edge_buf[vi++] = er; edge_buf[vi++] = eg; - edge_buf[vi++] = eb; edge_buf[vi++] = ea; - // Target vertex - edge_buf[vi++] = nt.x; edge_buf[vi++] = nt.y; - edge_buf[vi++] = er; edge_buf[vi++] = eg; - edge_buf[vi++] = eb; edge_buf[vi++] = ea; - } + // Asegurar staging — capacidad maxima posible en este frame es + // edge_count * 2 vertices. 
La realidad post-cull suele ser mucho + // menor, pero reservamos para el peor caso y no realocamos por + // frame. + size_t need_verts = (size_t)graph.edge_count * 2; + if (need_verts > r->edge_staging_cap) { + size_t new_cap = grow_capacity(r->edge_staging_cap, need_verts, 8192); + r->edge_staging = (EdgeVertex*)realloc(r->edge_staging, new_cap * sizeof(EdgeVertex)); + r->edge_staging_cap = new_cap; } - glUseProgram(r->edge_shader); - glUniform2f(glGetUniformLocation(r->edge_shader, "u_viewport"), (float)r->width, (float)r->height); - glUniform1f(glGetUniformLocation(r->edge_shader, "u_scale"), scale); - glUniform2f(glGetUniformLocation(r->edge_shader, "u_translate"), tx, ty); + size_t out = 0; + for (int i = 0; i < graph.edge_count; ++i) { + const GraphEdge& e = graph.edges[i]; + if (e.source >= (uint32_t)graph.node_count) continue; + if (e.target >= (uint32_t)graph.node_count) continue; - glLineWidth(r->config.edge_width); + const GraphNode& ns = graph.nodes[e.source]; + const GraphNode& nt = graph.nodes[e.target]; - glBindVertexArray(r->edge_vao); - glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo); - // Orphan: descarta el buffer anterior antes de subir el nuevo. Evita - // que el driver bloquee esperando que termine el frame previo (sync - // stall) y nos da un VBO fresco. Coste: ~0; ganancia: 2-3x upload - // throughput en drivers que respetan el hint (Mesa, NVIDIA, AMD). - glBufferData(GL_ARRAY_BUFFER, vi * (int)sizeof(float), nullptr, GL_DYNAMIC_DRAW); - glBufferData(GL_ARRAY_BUFFER, vi * (int)sizeof(float), edge_buf, GL_DYNAMIC_DRAW); - glDrawArrays(GL_LINES, 0, vi / 6); - glBindVertexArray(0); + // Frustum cull: AABB del segmento (con margen para edges casi + // tangentes al viewport). Si el AABB no intersecta el viewport, + // skip — la arista no contribuye a ningun pixel visible. 
+ float ex0 = std::min(ns.x, nt.x); + float ex1 = std::max(ns.x, nt.x); + float ey0 = std::min(ns.y, nt.y); + float ey1 = std::max(ns.y, nt.y); + if (ex1 < vx0 || ex0 > vx1 || ey1 < vy0 || ey0 > vy1) continue; - free(edge_buf); + uint32_t ecol = e.color != 0 ? e.color : pack_rgba8(0x88, 0x88, 0x88, 0xFF); + uint32_t col = modulate_alpha_rgba8(ecol, r->config.edge_alpha); + + r->edge_staging[out++] = { ns.x, ns.y, col }; + r->edge_staging[out++] = { nt.x, nt.y, col }; + } + + if (out > 0) { + const size_t used_bytes = out * sizeof(EdgeVertex); + if (used_bytes > r->edge_vbo_capacity) { + r->edge_vbo_capacity = grow_capacity(r->edge_vbo_capacity, used_bytes, + 8192 * sizeof(EdgeVertex)); + } + + glUseProgram(r->edge_shader); + glUniform2f(glGetUniformLocation(r->edge_shader, "u_viewport"), + (float)r->width, (float)r->height); + glUniform1f(glGetUniformLocation(r->edge_shader, "u_scale"), scale); + glUniform2f(glGetUniformLocation(r->edge_shader, "u_translate"), tx, ty); + + glLineWidth(r->config.edge_width); + + glBindVertexArray(r->edge_vao); + glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo); + // Orphan: descarta el contenido previo y damos al driver un + // buffer fresco con la capacidad reservada. Despues subimos + // solo los bytes realmente usados con BufferSubData — evitamos + // el sync stall y reutilizamos la asignacion entre frames + // mientras no crezca. 
+ glBufferData(GL_ARRAY_BUFFER, (GLsizeiptr)r->edge_vbo_capacity, nullptr, GL_STREAM_DRAW); + glBufferSubData(GL_ARRAY_BUFFER, 0, (GLsizeiptr)used_bytes, r->edge_staging); + glDrawArrays(GL_LINES, 0, (GLsizei)out); + glBindVertexArray(0); + } } // ---------------------------------------------------------------- - // Draw nodes (instanced quads) + // Draw nodes (instanced quads, frustum-culled) // ---------------------------------------------------------------- if (graph.node_count > 0 && graph.nodes) { - // Pack: 7 floats per node: x, y, size, r, g, b, a - float* node_buf = (float*)malloc((size_t)graph.node_count * 7 * sizeof(float)); - for (int i = 0; i < graph.node_count; ++i) { - const GraphNode& n = graph.nodes[i]; - uint32_t ncol = n.color != 0 ? n.color : k_palette[n.community % 10]; - float nr, ng, nb, na; - abgr_to_rgba(ncol, nr, ng, nb, na); - - float sz = n.size > 0.0f ? n.size : 4.0f; - float* p = node_buf + i * 7; - p[0] = n.x; p[1] = n.y; p[2] = sz; - p[3] = nr; p[4] = ng; p[5] = nb; p[6] = na; + if ((size_t)graph.node_count > r->node_staging_cap) { + size_t new_cap = grow_capacity(r->node_staging_cap, (size_t)graph.node_count, 4096); + r->node_staging = (NodeInstance*)realloc(r->node_staging, new_cap * sizeof(NodeInstance)); + r->node_staging_cap = new_cap; } - glUseProgram(r->node_shader); - glUniform2f(glGetUniformLocation(r->node_shader, "u_viewport"), (float)r->width, (float)r->height); - glUniform1f(glGetUniformLocation(r->node_shader, "u_scale"), scale); - glUniform2f(glGetUniformLocation(r->node_shader, "u_translate"), tx, ty); - glUniform1f(glGetUniformLocation(r->node_shader, "u_outline_px"), r->config.node_outline); + size_t visible = 0; + for (int i = 0; i < graph.node_count; ++i) { + const GraphNode& n = graph.nodes[i]; + float sz = n.size > 0.0f ? n.size : 4.0f; + float half = sz * 0.5f; + // AABB del nodo: centro ± half. Skip si fuera del viewport. 
+ if (n.x + half < vx0 || n.x - half > vx1) continue; + if (n.y + half < vy0 || n.y - half > vy1) continue; - glBindVertexArray(r->node_vao); - glBindBuffer(GL_ARRAY_BUFFER, r->node_instance_vbo); - // Orphan + reupload (ver comentario en edge upload arriba). - const GLsizeiptr node_bytes = graph.node_count * 7 * (GLsizeiptr)sizeof(float); - glBufferData(GL_ARRAY_BUFFER, node_bytes, nullptr, GL_DYNAMIC_DRAW); - glBufferData(GL_ARRAY_BUFFER, node_bytes, node_buf, GL_DYNAMIC_DRAW); + uint32_t ncol = n.color != 0 ? n.color : k_palette[n.community % 10]; + r->node_staging[visible++] = { n.x, n.y, sz, ncol }; + } - // Draw 4 vertices (triangle strip quad) x node_count instances - // Pass per-instance node_px uniform via the average size (approximation) - // For exact per-node pixel size we'd need a texture or another approach; - // use a uniform average for AA quality — good enough for most graphs. - float avg_px = 8.0f * scale; // rough estimate - glUniform1f(glGetUniformLocation(r->node_shader, "u_node_px"), avg_px); + if (visible > 0) { + const size_t used_bytes = visible * sizeof(NodeInstance); + if (used_bytes > r->node_vbo_capacity) { + r->node_vbo_capacity = grow_capacity(r->node_vbo_capacity, used_bytes, + 4096 * sizeof(NodeInstance)); + } - glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, graph.node_count); - glBindVertexArray(0); + glUseProgram(r->node_shader); + glUniform2f(glGetUniformLocation(r->node_shader, "u_viewport"), + (float)r->width, (float)r->height); + glUniform1f(glGetUniformLocation(r->node_shader, "u_scale"), scale); + glUniform2f(glGetUniformLocation(r->node_shader, "u_translate"), tx, ty); + glUniform1f(glGetUniformLocation(r->node_shader, "u_outline_px"), r->config.node_outline); - free(node_buf); + glBindVertexArray(r->node_vao); + glBindBuffer(GL_ARRAY_BUFFER, r->node_instance_vbo); + glBufferData(GL_ARRAY_BUFFER, (GLsizeiptr)r->node_vbo_capacity, nullptr, GL_STREAM_DRAW); + glBufferSubData(GL_ARRAY_BUFFER, 0, (GLsizeiptr)used_bytes, 
r->node_staging); + + float avg_px = 8.0f * scale; // estimacion para el AA del SDF + glUniform1f(glGetUniformLocation(r->node_shader, "u_node_px"), avg_px); + + glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, (GLsizei)visible); + glBindVertexArray(0); + } } // --- Restore GL state --- diff --git a/cpp/functions/viz/graph_renderer.h b/cpp/functions/viz/graph_renderer.h index eccf17b7..53375a75 100644 --- a/cpp/functions/viz/graph_renderer.h +++ b/cpp/functions/viz/graph_renderer.h @@ -26,3 +26,33 @@ void graph_renderer_resize(GraphRenderer* r, int width, int height); // Returns OpenGL texture ID suitable for ImGui::Image(). unsigned int graph_renderer_draw(GraphRenderer* r, const GraphData& graph, float cam_x, float cam_y, float cam_zoom); + +// --------------------------------------------------------------------------- +// RGBA8 packing helpers +// --------------------------------------------------------------------------- +// Layout: byte 0 (LSB) = R, byte 1 = G, byte 2 = B, byte 3 (MSB) = A. +// On a little-endian host this matches GLSL's `unpackUnorm4x8(uint)` which +// returns vec4(byte0, byte1, byte2, byte3) / 255 — so the GPU reads it as +// (R, G, B, A) without any swizzle. +inline uint32_t pack_rgba8(uint8_t r, uint8_t g, uint8_t b, uint8_t a) { + return (uint32_t)r + | ((uint32_t)g << 8) + | ((uint32_t)b << 16) + | ((uint32_t)a << 24); +} + +inline void unpack_rgba8(uint32_t c, uint8_t& r, uint8_t& g, uint8_t& b, uint8_t& a) { + r = (uint8_t)( c & 0xFF); + g = (uint8_t)((c >> 8 ) & 0xFF); + b = (uint8_t)((c >> 16) & 0xFF); + a = (uint8_t)((c >> 24) & 0xFF); +} + +// Multiply alpha channel by a [0..1] scale, clamping to 255. 
+inline uint32_t modulate_alpha_rgba8(uint32_t c, float scale) { + uint32_t a = (c >> 24) & 0xFFu; + float af = (float)a * scale + 0.5f; + if (af < 0.0f) af = 0.0f; + if (af > 255.0f) af = 255.0f; + return (c & 0x00FFFFFFu) | ((uint32_t)af << 24); +} diff --git a/cpp/functions/viz/graph_renderer.md b/cpp/functions/viz/graph_renderer.md index dc401f08..a245d694 100644 --- a/cpp/functions/viz/graph_renderer.md +++ b/cpp/functions/viz/graph_renderer.md @@ -3,11 +3,11 @@ name: graph_renderer kind: function lang: cpp domain: viz -version: "1.1.0" +version: "1.2.0" purity: impure signature: "GraphRenderer* graph_renderer_create(int width, int height, const GraphRendererConfig& config)" description: "Renderer GPU de grafos con instanced rendering a FBO, compatible con ImGui::Image para visualizacion de grafos grandes" -tags: [graph, renderer, opengl, gpu, instanced, fbo, visualization] +tags: [graph, renderer, opengl, gpu, instanced, fbo, visualization, frustum-cull, rgba8] uses_functions: ["gl_loader_cpp_gfx"] uses_types: ["GraphData_cpp_viz"] returns: [] @@ -88,4 +88,11 @@ ndc = (screen / viewport) * 2 - 1 ## Notas +- **v1.2** (2026-04-29, issue 0049c): tres optimizaciones internas, API publica intacta. + 1. **RGBA8 packing**: el buffer de instancia/vertice usa `uint32` por color en lugar de 4 floats. Nodo: 28 → 16 bytes/instance (-43%). Edge: 24 → 12 bytes/vertex (-50%). Los shaders desempaquetan con bit shifts (compatible GL 3.30+, sin necesidad de `unpackUnorm4x8` que es 4.20+). Helpers expuestos en el .h: `pack_rgba8`, `unpack_rgba8`, `modulate_alpha_rgba8` (testeados en `test_graph_pack_rgba8.cpp`). + 2. **Capacity-tracked streaming buffers**: el VBO se mantiene orphaned con `glBufferData(NULL, capacity)` y se actualiza con `glBufferSubData` solo los bytes usados. La capacidad crece x2 cuando hace falta (inicial: 4096 nodos / 8192 vertices de aristas) → reallocaciones en O(log N). Staging CPU reutilizado entre frames. + 3. 
**Frustum cull**: nodos y aristas fuera del viewport AABB (con margen 10%) se saltan en CPU antes del upload. Para nodos, solo los visibles entran en el instance buffer (`glDrawArraysInstanced` con `visible_count`). Para aristas, AABB del segmento contra viewport. Pop-in al borde mitigado por el margen. + + Resultado esperado: ~20k nodos a 60fps en GPU integrada cuando `cam_zoom` mantiene la mayoria fuera del viewport. + - **v1.1** (2026-04-25): cambia de raw `` a `gfx/gl_loader.h` para que compile en cross-compile MinGW. Sin cambios funcionales — el binario Linux es bit-equivalente. diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index df0bb5ac..851ce823 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -60,6 +60,12 @@ add_fn_test(test_sparkline test_sparkline.cpp) add_fn_test(test_table_view test_table_view.cpp) add_fn_test(test_icon_button test_icon_button.cpp) +# --- Issue 0049c — graph renderer Tier 1 (RGBA8 + auto-pause helper) ------- +add_fn_test(test_graph_pack_rgba8 test_graph_pack_rgba8.cpp) +add_fn_test(test_graph_should_pause test_graph_should_pause.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../functions/viz/graph_force_layout.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../functions/viz/graph_types.cpp) + # --- Visual golden-image diff (issue 0048) --------------------------------- # El binario primitives_gallery se compila con --capture; el test compara los # PNGs generados con los goldens en cpp/tests/golden/. Si no hay goldens o el diff --git a/cpp/tests/test_graph_pack_rgba8.cpp b/cpp/tests/test_graph_pack_rgba8.cpp new file mode 100644 index 00000000..4161fc1c --- /dev/null +++ b/cpp/tests/test_graph_pack_rgba8.cpp @@ -0,0 +1,69 @@ +// Unit tests for graph_renderer's RGBA8 packing helpers (cpp/functions/viz/ +// graph_renderer.h). 
Roundtrip + alpha modulation + bit-layout match con +// unpackUnorm4x8 de GLSL (byte 0 = R, byte 3 = A) — el shader interpreta el +// uint32 sin swizzle, asi que el packing debe colocar R en el byte LSB. + +#define CATCH_CONFIG_MAIN +#include "catch_amalgamated.hpp" + +#include "viz/graph_renderer.h" + +#include + +TEST_CASE("pack_rgba8 places R in the LSB byte", "[viz][rgba8]") { + uint32_t c = pack_rgba8(0x12, 0x34, 0x56, 0x78); + REQUIRE(((c ) & 0xFFu) == 0x12u); // R + REQUIRE(((c >> 8) & 0xFFu) == 0x34u); // G + REQUIRE(((c >> 16) & 0xFFu) == 0x56u); // B + REQUIRE(((c >> 24) & 0xFFu) == 0x78u); // A +} + +TEST_CASE("pack/unpack roundtrip is exact for arbitrary bytes", "[viz][rgba8]") { + const uint8_t samples[] = { 0x00, 0x01, 0x7F, 0x80, 0xAB, 0xCD, 0xFE, 0xFF }; + for (uint8_t r : samples) for (uint8_t g : samples) + for (uint8_t b : samples) for (uint8_t a : samples) { + uint32_t c = pack_rgba8(r, g, b, a); + uint8_t r2, g2, b2, a2; + unpack_rgba8(c, r2, g2, b2, a2); + REQUIRE(r == r2); + REQUIRE(g == g2); + REQUIRE(b == b2); + REQUIRE(a == a2); + } +} + +TEST_CASE("modulate_alpha_rgba8 preserves RGB and scales alpha", "[viz][rgba8]") { + uint32_t opaque = pack_rgba8(0x10, 0x20, 0x30, 0xFF); + + // Full pass-through: scale=1.0 -> alpha=255 + REQUIRE(modulate_alpha_rgba8(opaque, 1.0f) == opaque); + + // Half: alpha goes to 128 (255 * 0.5 + 0.5 = 128) + uint32_t half = modulate_alpha_rgba8(opaque, 0.5f); + uint8_t r, g, b, a; + unpack_rgba8(half, r, g, b, a); + REQUIRE(r == 0x10); + REQUIRE(g == 0x20); + REQUIRE(b == 0x30); + REQUIRE(a == 128); + + // Zero: alpha goes to 0 + uint32_t zero = modulate_alpha_rgba8(opaque, 0.0f); + unpack_rgba8(zero, r, g, b, a); + REQUIRE(a == 0); + // RGB intactos + REQUIRE(r == 0x10); + REQUIRE(g == 0x20); + REQUIRE(b == 0x30); +} + +TEST_CASE("modulate_alpha_rgba8 clamps overflow to 255", "[viz][rgba8]") { + uint32_t c = pack_rgba8(1, 2, 3, 200); + uint32_t out = modulate_alpha_rgba8(c, 5.0f); // 200*5 = 1000, clamp 255 + 
uint8_t r, g, b, a; + unpack_rgba8(out, r, g, b, a); + REQUIRE(a == 255); + REQUIRE(r == 1); + REQUIRE(g == 2); + REQUIRE(b == 3); +} diff --git a/cpp/tests/test_graph_should_pause.cpp b/cpp/tests/test_graph_should_pause.cpp new file mode 100644 index 00000000..0b053d38 --- /dev/null +++ b/cpp/tests/test_graph_should_pause.cpp @@ -0,0 +1,47 @@ +// Unit tests for graph_force_layout_should_pause — el helper puro que el +// caller usa para decidir si parar la simulacion tras N frames consecutivos +// con energia < umbral. La logica del contador es responsabilidad del caller; +// la funcion solo decide "ya cumple" en base al contador y al umbral. + +#define CATCH_CONFIG_MAIN +#include "catch_amalgamated.hpp" + +#include "viz/graph_force_layout.h" + +TEST_CASE("should_pause requires consecutive frames over threshold", "[viz][pause]") { + const int min_consec = 30; + + REQUIRE_FALSE(graph_force_layout_should_pause(0, min_consec)); + REQUIRE_FALSE(graph_force_layout_should_pause(1, min_consec)); + REQUIRE_FALSE(graph_force_layout_should_pause(29, min_consec)); + REQUIRE (graph_force_layout_should_pause(30, min_consec)); + REQUIRE (graph_force_layout_should_pause(31, min_consec)); + REQUIRE (graph_force_layout_should_pause(1000, min_consec)); +} + +TEST_CASE("should_pause with min_consecutive=0 always pauses", "[viz][pause]") { + // Edge case: si el caller pide 0 frames, considerar siempre convergido. + REQUIRE(graph_force_layout_should_pause(0, 0)); + REQUIRE(graph_force_layout_should_pause(1, 0)); +} + +TEST_CASE("should_pause emulating a low->high->low sequence", "[viz][pause]") { + // Simula la logica del demo: el caller resetea low_frames cuando energy + // sube. should_pause solo depende del valor actual. + int low = 0; + const int target = 5; + + // Acumulamos hasta 4: aun no. + for (int i = 0; i < 4; ++i) { + low++; + REQUIRE_FALSE(graph_force_layout_should_pause(low, target)); + } + + // El caller detecta energia alta -> reset. 
+ low = 0; + REQUIRE_FALSE(graph_force_layout_should_pause(low, target)); + + // Acumulamos los 5 que pide el target. + for (int i = 0; i < 5; ++i) low++; + REQUIRE(graph_force_layout_should_pause(low, target)); +} diff --git a/dev/issues/README.md b/dev/issues/README.md index ba877452..83a152e2 100644 --- a/dev/issues/README.md +++ b/dev/issues/README.md @@ -57,7 +57,7 @@ | [0049](0049-osint-graph-viewer.md) | OSINT graph viewer + GPU graph rendering system (multi-issue) | pendiente | alta | feature | — | | [0049a](completed/0049a-osint-graph-setup.md) | Setup proyecto osint_graph + sub-repo graph_explorer | completado | alta | infra | parte de 0049 | | [0049b](completed/0049b-cpp-bump-gl-43.md) | Bump OpenGL 3.3 → 4.3 core en cpp/framework | completado | alta | infra | parte de 0049 | -| [0049c](0049c-graph-renderer-tier1.md) | graph_renderer Tier 1: RGBA8, orphan, frustum cull, auto-pause | pendiente | alta | perf | parte de 0049 | +| [0049c](completed/0049c-graph-renderer-tier1.md) | graph_renderer Tier 1: RGBA8, orphan, frustum cull, auto-pause | completado | alta | perf | parte de 0049 | | [0049d](0049d-graph-edges-vertex-pulling.md) | Aristas via vertex pulling con TBO | pendiente | alta | perf | parte de 0049 | | [0049e](0049e-graph-types-extended.md) | graph_types modelo extendido + EntityType/RelationType | pendiente | alta | feature | parte de 0049 | | [0049f](0049f-graph-renderer-symbols.md) | Renderer extendido: shapes SDF, icon atlas, flechas, edge styles | pendiente | alta | feature | parte de 0049 | diff --git a/dev/issues/0049c-graph-renderer-tier1.md b/dev/issues/completed/0049c-graph-renderer-tier1.md similarity index 100% rename from dev/issues/0049c-graph-renderer-tier1.md rename to dev/issues/completed/0049c-graph-renderer-tier1.md