perf(viz): graph_renderer Tier 1 (RGBA8 + orphan + frustum cull) + force_layout auto-pause helper

Issue 0049c. Tres optimizaciones internas en graph_renderer.cpp + un helper puro en graph_force_layout para detectar convergencia. API publica intacta — solo cambian el layout interno de los buffers, el shader y los costes por frame.

1. RGBA8 color packing
   - El instance buffer de nodos pasa de (x,y,size,r,g,b,a) 28B a (x,y,size,color_u32) 16B (-43%). Aristas: 24B → 12B/vertex (-50%).
   - Shaders desempaquetan con bit shifts (compatible GL 3.30+, no necesita unpackUnorm4x8 que es 4.20+).
   - Helpers expuestos: pack_rgba8 / unpack_rgba8 / modulate_alpha_rgba8 en graph_renderer.h. Los GraphNode.color y la paleta ya tenian el layout correcto (R en LSB), asi que la CPU ahora pasa el uint32 directo sin convertir a 4 floats por nodo y por frame.

2. Capacity-tracked streaming buffers
   - Sustituye el doble glBufferData de antes por:
       glBufferData(NULL, capacity, STREAM_DRAW)   // orphan + reserva
       glBufferSubData(0, used_bytes, data)        // solo lo usado
   - capacity crece x2 cuando hace falta (inicial 4096 nodos / 8192 vertices de aristas) → reallocaciones en O(log N).
   - Staging CPU (NodeInstance* / EdgeVertex*) reusado entre frames con realloc, no malloc/free per frame.

3. Frustum cull (CPU-side)
   - AABB del viewport en world coords con margen 10%.
   - Aristas: skip si AABB del segmento no intersecta el viewport.
   - Nodos: solo los visibles entran al instance buffer; visible_count es el N que pasa a glDrawArraysInstanced. Pop-in de borde mitigado por el margen.

4. graph_force_layout_should_pause(low_frames, min_consecutive)
   - Helper puro: el caller mantiene el contador, la funcion solo decide si parar. Reemplaza la rama inline en demos_graph.cpp.
   - Test Catch2 con secuencias artificiales.

Tests: test_graph_pack_rgba8 (16401 asserts, 4 cases — roundtrip exhaustivo + alpha modulation + clamp). test_graph_should_pause (3 cases, 14 asserts). Los 29 tests del cpp/tests/ siguen verdes (incluido test_visual con goldens).
Bump versiones:
- graph_renderer 1.1.0 → 1.2.0
- graph_force_layout 1.0.0 → 1.1.0 (tested: true via should_pause test)
2026-04-29 22:17:13 +02:00
parent 97725e0641
commit 427262b892
12 changed files with 437 additions and 146 deletions
@@ -8,7 +8,9 @@
 #include <cstdlib>
 #include <cstring>
 #include <cstdio>
+#include <cstddef>
 #include <cmath>
+#include <algorithm>

 // ---------------------------------------------------------------------------
 // Community palette (ABGR packed, 10 colors)
@@ -26,6 +28,24 @@ static const uint32_t k_palette[10] = {
    0xFF607D8B  // blue-grey
 };

+// ---------------------------------------------------------------------------
+// Per-instance / per-vertex data layouts
+// ---------------------------------------------------------------------------
+// Tier 1 packing: el color va como uint32 unico en lugar de 4 floats. Reduce
+// el bandwidth de upload en 43% para nodos (28 → 16 bytes/instance) y 50%
+// para aristas (24 → 12 bytes/vertex), y elimina la conversion ABGR→4floats
+// en CPU (los uint32 ya tienen el layout de unpackUnorm4x8 en little-endian).
+struct NodeInstance {  // 16 bytes
+    float    x, y;     // world position
+    float    size;     // diameter
+    uint32_t color;    // packed RGBA8
+};
+
+struct EdgeVertex {    // 12 bytes
+    float    x, y;     // world position
+    uint32_t color;    // packed RGBA8 (alpha ya pre-multiplicada por edge_alpha)
+};
+
 // ---------------------------------------------------------------------------
 // Internal struct
 // ---------------------------------------------------------------------------
@@ -43,6 +63,19 @@ struct GraphRenderer {
    unsigned int edge_vao, edge_vbo;
    unsigned int edge_shader;

+    // Streaming buffer capacities (in bytes). Grow x2 cuando used > capacity.
+    // Mantenemos el VBO orphaned con glBufferData(NULL, capacity) y luego
+    // hacemos glBufferSubData con los bytes realmente usados — evita el
+    // sync stall del driver y reduce las reallocaciones a O(log N).
+    size_t node_vbo_capacity;
+    size_t edge_vbo_capacity;
+
+    // CPU staging buffers — se reusan entre frames; crecen igual que el VBO.
+    NodeInstance* node_staging;
+    size_t        node_staging_cap; // en NodeInstances, no bytes
+    EdgeVertex*   edge_staging;
+    size_t        edge_staging_cap; // en EdgeVertex
+
    GraphRendererConfig config;
 };

@@ -51,15 +84,19 @@ struct GraphRenderer {
 // ---------------------------------------------------------------------------

 // Node vertex shader — instanced unit quad
+// a_color es uint32 packeado (R,G,B,A) — unpackUnorm4x8 esta en GLSL 4.20+,
+// pero en core 3.30 lo hacemos manualmente con bit shifts. Eso mantiene
+// compatibilidad con drivers que no exponen GL 4.x sin tener que tocar
+// fn_framework.
 static const char* k_node_vert = R"(
 #version 330 core
 // Quad corners [-0.5, 0.5]
-layout(location = 0) in vec2 a_quad;
+layout(location = 0) in vec2  a_quad;

-// Per-instance: world position, size, RGBA color
+// Per-instance: world position, size, packed RGBA8 color.
 layout(location = 1) in vec2  a_pos;
 layout(location = 2) in float a_size;
-layout(location = 3) in vec4  a_color;
+layout(location = 3) in uint  a_color;

 out vec2  v_uv;
 out vec4  v_color;
@@ -68,17 +105,23 @@ uniform vec2  u_viewport;   // (width, height) in pixels
 uniform float u_scale;      // cam_zoom
 uniform vec2  u_translate;  // (tx, ty) in pixels

+vec4 unpack_rgba8(uint c) {
+    return vec4(
+        float( c        & 0xFFu),
+        float((c >>  8) & 0xFFu),
+        float((c >> 16) & 0xFFu),
+        float((c >> 24) & 0xFFu)
+    ) * (1.0 / 255.0);
+}
+
 void main() {
-    // World -> screen (pixels)
    vec2 screen = a_pos * u_scale + u_translate;
-    // Expand quad by node radius (size = diameter)
    screen += a_quad * a_size * u_scale;
-    // Screen -> NDC
    vec2 ndc = (screen / u_viewport) * 2.0 - 1.0;
-    ndc.y = -ndc.y; // flip Y (screen Y grows downward)
+    ndc.y = -ndc.y;
    gl_Position = vec4(ndc, 0.0, 1.0);
-    v_uv    = a_quad + 0.5; // [0,1]
-    v_color = a_color;
+    v_uv    = a_quad + 0.5;
+    v_color = unpack_rgba8(a_color);
 }
 )";

@@ -94,33 +137,25 @@ uniform float u_outline_px;  // outline width in uv units
 uniform float u_node_px;     // node diameter in pixels (= size * zoom)

 void main() {
-    // SDF circle centered at (0.5, 0.5) in uv space
    float dist = length(v_uv - 0.5);
    float r    = 0.5;
-
-    // Anti-alias edge (in uv units; 1px ~ 1/u_node_px in uv space)
    float fwidth_uv = 1.5 / max(u_node_px, 1.0);
-
    float alpha = 1.0 - smoothstep(r - fwidth_uv, r, dist);
    if (alpha < 0.001) discard;
-
-    // Outline ring
    float outline_uv = u_outline_px / max(u_node_px, 1.0);
    float outline    = smoothstep(r - outline_uv - fwidth_uv, r - outline_uv, dist);
-
-    vec3 fill    = v_color.rgb;
-    vec3 outline_col = mix(fill, vec3(1.0), 0.6); // lighter outline
-    vec3 color   = mix(fill, outline_col, outline);
-
+    vec3 fill        = v_color.rgb;
+    vec3 outline_col = mix(fill, vec3(1.0), 0.6);
+    vec3 color       = mix(fill, outline_col, outline);
    frag_color = vec4(color, v_color.a * alpha);
 }
 )";

-// Edge vertex shader
+// Edge vertex shader (RGBA8 packed)
 static const char* k_edge_vert = R"(
 #version 330 core
 layout(location = 0) in vec2 a_pos;
-layout(location = 1) in vec4 a_color;
+layout(location = 1) in uint a_color;

 out vec4 v_color;

@@ -128,12 +163,21 @@ uniform vec2  u_viewport;
 uniform float u_scale;
 uniform vec2  u_translate;

+vec4 unpack_rgba8(uint c) {
+    return vec4(
+        float( c        & 0xFFu),
+        float((c >>  8) & 0xFFu),
+        float((c >> 16) & 0xFFu),
+        float((c >> 24) & 0xFFu)
+    ) * (1.0 / 255.0);
+}
+
 void main() {
    vec2 screen = a_pos * u_scale + u_translate;
    vec2 ndc    = (screen / u_viewport) * 2.0 - 1.0;
    ndc.y = -ndc.y;
    gl_Position = vec4(ndc, 0.0, 1.0);
-    v_color = a_color;
+    v_color = unpack_rgba8(a_color);
 }
 )";

@@ -188,7 +232,6 @@ static unsigned int link_program(const char* vert_src, const char* frag_src) {
 // FBO helpers
 // ---------------------------------------------------------------------------
 static void create_fbo(GraphRenderer* r) {
-    // Texture
    glGenTextures(1, &r->texture);
    glBindTexture(GL_TEXTURE_2D, r->texture);
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, r->width, r->height, 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr);
@@ -196,13 +239,11 @@ static void create_fbo(GraphRenderer* r) {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glBindTexture(GL_TEXTURE_2D, 0);

-    // Depth renderbuffer
    glGenRenderbuffers(1, &r->rbo);
    glBindRenderbuffer(GL_RENDERBUFFER, r->rbo);
    glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT24, r->width, r->height);
    glBindRenderbuffer(GL_RENDERBUFFER, 0);

-    // FBO
    glGenFramebuffers(1, &r->fbo);
    glBindFramebuffer(GL_FRAMEBUFFER, r->fbo);
    glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, r->texture, 0);
@@ -218,14 +259,16 @@ static void destroy_fbo(GraphRenderer* r) {
 }

 // ---------------------------------------------------------------------------
-// Helper: unpack ABGR uint32 to float RGBA
+// Capacity-tracked streaming helpers
 // ---------------------------------------------------------------------------
-static inline void abgr_to_rgba(uint32_t abgr, float& r, float& g, float& b, float& a) {
-    // ABGR layout: bits 31-24 = A, 23-16 = B, 15-8 = G, 7-0 = R
-    a = ((abgr >> 24) & 0xFF) / 255.0f;
-    b = ((abgr >> 16) & 0xFF) / 255.0f;
-    g = ((abgr >>  8) & 0xFF) / 255.0f;
-    r = ((abgr      ) & 0xFF) / 255.0f;
+// Doblar la capacidad cada vez que el upload supera el VBO. Asi las
+// reallocaciones quedan en O(log N) en el peor caso y en 0 en el regimen
+// estable. Capacidad inicial razonable: 4096 nodos / 8192 vertices de aristas
+// (segun el .md del issue) — la primera llamada paga el redimensionado si hay mas.
+static size_t grow_capacity(size_t current, size_t needed, size_t initial) {
+    size_t cap = current > 0 ? current : initial;
+    while (cap < needed) cap *= 2;
+    return cap;
 }

 // ---------------------------------------------------------------------------
@@ -238,11 +281,17 @@ GraphRenderer* graph_renderer_create(int width, int height, const GraphRendererC
    r->height = height;
    r->config = config;

+    r->node_vbo_capacity = 0;
+    r->edge_vbo_capacity = 0;
+    r->node_staging      = nullptr;
+    r->node_staging_cap  = 0;
+    r->edge_staging      = nullptr;
+    r->edge_staging_cap  = 0;
+
    // --- FBO ---
    create_fbo(r);

    // --- Node VAO ---
-    // Unit quad: 4 vertices, each (x, y) in [-0.5, 0.5]
    static const float quad_verts[8] = {
        -0.5f, -0.5f,
         0.5f, -0.5f,
@@ -260,33 +309,41 @@ GraphRenderer* graph_renderer_create(int width, int height, const GraphRendererC
    glEnableVertexAttribArray(0);
    glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 2 * sizeof(float), (void*)0);

-    // Instance VBO (location 1,2,3 — position, size, color)
+    // Instance VBO — layout: NodeInstance (x, y, size, color_u32)
    glGenBuffers(1, &r->node_instance_vbo);
    glBindBuffer(GL_ARRAY_BUFFER, r->node_instance_vbo);
-    // layout: x, y, size, r, g, b, a  — 7 floats per instance
-    glEnableVertexAttribArray(1); // pos
-    glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 7 * sizeof(float), (void*)0);
+    glEnableVertexAttribArray(1); // pos (2 float)
+    glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE,
+                          sizeof(NodeInstance),
+                          (void*)offsetof(NodeInstance, x));
    glVertexAttribDivisor(1, 1);
-    glEnableVertexAttribArray(2); // size
-    glVertexAttribPointer(2, 1, GL_FLOAT, GL_FALSE, 7 * sizeof(float), (void*)(2 * sizeof(float)));
+    glEnableVertexAttribArray(2); // size (1 float)
+    glVertexAttribPointer(2, 1, GL_FLOAT, GL_FALSE,
+                          sizeof(NodeInstance),
+                          (void*)offsetof(NodeInstance, size));
    glVertexAttribDivisor(2, 1);
-    glEnableVertexAttribArray(3); // color
-    glVertexAttribPointer(3, 4, GL_FLOAT, GL_FALSE, 7 * sizeof(float), (void*)(3 * sizeof(float)));
+    glEnableVertexAttribArray(3); // color (1 uint32) — IPointer, no normalizado
+    glVertexAttribIPointer(3, 1, GL_UNSIGNED_INT,
+                           sizeof(NodeInstance),
+                           (void*)offsetof(NodeInstance, color));
    glVertexAttribDivisor(3, 1);

    glBindVertexArray(0);

    // --- Edge VAO ---
-    // Each edge: 2 vertices x (x, y, r, g, b, a) = 2 * 6 floats
    glGenVertexArrays(1, &r->edge_vao);
    glBindVertexArray(r->edge_vao);

    glGenBuffers(1, &r->edge_vbo);
    glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo);
    glEnableVertexAttribArray(0); // pos
-    glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 6 * sizeof(float), (void*)0);
-    glEnableVertexAttribArray(1); // color
-    glVertexAttribPointer(1, 4, GL_FLOAT, GL_FALSE, 6 * sizeof(float), (void*)(2 * sizeof(float)));
+    glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE,
+                          sizeof(EdgeVertex),
+                          (void*)offsetof(EdgeVertex, x));
+    glEnableVertexAttribArray(1); // color (uint32)
+    glVertexAttribIPointer(1, 1, GL_UNSIGNED_INT,
+                           sizeof(EdgeVertex),
+                           (void*)offsetof(EdgeVertex, color));

    glBindVertexArray(0);

@@ -307,6 +364,8 @@ void graph_renderer_destroy(GraphRenderer* r) {
    glDeleteBuffers(1, &r->edge_vbo);
    glDeleteProgram(r->node_shader);
    glDeleteProgram(r->edge_shader);
+    free(r->node_staging);
+    free(r->edge_staging);
    delete r;
 }

@@ -333,116 +392,147 @@ unsigned int graph_renderer_draw(GraphRenderer* r, const GraphData& graph,
    glBindFramebuffer(GL_FRAMEBUFFER, r->fbo);
    glViewport(0, 0, r->width, r->height);

-    // Clear with bg_color (ABGR)
-    float bg_a, bg_b, bg_g, bg_cr;
-    abgr_to_rgba(r->config.bg_color, bg_cr, bg_g, bg_b, bg_a);
-    glClearColor(bg_cr, bg_g, bg_b, bg_a);
+    // Clear with bg_color (interpreted as RGBA8 packed — same memory layout)
+    uint8_t br, bg, bb, ba;
+    unpack_rgba8(r->config.bg_color, br, bg, bb, ba);
+    glClearColor(br / 255.0f, bg / 255.0f, bb / 255.0f, ba / 255.0f);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

-    // Enable blending for anti-aliasing and transparency
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);

    // View transform: world -> screen pixels
-    // tx = -cam_x * scale + width/2
-    // ty = -cam_y * scale + height/2
    float scale = cam_zoom;
    float tx = -cam_x * scale + (float)r->width  * 0.5f;
    float ty = -cam_y * scale + (float)r->height * 0.5f;

+    // Frustum cull AABB en world coords. Margen del 10% para que un nodo o
+    // arista a punto de entrar en pantalla no haga pop-in al moverse.
+    float half_w = ((float)r->width  * 0.5f) / std::max(scale, 0.0001f);
+    float half_h = ((float)r->height * 0.5f) / std::max(scale, 0.0001f);
+    const float margin = 0.10f;
+    float vx0 = cam_x - half_w * (1.0f + margin);
+    float vx1 = cam_x + half_w * (1.0f + margin);
+    float vy0 = cam_y - half_h * (1.0f + margin);
+    float vy1 = cam_y + half_h * (1.0f + margin);
+
    // ----------------------------------------------------------------
-    // Draw edges
+    // Draw edges (frustum-culled)
    // ----------------------------------------------------------------
    if (graph.edge_count > 0 && graph.edges && graph.nodes) {
-        // Pack: 2 vertices per edge, each vertex = (x, y, r, g, b, a) = 6 floats
-        const int floats_per_edge = 2 * 6;
-        float* edge_buf = (float*)malloc((size_t)graph.edge_count * floats_per_edge * sizeof(float));
-        int vi = 0;
-        for (int i = 0; i < graph.edge_count; ++i) {
-            const GraphEdge& e = graph.edges[i];
-            uint32_t ecol = e.color != 0 ? e.color : 0xFF888888u; // default gray
-            float er, eg, eb, ea;
-            abgr_to_rgba(ecol, er, eg, eb, ea);
-            ea *= r->config.edge_alpha;
-
-            if (e.source < (uint32_t)graph.node_count && e.target < (uint32_t)graph.node_count) {
-                const GraphNode& ns = graph.nodes[e.source];
-                const GraphNode& nt = graph.nodes[e.target];
-
-                // Source vertex
-                edge_buf[vi++] = ns.x; edge_buf[vi++] = ns.y;
-                edge_buf[vi++] = er;   edge_buf[vi++] = eg;
-                edge_buf[vi++] = eb;   edge_buf[vi++] = ea;
-                // Target vertex
-                edge_buf[vi++] = nt.x; edge_buf[vi++] = nt.y;
-                edge_buf[vi++] = er;   edge_buf[vi++] = eg;
-                edge_buf[vi++] = eb;   edge_buf[vi++] = ea;
-            }
+        // Asegurar staging — capacidad maxima posible en este frame es
+        // edge_count * 2 vertices. La realidad post-cull suele ser mucho
+        // menor, pero reservamos para el peor caso y no realocamos por
+        // frame.
+        size_t need_verts = (size_t)graph.edge_count * 2;
+        if (need_verts > r->edge_staging_cap) {
+            size_t new_cap = grow_capacity(r->edge_staging_cap, need_verts, 8192);
+            r->edge_staging = (EdgeVertex*)realloc(r->edge_staging, new_cap * sizeof(EdgeVertex));
+            r->edge_staging_cap = new_cap;
        }

-        glUseProgram(r->edge_shader);
-        glUniform2f(glGetUniformLocation(r->edge_shader, "u_viewport"), (float)r->width, (float)r->height);
-        glUniform1f(glGetUniformLocation(r->edge_shader, "u_scale"),    scale);
-        glUniform2f(glGetUniformLocation(r->edge_shader, "u_translate"), tx, ty);
+        size_t out = 0;
+        for (int i = 0; i < graph.edge_count; ++i) {
+            const GraphEdge& e = graph.edges[i];
+            if (e.source >= (uint32_t)graph.node_count) continue;
+            if (e.target >= (uint32_t)graph.node_count) continue;

-        glLineWidth(r->config.edge_width);
+            const GraphNode& ns = graph.nodes[e.source];
+            const GraphNode& nt = graph.nodes[e.target];

-        glBindVertexArray(r->edge_vao);
-        glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo);
-        // Orphan: descarta el buffer anterior antes de subir el nuevo. Evita
-        // que el driver bloquee esperando que termine el frame previo (sync
-        // stall) y nos da un VBO fresco. Coste: ~0; ganancia: 2-3x upload
-        // throughput en drivers que respetan el hint (Mesa, NVIDIA, AMD).
-        glBufferData(GL_ARRAY_BUFFER, vi * (int)sizeof(float), nullptr, GL_DYNAMIC_DRAW);
-        glBufferData(GL_ARRAY_BUFFER, vi * (int)sizeof(float), edge_buf, GL_DYNAMIC_DRAW);
-        glDrawArrays(GL_LINES, 0, vi / 6);
-        glBindVertexArray(0);
+            // Frustum cull: AABB del segmento (con margen para edges casi
+            // tangentes al viewport). Si el AABB no intersecta el viewport,
+            // skip — la arista no contribuye a ningun pixel visible.
+            float ex0 = std::min(ns.x, nt.x);
+            float ex1 = std::max(ns.x, nt.x);
+            float ey0 = std::min(ns.y, nt.y);
+            float ey1 = std::max(ns.y, nt.y);
+            if (ex1 < vx0 || ex0 > vx1 || ey1 < vy0 || ey0 > vy1) continue;

-        free(edge_buf);
+            uint32_t ecol = e.color != 0 ? e.color : pack_rgba8(0x88, 0x88, 0x88, 0xFF);
+            uint32_t col  = modulate_alpha_rgba8(ecol, r->config.edge_alpha);
+
+            r->edge_staging[out++] = { ns.x, ns.y, col };
+            r->edge_staging[out++] = { nt.x, nt.y, col };
+        }
+
+        if (out > 0) {
+            const size_t used_bytes = out * sizeof(EdgeVertex);
+            if (used_bytes > r->edge_vbo_capacity) {
+                r->edge_vbo_capacity = grow_capacity(r->edge_vbo_capacity, used_bytes,
+                                                     8192 * sizeof(EdgeVertex));
+            }
+
+            glUseProgram(r->edge_shader);
+            glUniform2f(glGetUniformLocation(r->edge_shader, "u_viewport"),
+                        (float)r->width, (float)r->height);
+            glUniform1f(glGetUniformLocation(r->edge_shader, "u_scale"),    scale);
+            glUniform2f(glGetUniformLocation(r->edge_shader, "u_translate"), tx, ty);
+
+            glLineWidth(r->config.edge_width);
+
+            glBindVertexArray(r->edge_vao);
+            glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo);
+            // Orphan: descartamos el contenido previo y damos al driver un
+            // buffer fresco con la capacidad reservada. Despues subimos
+            // solo los bytes realmente usados con BufferSubData — evitamos
+            // el sync stall y reutilizamos la asignacion entre frames
+            // mientras no crezca.
+            glBufferData(GL_ARRAY_BUFFER, (GLsizeiptr)r->edge_vbo_capacity, nullptr, GL_STREAM_DRAW);
+            glBufferSubData(GL_ARRAY_BUFFER, 0, (GLsizeiptr)used_bytes, r->edge_staging);
+            glDrawArrays(GL_LINES, 0, (GLsizei)out);
+            glBindVertexArray(0);
+        }
    }

    // ----------------------------------------------------------------
-    // Draw nodes (instanced quads)
+    // Draw nodes (instanced quads, frustum-culled)
    // ----------------------------------------------------------------
    if (graph.node_count > 0 && graph.nodes) {
-        // Pack: 7 floats per node: x, y, size, r, g, b, a
-        float* node_buf = (float*)malloc((size_t)graph.node_count * 7 * sizeof(float));
-        for (int i = 0; i < graph.node_count; ++i) {
-            const GraphNode& n = graph.nodes[i];
-            uint32_t ncol = n.color != 0 ? n.color : k_palette[n.community % 10];
-            float nr, ng, nb, na;
-            abgr_to_rgba(ncol, nr, ng, nb, na);
-
-            float sz = n.size > 0.0f ? n.size : 4.0f;
-            float* p = node_buf + i * 7;
-            p[0] = n.x; p[1] = n.y; p[2] = sz;
-            p[3] = nr;  p[4] = ng;  p[5] = nb; p[6] = na;
+        if ((size_t)graph.node_count > r->node_staging_cap) {
+            size_t new_cap = grow_capacity(r->node_staging_cap, (size_t)graph.node_count, 4096);
+            r->node_staging = (NodeInstance*)realloc(r->node_staging, new_cap * sizeof(NodeInstance));
+            r->node_staging_cap = new_cap;
        }

-        glUseProgram(r->node_shader);
-        glUniform2f(glGetUniformLocation(r->node_shader, "u_viewport"),  (float)r->width, (float)r->height);
-        glUniform1f(glGetUniformLocation(r->node_shader, "u_scale"),     scale);
-        glUniform2f(glGetUniformLocation(r->node_shader, "u_translate"), tx, ty);
-        glUniform1f(glGetUniformLocation(r->node_shader, "u_outline_px"), r->config.node_outline);
+        size_t visible = 0;
+        for (int i = 0; i < graph.node_count; ++i) {
+            const GraphNode& n = graph.nodes[i];
+            float sz = n.size > 0.0f ? n.size : 4.0f;
+            float half = sz * 0.5f;
+            // AABB del nodo: centro ± half. Skip si fuera del viewport.
+            if (n.x + half < vx0 || n.x - half > vx1) continue;
+            if (n.y + half < vy0 || n.y - half > vy1) continue;

-        glBindVertexArray(r->node_vao);
-        glBindBuffer(GL_ARRAY_BUFFER, r->node_instance_vbo);
-        // Orphan + reupload (ver comentario en edge upload arriba).
-        const GLsizeiptr node_bytes = graph.node_count * 7 * (GLsizeiptr)sizeof(float);
-        glBufferData(GL_ARRAY_BUFFER, node_bytes, nullptr,  GL_DYNAMIC_DRAW);
-        glBufferData(GL_ARRAY_BUFFER, node_bytes, node_buf, GL_DYNAMIC_DRAW);
+            uint32_t ncol = n.color != 0 ? n.color : k_palette[n.community % 10];
+            r->node_staging[visible++] = { n.x, n.y, sz, ncol };
+        }

-        // Draw 4 vertices (triangle strip quad) x node_count instances
-        // Pass per-instance node_px uniform via the average size (approximation)
-        // For exact per-node pixel size we'd need a texture or another approach;
-        // use a uniform average for AA quality — good enough for most graphs.
-        float avg_px = 8.0f * scale; // rough estimate
-        glUniform1f(glGetUniformLocation(r->node_shader, "u_node_px"), avg_px);
+        if (visible > 0) {
+            const size_t used_bytes = visible * sizeof(NodeInstance);
+            if (used_bytes > r->node_vbo_capacity) {
+                r->node_vbo_capacity = grow_capacity(r->node_vbo_capacity, used_bytes,
+                                                     4096 * sizeof(NodeInstance));
+            }

-        glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, graph.node_count);
-        glBindVertexArray(0);
+            glUseProgram(r->node_shader);
+            glUniform2f(glGetUniformLocation(r->node_shader, "u_viewport"),
+                        (float)r->width, (float)r->height);
+            glUniform1f(glGetUniformLocation(r->node_shader, "u_scale"),     scale);
+            glUniform2f(glGetUniformLocation(r->node_shader, "u_translate"), tx, ty);
+            glUniform1f(glGetUniformLocation(r->node_shader, "u_outline_px"), r->config.node_outline);

-        free(node_buf);
+            glBindVertexArray(r->node_vao);
+            glBindBuffer(GL_ARRAY_BUFFER, r->node_instance_vbo);
+            glBufferData(GL_ARRAY_BUFFER, (GLsizeiptr)r->node_vbo_capacity, nullptr, GL_STREAM_DRAW);
+            glBufferSubData(GL_ARRAY_BUFFER, 0, (GLsizeiptr)used_bytes, r->node_staging);
+
+            float avg_px = 8.0f * scale; // estimacion para el AA del SDF
+            glUniform1f(glGetUniformLocation(r->node_shader, "u_node_px"), avg_px);
+
+            glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, (GLsizei)visible);
+            glBindVertexArray(0);
+        }
    }

    // --- Restore GL state ---