perf(viz): graph_renderer edges via TBO + vertex pulling (issue 0049d)

El buffer de aristas pasa a estatico (16B/arista: source, target, color,
flags) y solo se vuelve a subir cuando cambia el grafo. Las posiciones de los
nodos viven en un Texture Buffer Object (RG32F) actualizado por frame; el
vertex shader hace texelFetch con gl_VertexID & 1 para elegir endpoint.
Draw call: glDrawArraysInstanced(GL_LINES, 0, 2, edge_count) con divisor=1.

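Para 100k aristas: el upload de aristas (hasta 2.4 MB/frame: 2 vertices x 12 B
por arista) baja a 0 en regimen estable; por frame solo se suben las posiciones
de los nodos al TBO (8 B/nodo).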
edge_alpha pasa a uniform; la pre-multiplicacion en CPU desaparece. GLSL
sigue en 330 core (samplerBuffer/texelFetch estan en 1.40+).

gl_loader gana glBufferSubData, glVertexAttribIPointer y glTexBuffer (en
Linux ya estaban via GL_GLEXT_PROTOTYPES; ahora estan disponibles tambien
en MinGW/Windows).

Tests: nuevo test_graph_edge_static valida el layout de 16B y el packing
RGBA8 del fallback. test_visual sigue verde — render visualmente identico.

Bump graph_renderer 1.2.0 -> 1.3.0.
This commit is contained in:
2026-04-29 22:32:38 +02:00
parent b156942cea
commit daf491cd99
8 changed files with 298 additions and 86 deletions
+210 -83
View File
@@ -41,9 +41,15 @@ struct NodeInstance { // 16 bytes
uint32_t color; // packed RGBA8
};
struct EdgeVertex { // 12 bytes
float x, y; // world position
uint32_t color; // packed RGBA8 (alpha ya pre-multiplicada por edge_alpha)
// Tier 2 (issue 0049d): edges are drawn via vertex pulling. The edge buffer
// is static — only `(source_idx, target_idx, color, flags)` per edge,
// 16 bytes — and is re-uploaded only when the graph changes. The vertex
// shader fetches the endpoint positions from an RG32F TBO, which IS updated
// every frame.
struct EdgeStatic { // 16 bytes
uint32_t source; // index into nodes
uint32_t target; // index into nodes
uint32_t color; // packed RGBA8 (not pre-multiplied — the shader applies edge_alpha)
uint32_t flags; // reserved for future arrows/styles
};
// ---------------------------------------------------------------------------
@@ -59,22 +65,44 @@ struct GraphRenderer {
unsigned int node_vao, node_quad_vbo, node_instance_vbo;
unsigned int node_shader;
// Edge rendering (lines)
// Edge rendering (vertex pulling — issue 0049d)
// edge_vao : VAO con atributos por-instancia (divisor=1) leyendo de edge_static_vbo
// edge_vbo : buffer estatico (uno por grafo) con (source, target, color, flags)
// node_pos_buf / node_pos_tex : TBO RG32F que el vertex shader muestrea via texelFetch
unsigned int edge_vao, edge_vbo;
unsigned int edge_shader;
unsigned int node_pos_buf;
unsigned int node_pos_tex;
int edge_u_viewport_loc;
int edge_u_scale_loc;
int edge_u_translate_loc;
int edge_u_alpha_loc;
int edge_u_node_pos_loc;
// Streaming buffer capacities (in bytes). Grow x2 cuando used > capacity.
// Mantenemos el VBO orphaned con glBufferData(NULL, capacity) y luego
// hacemos glBufferSubData con los bytes realmente usados — evita el
// sync stall del driver y reduce las reallocaciones a O(log N).
size_t node_vbo_capacity;
size_t edge_vbo_capacity;
size_t node_pos_capacity; // bytes del TBO RG32F
size_t edge_static_capacity; // bytes del buffer estatico de aristas
// CPU staging buffers — se reusan entre frames; crecen igual que el VBO.
NodeInstance* node_staging;
size_t node_staging_cap; // en NodeInstances, no bytes
EdgeVertex* edge_staging;
size_t edge_staging_cap; // en EdgeVertex
float* node_pos_staging; // 2 floats (x,y) por nodo
size_t node_pos_staging_cap; // en floats
EdgeStatic* edge_static_staging;
size_t edge_static_staging_cap; // en EdgeStatic
// Cache para detectar cambios del grafo y reuploadear el edge_vbo
// estatico solo entonces. Identificamos el grafo por (puntero, count);
// basta para los flujos actuales (graph_viewport recrea el array al
// recargar). Cuando GraphData gane un campo `revision` se sustituira.
const void* cached_edges_ptr;
int cached_edge_count; // edges del grafo en el ultimo upload
int cached_edges_drawn; // edges realmente subidos (post-filtro)
bool edges_uploaded;
GraphRendererConfig config;
};
@@ -151,17 +179,33 @@ void main() {
}
)";
// Edge vertex shader (RGBA8 packed)
// Edge vertex shader — vertex pulling (issue 0049d).
// El buffer de aristas es estatico: solo indices y color. Las posiciones
// vienen del TBO `u_node_pos` (RG32F, vec2 por nodo). gl_VertexID indica si
// dibujamos el endpoint source (0) o target (1). Asi eliminamos el upload
// de `2 vertices × 12 bytes` por arista y frame que dominaba el coste de aristas.
//
// Nota: usamos divisor=1 en los 4 atributos y `glDrawArraysInstanced(LINES,
// 0, 2, edge_count)` — cada instancia rinde una linea de 2 vertices, los
// atributos se mantienen constantes en la instancia y `gl_VertexID` cicla
// 0..1 dentro de ella.
//
// `samplerBuffer` y `texelFetch(samplerBuffer, int)` estan en GLSL 1.40+;
// 330 core nos vale (no necesitamos 4.30 — el issue exageraba).
static const char* k_edge_vert = R"(
#version 330 core
layout(location = 0) in vec2 a_pos;
layout(location = 1) in uint a_color;
out vec4 v_color;
layout(location = 0) in uint a_source;
layout(location = 1) in uint a_target;
layout(location = 2) in uint a_color;
// location 3 (flags) reservado en el buffer (16B alignment) pero no leido aqui.
uniform samplerBuffer u_node_pos;
uniform vec2 u_viewport;
uniform float u_scale;
uniform vec2 u_translate;
uniform float u_alpha; // edge_alpha
out vec4 v_color;
vec4 unpack_rgba8(uint c) {
return vec4(
@@ -173,11 +217,16 @@ vec4 unpack_rgba8(uint c) {
}
void main() {
vec2 screen = a_pos * u_scale + u_translate;
int idx = (gl_VertexID & 1) == 0 ? int(a_source) : int(a_target);
vec2 wpos = texelFetch(u_node_pos, idx).xy;
vec2 screen = wpos * u_scale + u_translate;
vec2 ndc = (screen / u_viewport) * 2.0 - 1.0;
ndc.y = -ndc.y;
gl_Position = vec4(ndc, 0.0, 1.0);
v_color = unpack_rgba8(a_color);
vec4 c = unpack_rgba8(a_color);
c.a *= u_alpha;
v_color = c;
}
)";
@@ -281,12 +330,19 @@ GraphRenderer* graph_renderer_create(int width, int height, const GraphRendererC
r->height = height;
r->config = config;
r->node_vbo_capacity = 0;
r->edge_vbo_capacity = 0;
r->node_staging = nullptr;
r->node_staging_cap = 0;
r->edge_staging = nullptr;
r->edge_staging_cap = 0;
r->node_vbo_capacity = 0;
r->node_pos_capacity = 0;
r->edge_static_capacity = 0;
r->node_staging = nullptr;
r->node_staging_cap = 0;
r->node_pos_staging = nullptr;
r->node_pos_staging_cap = 0;
r->edge_static_staging = nullptr;
r->edge_static_staging_cap = 0;
r->cached_edges_ptr = nullptr;
r->cached_edge_count = 0;
r->cached_edges_drawn = 0;
r->edges_uploaded = false;
// --- FBO ---
create_fbo(r);
@@ -330,27 +386,56 @@ GraphRenderer* graph_renderer_create(int width, int height, const GraphRendererC
glBindVertexArray(0);
// --- Edge VAO ---
// --- Edge VAO (vertex pulling, divisor=1 sobre el buffer estatico) ---
glGenVertexArrays(1, &r->edge_vao);
glBindVertexArray(r->edge_vao);
glGenBuffers(1, &r->edge_vbo);
glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo);
glEnableVertexAttribArray(0); // pos
glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE,
sizeof(EdgeVertex),
(void*)offsetof(EdgeVertex, x));
glEnableVertexAttribArray(1); // color (uint32)
glVertexAttribIPointer(1, 1, GL_UNSIGNED_INT,
sizeof(EdgeVertex),
(void*)offsetof(EdgeVertex, color));
// (source, target, color, flags) — los 4 con divisor=1.
glEnableVertexAttribArray(0);
glVertexAttribIPointer(0, 1, GL_UNSIGNED_INT, sizeof(EdgeStatic),
(void*)offsetof(EdgeStatic, source));
glVertexAttribDivisor(0, 1);
glEnableVertexAttribArray(1);
glVertexAttribIPointer(1, 1, GL_UNSIGNED_INT, sizeof(EdgeStatic),
(void*)offsetof(EdgeStatic, target));
glVertexAttribDivisor(1, 1);
glEnableVertexAttribArray(2);
glVertexAttribIPointer(2, 1, GL_UNSIGNED_INT, sizeof(EdgeStatic),
(void*)offsetof(EdgeStatic, color));
glVertexAttribDivisor(2, 1);
// location 3 reservado en el buffer pero no enabled — el shader actual
// no lo lee. Mantenemos el slot para futuros estilos/flechas.
glBindVertexArray(0);
// --- TBO de posiciones de nodos (RG32F, vec2 por nodo) ---
glGenBuffers(1, &r->node_pos_buf);
glBindBuffer(GL_TEXTURE_BUFFER, r->node_pos_buf);
// Reservamos capacidad inicial; se redimensiona en draw segun N.
glBufferData(GL_TEXTURE_BUFFER, 4096 * 2 * sizeof(float), nullptr, GL_STREAM_DRAW);
r->node_pos_capacity = 4096 * 2 * sizeof(float);
glGenTextures(1, &r->node_pos_tex);
glBindTexture(GL_TEXTURE_BUFFER, r->node_pos_tex);
glTexBuffer(GL_TEXTURE_BUFFER, GL_RG32F, r->node_pos_buf);
glBindTexture(GL_TEXTURE_BUFFER, 0);
glBindBuffer(GL_TEXTURE_BUFFER, 0);
// --- Shaders ---
r->node_shader = link_program(k_node_vert, k_node_frag);
r->edge_shader = link_program(k_edge_vert, k_edge_frag);
// Cachear locations de uniforms del edge shader (issue 0049d): se
// resuelven una vez en lugar de glGetUniformLocation cada frame.
r->edge_u_viewport_loc = glGetUniformLocation(r->edge_shader, "u_viewport");
r->edge_u_scale_loc = glGetUniformLocation(r->edge_shader, "u_scale");
r->edge_u_translate_loc = glGetUniformLocation(r->edge_shader, "u_translate");
r->edge_u_alpha_loc = glGetUniformLocation(r->edge_shader, "u_alpha");
r->edge_u_node_pos_loc = glGetUniformLocation(r->edge_shader, "u_node_pos");
return r;
}
@@ -362,10 +447,13 @@ void graph_renderer_destroy(GraphRenderer* r) {
glDeleteBuffers(1, &r->node_instance_vbo);
glDeleteVertexArrays(1, &r->edge_vao);
glDeleteBuffers(1, &r->edge_vbo);
glDeleteBuffers(1, &r->node_pos_buf);
glDeleteTextures(1, &r->node_pos_tex);
glDeleteProgram(r->node_shader);
glDeleteProgram(r->edge_shader);
free(r->node_staging);
free(r->edge_staging);
free(r->node_pos_staging);
free(r->edge_static_staging);
delete r;
}
@@ -417,72 +505,111 @@ unsigned int graph_renderer_draw(GraphRenderer* r, const GraphData& graph,
float vy1 = cam_y + half_h * (1.0f + margin);
// ----------------------------------------------------------------
// Draw edges (frustum-culled)
// Subir posiciones de nodos al TBO (vec2 por nodo). Lo necesitamos
// tanto si dibujamos aristas (vertex pulling) como antes de dibujar
// nodos — pero se calcula una sola vez por frame.
// ----------------------------------------------------------------
if (graph.edge_count > 0 && graph.edges && graph.nodes) {
// Asegurar staging — capacidad maxima posible en este frame es
// edge_count * 2 vertices. La realidad post-cull suele ser mucho
// menor, pero reservamos para el peor caso y no realocamos por
// frame.
size_t need_verts = (size_t)graph.edge_count * 2;
if (need_verts > r->edge_staging_cap) {
size_t new_cap = grow_capacity(r->edge_staging_cap, need_verts, 8192);
r->edge_staging = (EdgeVertex*)realloc(r->edge_staging, new_cap * sizeof(EdgeVertex));
r->edge_staging_cap = new_cap;
bool tbo_ready = false;
if (graph.node_count > 0 && graph.nodes) {
size_t need_floats = (size_t)graph.node_count * 2;
if (need_floats > r->node_pos_staging_cap) {
size_t new_cap = grow_capacity(r->node_pos_staging_cap, need_floats, 8192);
r->node_pos_staging = (float*)realloc(r->node_pos_staging, new_cap * sizeof(float));
r->node_pos_staging_cap = new_cap;
}
size_t out = 0;
for (int i = 0; i < graph.edge_count; ++i) {
const GraphEdge& e = graph.edges[i];
if (e.source >= (uint32_t)graph.node_count) continue;
if (e.target >= (uint32_t)graph.node_count) continue;
const GraphNode& ns = graph.nodes[e.source];
const GraphNode& nt = graph.nodes[e.target];
// Frustum cull: AABB del segmento (con margen para edges casi
// tangentes al viewport). Si el AABB no intersecta el viewport,
// skip — la arista no contribuye a ningun pixel visible.
float ex0 = std::min(ns.x, nt.x);
float ex1 = std::max(ns.x, nt.x);
float ey0 = std::min(ns.y, nt.y);
float ey1 = std::max(ns.y, nt.y);
if (ex1 < vx0 || ex0 > vx1 || ey1 < vy0 || ey0 > vy1) continue;
uint32_t ecol = e.color != 0 ? e.color : pack_rgba8(0x88, 0x88, 0x88, 0xFF);
uint32_t col = modulate_alpha_rgba8(ecol, r->config.edge_alpha);
r->edge_staging[out++] = { ns.x, ns.y, col };
r->edge_staging[out++] = { nt.x, nt.y, col };
for (int i = 0; i < graph.node_count; ++i) {
r->node_pos_staging[i * 2 + 0] = graph.nodes[i].x;
r->node_pos_staging[i * 2 + 1] = graph.nodes[i].y;
}
const size_t used_bytes = need_floats * sizeof(float);
if (used_bytes > r->node_pos_capacity) {
r->node_pos_capacity = grow_capacity(r->node_pos_capacity, used_bytes,
4096 * 2 * sizeof(float));
}
glBindBuffer(GL_TEXTURE_BUFFER, r->node_pos_buf);
// Orphan + subdata: misma estrategia que en 0049c, evita stall.
glBufferData(GL_TEXTURE_BUFFER, (GLsizeiptr)r->node_pos_capacity, nullptr, GL_STREAM_DRAW);
glBufferSubData(GL_TEXTURE_BUFFER, 0, (GLsizeiptr)used_bytes, r->node_pos_staging);
// glTexBuffer ya esta vinculado al buffer en create — el view sigue
// valido tras orphan: GL_TEXTURE_BUFFER referencia al BO por nombre.
glBindBuffer(GL_TEXTURE_BUFFER, 0);
tbo_ready = true;
}
if (out > 0) {
const size_t used_bytes = out * sizeof(EdgeVertex);
if (used_bytes > r->edge_vbo_capacity) {
r->edge_vbo_capacity = grow_capacity(r->edge_vbo_capacity, used_bytes,
8192 * sizeof(EdgeVertex));
// ----------------------------------------------------------------
// Aristas via vertex pulling. El buffer estatico solo se reupload
// cuando el grafo cambia — detectamos con (puntero, count).
// ----------------------------------------------------------------
if (tbo_ready && graph.edge_count > 0 && graph.edges) {
const bool graph_changed =
!r->edges_uploaded
|| r->cached_edges_ptr != (const void*)graph.edges
|| r->cached_edge_count != graph.edge_count;
if (graph_changed) {
// (Re)build el buffer estatico. Skipeamos aristas con indices
// fuera de rango — pueden aparecer durante una recarga parcial
// del grafo y no queremos que el GPU lea fuera del TBO.
if ((size_t)graph.edge_count > r->edge_static_staging_cap) {
size_t new_cap = grow_capacity(r->edge_static_staging_cap,
(size_t)graph.edge_count, 8192);
r->edge_static_staging = (EdgeStatic*)realloc(r->edge_static_staging,
new_cap * sizeof(EdgeStatic));
r->edge_static_staging_cap = new_cap;
}
size_t out = 0;
for (int i = 0; i < graph.edge_count; ++i) {
const GraphEdge& e = graph.edges[i];
if (e.source >= (uint32_t)graph.node_count) continue;
if (e.target >= (uint32_t)graph.node_count) continue;
uint32_t col = e.color != 0 ? e.color
: pack_rgba8(0x88, 0x88, 0x88, 0xFF);
r->edge_static_staging[out++] = { e.source, e.target, col, 0u };
}
if (out > 0) {
const size_t used_bytes = out * sizeof(EdgeStatic);
if (used_bytes > r->edge_static_capacity) {
r->edge_static_capacity = grow_capacity(r->edge_static_capacity,
used_bytes,
8192 * sizeof(EdgeStatic));
}
glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo);
glBufferData(GL_ARRAY_BUFFER, (GLsizeiptr)r->edge_static_capacity,
nullptr, GL_STATIC_DRAW);
glBufferSubData(GL_ARRAY_BUFFER, 0, (GLsizeiptr)used_bytes,
r->edge_static_staging);
}
r->cached_edges_ptr = (const void*)graph.edges;
r->cached_edge_count = graph.edge_count;
r->cached_edges_drawn = (int)out;
r->edges_uploaded = (out > 0);
}
if (r->edges_uploaded) {
glUseProgram(r->edge_shader);
glUniform2f(glGetUniformLocation(r->edge_shader, "u_viewport"),
(float)r->width, (float)r->height);
glUniform1f(glGetUniformLocation(r->edge_shader, "u_scale"), scale);
glUniform2f(glGetUniformLocation(r->edge_shader, "u_translate"), tx, ty);
glUniform2f(r->edge_u_viewport_loc, (float)r->width, (float)r->height);
glUniform1f(r->edge_u_scale_loc, scale);
glUniform2f(r->edge_u_translate_loc, tx, ty);
glUniform1f(r->edge_u_alpha_loc, r->config.edge_alpha);
// Bind TBO al sampler u_node_pos en la texture unit 0.
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_BUFFER, r->node_pos_tex);
glUniform1i(r->edge_u_node_pos_loc, 0);
glLineWidth(r->config.edge_width);
glBindVertexArray(r->edge_vao);
glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo);
// Orphan: descarta el contenido previo y damos al driver un
// buffer fresco con la capacidad reservada. Despues subimos
// solo los bytes realmente usados con BufferSubData — evitamos
// el sync stall y reutilizamos la asignacion entre frames
// mientras no crezca.
glBufferData(GL_ARRAY_BUFFER, (GLsizeiptr)r->edge_vbo_capacity, nullptr, GL_STREAM_DRAW);
glBufferSubData(GL_ARRAY_BUFFER, 0, (GLsizeiptr)used_bytes, r->edge_staging);
glDrawArrays(GL_LINES, 0, (GLsizei)out);
// Una "instancia" = 1 linea (2 vertices). gl_VertexID dentro
// de la instancia es 0 o 1 → elige endpoint source o target.
glDrawArraysInstanced(GL_LINES, 0, 2, (GLsizei)r->cached_edges_drawn);
glBindVertexArray(0);
glBindTexture(GL_TEXTURE_BUFFER, 0);
}
} else if (graph.edge_count == 0) {
// Si el caller borra todas las aristas, invalidamos el cache para
// que el siguiente upload reconstruya el buffer.
r->edges_uploaded = false;
}
// ----------------------------------------------------------------