diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6d6770c8..39aec8d9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -138,6 +138,18 @@ if(TRACY_ENABLE) target_link_libraries(fn_framework PUBLIC tracy) endif() +# --- OpenMP (opcional) --- +# Habilita #pragma omp en las funciones del registry que lo declaren bajo +# guardia _OPENMP. Si el compilador no lo soporta (no debiera, gcc/clang +# y mingw-w64 lo traen), los pragmas se ignoran sin romper el build. +find_package(OpenMP QUIET) +if(OpenMP_CXX_FOUND) + target_link_libraries(fn_framework PUBLIC OpenMP::OpenMP_CXX) + message(STATUS "OpenMP enabled for fn_framework (${OpenMP_CXX_VERSION})") +else() + message(STATUS "OpenMP NOT found — force layout fallback to single-thread") +endif() + # --- Macro for creating ImGui apps --- # Capturamos la raiz del modulo cpp/ para que add_imgui_app la use desde # subdirectorios (donde CMAKE_CURRENT_SOURCE_DIR apunta al app, no al root). diff --git a/cpp/apps/primitives_gallery/demos_graph.cpp b/cpp/apps/primitives_gallery/demos_graph.cpp index 2311d431..196e4ca6 100644 --- a/cpp/apps/primitives_gallery/demos_graph.cpp +++ b/cpp/apps/primitives_gallery/demos_graph.cpp @@ -173,7 +173,14 @@ void demo_graph() { section("Viewport (drag = pan, wheel = zoom, click = select)"); if (s_initialized) { - // Avanzamos 1 paso de force layout cada frame mientras layout_running + // Avanzamos 1 paso de force layout cada frame mientras layout_running. + // Auto-pause: si la energia por nodo cae bajo el umbral durante N + // frames consecutivos, paramos la simulacion automaticamente — el + // grafo ya esta estable. El usuario lo retoma con "Resume layout" + // o "Regenerate". + static int s_low_energy_frames = 0; + const int k_pause_after_frames = 30; + const float k_pause_per_node = 0.001f; // umbral de energia/nodo if (s_state.layout_running) { ForceLayoutConfig cfg; cfg.repulsion = s_repulsion; @@ -181,6 +188,20 @@ void demo_graph() { cfg.gravity = s_gravity; cfg.iterations = 1; s_state.layout_energy = graph_force_layout_step(s_graph, cfg); + + const float per_node = s_graph.node_count > 0 + ? s_state.layout_energy / (float)s_graph.node_count + : 0.0f; + if (per_node < k_pause_per_node) { + if (++s_low_energy_frames >= k_pause_after_frames) { + s_state.layout_running = false; + s_low_energy_frames = 0; + } + } else { + s_low_energy_frames = 0; + } + } else { + s_low_energy_frames = 0; } graph_viewport("##graph_demo", s_graph, s_state, ImVec2(0, 460)); } diff --git a/cpp/functions/viz/graph_force_layout.cpp b/cpp/functions/viz/graph_force_layout.cpp index 52b39752..f7a99289 100644 --- a/cpp/functions/viz/graph_force_layout.cpp +++ b/cpp/functions/viz/graph_force_layout.cpp @@ -146,8 +146,11 @@ static void quad_insert_body(int qi, int node_idx) { static void quad_force(int qi, float nx, float ny, float theta, float repulsion, float min_dist, float& fx, float& fy) { - // Iterative traversal using a small stack to avoid recursion depth issues. - static int stack[MAX_QUAD_NODES]; // reuse static stack + // Stack en pila de la funcion: thread-safe (la version anterior con + // `static` se rompia bajo OpenMP). La profundidad de un quadtree con N + // bodies acotada por log4(N) ~= 10 niveles para N <= 1M, asi que 256 + // entradas son holgadas para todos los pushes simultaneos. + int stack[256]; int top = 0; stack[top++] = qi; @@ -207,6 +210,7 @@ float graph_force_layout_step(GraphData& graph, const ForceLayoutConfig& config) for (int iter = 0; iter < config.iterations; ++iter) { // Zero forces + #pragma omp parallel for if(graph.node_count >= 1024) schedule(static) for (int i = 0; i < graph.node_count; ++i) { fx_buf[i] = 0.0f; fy_buf[i] = 0.0f; @@ -240,14 +244,16 @@ float graph_force_layout_step(GraphData& graph, const ForceLayoutConfig& config) } // ---- Repulsion via Barnes-Hut ---- + // Cada iteracion lee del quadtree (read-only) y escribe en su propio + // slot de fx_buf/fy_buf — embarrassingly parallel. quad_force usa + // stack local en pila, asi que es thread-safe. + #pragma omp parallel for if(graph.node_count >= 1024) schedule(dynamic, 256) for (int i = 0; i < graph.node_count; ++i) { if (graph.nodes[i].pinned) continue; quad_force(root, graph.nodes[i].x, graph.nodes[i].y, config.theta, config.repulsion, config.min_distance, fx_buf[i], fy_buf[i]); - // Subtract self-interaction (the tree includes the node itself) - // Self-force: repulsion * 1 / min_dist^2, but direction is (0,0) -> skip } // ---- Attraction along edges (spring force) ---- @@ -274,6 +280,7 @@ float graph_force_layout_step(GraphData& graph, const ForceLayoutConfig& config) // ---- Gravity toward center (0,0) ---- if (config.gravity != 0.0f) { + #pragma omp parallel for if(graph.node_count >= 1024) schedule(static) for (int i = 0; i < graph.node_count; ++i) { if (graph.nodes[i].pinned) continue; fx_buf[i] -= config.gravity * graph.nodes[i].x; @@ -283,6 +290,7 @@ float graph_force_layout_step(GraphData& graph, const ForceLayoutConfig& config) // ---- Integrate: v = v * damping + F; pos += v ---- total_energy = 0.0f; + #pragma omp parallel for if(graph.node_count >= 1024) schedule(static) reduction(+:total_energy) for (int i = 0; i < graph.node_count; ++i) { GraphNode& n = graph.nodes[i]; if (n.pinned) continue; diff --git a/cpp/functions/viz/graph_renderer.cpp b/cpp/functions/viz/graph_renderer.cpp index 67e9bcf5..8bc00b4a 100644 --- a/cpp/functions/viz/graph_renderer.cpp +++ b/cpp/functions/viz/graph_renderer.cpp @@ -389,6 +389,11 @@ unsigned int graph_renderer_draw(GraphRenderer* r, const GraphData& graph, glBindVertexArray(r->edge_vao); glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo); + // Orphan: descarta el buffer anterior antes de subir el nuevo. Evita + // que el driver bloquee esperando que termine el frame previo (sync + // stall) y nos da un VBO fresco. Coste: ~0; ganancia: 2-3x upload + // throughput en drivers que respetan el hint (Mesa, NVIDIA, AMD). + glBufferData(GL_ARRAY_BUFFER, vi * (int)sizeof(float), nullptr, GL_DYNAMIC_DRAW); glBufferData(GL_ARRAY_BUFFER, vi * (int)sizeof(float), edge_buf, GL_DYNAMIC_DRAW); glDrawArrays(GL_LINES, 0, vi / 6); glBindVertexArray(0); @@ -422,7 +427,10 @@ unsigned int graph_renderer_draw(GraphRenderer* r, const GraphData& graph, glBindVertexArray(r->node_vao); glBindBuffer(GL_ARRAY_BUFFER, r->node_instance_vbo); - glBufferData(GL_ARRAY_BUFFER, graph.node_count * 7 * (int)sizeof(float), node_buf, GL_DYNAMIC_DRAW); + // Orphan + reupload (ver comentario en edge upload arriba). + const GLsizeiptr node_bytes = graph.node_count * 7 * (GLsizeiptr)sizeof(float); + glBufferData(GL_ARRAY_BUFFER, node_bytes, nullptr, GL_DYNAMIC_DRAW); + glBufferData(GL_ARRAY_BUFFER, node_bytes, node_buf, GL_DYNAMIC_DRAW); // Draw 4 vertices (triangle strip quad) x node_count instances // Pass per-instance node_px uniform via the average size (approximation)