diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6d6770c8..39aec8d9 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -138,6 +138,18 @@ if(TRACY_ENABLE)
     target_link_libraries(fn_framework PUBLIC tracy)
 endif()
 
+# --- OpenMP (optional) ---
+# Enables the `#pragma omp` directives used by registry functions. If OpenMP
+# is not found (unexpected: gcc, clang and mingw-w64 all ship it) the
+# compiler ignores those pragmas and the build still works, single-threaded.
+find_package(OpenMP QUIET)
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(fn_framework PUBLIC OpenMP::OpenMP_CXX)
+    message(STATUS "OpenMP enabled for fn_framework (${OpenMP_CXX_VERSION})")
+else()
+    message(STATUS "OpenMP NOT found — force layout fallback to single-thread")
+endif()
+
 # --- Macro for creating ImGui apps ---
 # Capturamos la raiz del modulo cpp/ para que add_imgui_app la use desde
 # subdirectorios (donde CMAKE_CURRENT_SOURCE_DIR apunta al app, no al root).
diff --git a/cpp/apps/primitives_gallery/demos_graph.cpp b/cpp/apps/primitives_gallery/demos_graph.cpp
index 2311d431..196e4ca6 100644
--- a/cpp/apps/primitives_gallery/demos_graph.cpp
+++ b/cpp/apps/primitives_gallery/demos_graph.cpp
@@ -173,7 +173,14 @@ void demo_graph() {
 
     section("Viewport (drag = pan, wheel = zoom, click = select)");
     if (s_initialized) {
-        // Avanzamos 1 paso de force layout cada frame mientras layout_running
+        // Advance one force-layout step per frame while layout_running is set.
+        // Auto-pause: once energy per node stays below k_pause_per_node for
+        // k_pause_after_frames consecutive frames, layout_running is cleared.
+        // Presumably the user restarts it via the "Resume layout" or
+        // "Regenerate" controls (defined outside this hunk).
+        static int s_low_energy_frames = 0;
+        const int   k_pause_after_frames = 30;
+        const float k_pause_per_node     = 0.001f; // energy-per-node threshold
         if (s_state.layout_running) {
             ForceLayoutConfig cfg;
             cfg.repulsion  = s_repulsion;
@@ -181,6 +188,20 @@ void demo_graph() {
             cfg.gravity    = s_gravity;
             cfg.iterations = 1;
             s_state.layout_energy = graph_force_layout_step(s_graph, cfg);
+
+            const float per_node = s_graph.node_count > 0
+                ? s_state.layout_energy / (float)s_graph.node_count
+                : 0.0f;
+            if (per_node < k_pause_per_node) {
+                if (++s_low_energy_frames >= k_pause_after_frames) {
+                    s_state.layout_running = false;
+                    s_low_energy_frames = 0;
+                }
+            } else {
+                s_low_energy_frames = 0;
+            }
+        } else {
+            s_low_energy_frames = 0;
         }
         graph_viewport("##graph_demo", s_graph, s_state, ImVec2(0, 460));
     }
diff --git a/cpp/functions/viz/graph_force_layout.cpp b/cpp/functions/viz/graph_force_layout.cpp
index 52b39752..f7a99289 100644
--- a/cpp/functions/viz/graph_force_layout.cpp
+++ b/cpp/functions/viz/graph_force_layout.cpp
@@ -146,8 +146,11 @@ static void quad_insert_body(int qi, int node_idx) {
 static void quad_force(int qi, float nx, float ny,
                        float theta, float repulsion, float min_dist,
                        float& fx, float& fy) {
-    // Iterative traversal using a small stack to avoid recursion depth issues.
-    static int stack[MAX_QUAD_NODES]; // reuse static stack
+    // Traversal stack is a plain local array so concurrent callers do not
+    // share it (it was previously a function-level `static`).
+    // NOTE(review): quadtree depth depends on how tightly bodies cluster, not
+    // on log4(N); confirm the pushes below cannot exceed 256 entries.
+    int stack[256];
     int top = 0;
     stack[top++] = qi;
 
@@ -207,6 +210,7 @@ float graph_force_layout_step(GraphData& graph, const ForceLayoutConfig& config)
 
     for (int iter = 0; iter < config.iterations; ++iter) {
         // Zero forces
+        #pragma omp parallel for if(graph.node_count >= 1024) schedule(static)
         for (int i = 0; i < graph.node_count; ++i) {
             fx_buf[i] = 0.0f;
             fy_buf[i] = 0.0f;
@@ -240,14 +244,16 @@ float graph_force_layout_step(GraphData& graph, const ForceLayoutConfig& config)
         }
 
         // ---- Repulsion via Barnes-Hut ----
+        // Each iteration passes only its own fx_buf[i]/fy_buf[i] slot to
+        // quad_force as the output, so iterations do not share accumulators.
+        // NOTE(review): assumes quad_force only reads the quadtree -- confirm.
+        #pragma omp parallel for if(graph.node_count >= 1024) schedule(dynamic, 256)
         for (int i = 0; i < graph.node_count; ++i) {
             if (graph.nodes[i].pinned) continue;
             quad_force(root,
                        graph.nodes[i].x, graph.nodes[i].y,
                        config.theta, config.repulsion, config.min_distance,
                        fx_buf[i], fy_buf[i]);
-            // Subtract self-interaction (the tree includes the node itself)
-            // Self-force: repulsion * 1 / min_dist^2, but direction is (0,0) -> skip
         }
 
         // ---- Attraction along edges (spring force) ----
@@ -274,6 +280,7 @@ float graph_force_layout_step(GraphData& graph, const ForceLayoutConfig& config)
 
         // ---- Gravity toward center (0,0) ----
         if (config.gravity != 0.0f) {
+            #pragma omp parallel for if(graph.node_count >= 1024) schedule(static)
             for (int i = 0; i < graph.node_count; ++i) {
                 if (graph.nodes[i].pinned) continue;
                 fx_buf[i] -= config.gravity * graph.nodes[i].x;
@@ -283,6 +290,7 @@ float graph_force_layout_step(GraphData& graph, const ForceLayoutConfig& config)
 
         // ---- Integrate: v = v * damping + F; pos += v ----
         total_energy = 0.0f;
+        #pragma omp parallel for if(graph.node_count >= 1024) schedule(static) reduction(+:total_energy)
         for (int i = 0; i < graph.node_count; ++i) {
             GraphNode& n = graph.nodes[i];
             if (n.pinned) continue;
diff --git a/cpp/functions/viz/graph_renderer.cpp b/cpp/functions/viz/graph_renderer.cpp
index 67e9bcf5..8bc00b4a 100644
--- a/cpp/functions/viz/graph_renderer.cpp
+++ b/cpp/functions/viz/graph_renderer.cpp
@@ -389,6 +389,11 @@ unsigned int graph_renderer_draw(GraphRenderer* r, const GraphData& graph,
 
         glBindVertexArray(r->edge_vao);
         glBindBuffer(GL_ARRAY_BUFFER, r->edge_vbo);
-        glBufferData(GL_ARRAY_BUFFER, vi * (int)sizeof(float), edge_buf, GL_DYNAMIC_DRAW);
+        // Orphan the old storage first (glBufferData with nullptr) so the
+        // driver can hand back fresh memory instead of waiting on draws that
+        // still read the previous contents, then fill it with glBufferSubData.
+        // A second glBufferData here would re-specify the buffer yet again.
+        glBufferData(GL_ARRAY_BUFFER, vi * (int)sizeof(float), nullptr, GL_DYNAMIC_DRAW);
+        glBufferSubData(GL_ARRAY_BUFFER, 0, vi * (int)sizeof(float), edge_buf);
         glDrawArrays(GL_LINES, 0, vi / 6);
         glBindVertexArray(0);
@@ -422,7 +427,10 @@ unsigned int graph_renderer_draw(GraphRenderer* r, const GraphData& graph,
 
         glBindVertexArray(r->node_vao);
         glBindBuffer(GL_ARRAY_BUFFER, r->node_instance_vbo);
-        glBufferData(GL_ARRAY_BUFFER, graph.node_count * 7 * (int)sizeof(float), node_buf, GL_DYNAMIC_DRAW);
+        // Orphan the old storage, then fill the fresh one (avoids a sync stall).
+        const GLsizeiptr node_bytes = graph.node_count * 7 * (GLsizeiptr)sizeof(float);
+        glBufferData(GL_ARRAY_BUFFER, node_bytes, nullptr,  GL_DYNAMIC_DRAW);
+        glBufferSubData(GL_ARRAY_BUFFER, 0, node_bytes, node_buf);
 
         // Draw 4 vertices (triangle strip quad) x node_count instances
         // Pass per-instance node_px uniform via the average size (approximation)