feat(eda): wiring AutomaticEDA — build_eda_render_ctx + pipeline render_automatic_eda + profile_table(emit_automatic)

Conecta el motor AutomaticEDA con los datos crudos para que los 4 capítulos dependientes de ctx (modelos, timeseries, geospatial, agregacion) salgan POBLADOS en vez de degradar a una nota. - build_eda_render_ctx (datascience, impure, dict-no-throw): dado db_path+table y el TableProfile agregado, construye el ctx con los datos crudos que el perfil no incluye: raw_numeric {col:[float|None]} alineado por fila (modelos / geospatial), timeseries_raw {time_col,t,series} vía extract_timeseries_raw, geo_points {lats,lons} desde el par lat/lon detectado, y db_path/table para el groupby/pivot push-down de agregacion. Muestrea con LIMIT (no trae la tabla entera a RAM). Compone detect_time_column / extract_timeseries_raw / detect_latlon_columns / duckdb_query_readonly (imports lazy para evitar ciclo). - render_automatic_eda (pipeline): one-shot perfil -> ctx -> PDF + PPTX con los 11 capítulos poblados; devuelve rutas + manifest de versiones por capítulo. - profile_table: flag aditivo emit_automatic=True emite el AutomaticEDA PDF+PPTX además del flujo legacy (emit_pdf/render_eda_pdf intacto). Nuevas claves de retorno aeda_pdf_path / aeda_pptx_path / aeda_manifest_path. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 16:08:41 +02:00
parent f5b30b23dc
commit f3d427d9e4
9 changed files with 867 additions and 2 deletions
@@ -0,0 +1,91 @@
+"""Test del pipeline render_automatic_eda — EDA completo a PDF + PPTX.
+
+Self-contained: crea un DuckDB temporal pequeño con categóricas + fecha + lat/lon
+ varias numéricas, corre el pipeline (sin LLM) y verifica que emite PDF y PPTX
+con páginas/slides, manifest, y que los capítulos dependientes de ctx quedan
+POBLADOS (sin la nota de degradación).
+"""
+
+import os
+import sys
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", ".."))  # python/functions
+if _FUNCTIONS not in sys.path:
+    sys.path.insert(0, _FUNCTIONS)
+
+import duckdb  # noqa: E402
+
+from pipelines.render_automatic_eda import render_automatic_eda  # noqa: E402
+
+
+def _make_db(path):
+    """Create a DuckDB file at ``path`` with a deterministic 180-row ``sales`` table."""
+    from datetime import date, timedelta
+
+    regions = ["norte", "sur", "este"]
+    channels = ["web", "tienda"]
+    centers = {"norte": (43.0, -3.0), "sur": (37.0, -5.0), "este": (39.5, -0.4)}
+    rows = []
+    d0 = date(2024, 1, 1)
+    for i in range(180):  # one row per consecutive day, starting at d0
+        r = regions[i % 3]
+        ch = channels[i % 2]
+        clat, clon = centers[r]  # base (lat, lon) for the row's region
+        rows.append((
+            d0 + timedelta(days=i), r, ch,
+            round(100 + (i % 7) * 13.5 + (5 if ch == "web" else 0), 2),
+            10 + (i % 5),
+            round(clat + (i % 3) * 0.1, 4),
+            round(clon + (i % 4) * 0.1, 4),
+        ))
+    with duckdb.connect(path) as con:  # closed even if a statement below raises
+        con.execute(
+            "CREATE TABLE sales (d DATE, region VARCHAR, channel VARCHAR, "
+            "amount DOUBLE, units INTEGER, lat DOUBLE, lon DOUBLE)"
+        )
+        con.executemany("INSERT INTO sales VALUES (?,?,?,?,?,?,?)", rows)
+
+
+def test_pipeline_emits_pdf_and_pptx_with_chapters(tmp_path):
+    """A run on a fresh sales DB must report an existing PDF, PPTX and manifest."""
+    db_file = str(tmp_path / "sales.duckdb")
+    _make_db(db_file)
+    result = render_automatic_eda(
+        db_file, "sales", run_models=True, run_series=True,
+        run_llm=False, out_dir=str(tmp_path / "out"), basename="test_sales",
+    )
+    assert result["status"] == "ok", result.get("error")
+    # Both output files exist, each with a positive page/slide count.
+    for path_key in ("pdf_path", "pptx_path"):
+        assert result[path_key] and os.path.exists(result[path_key])
+    for count_key in ("n_pages", "n_slides"):
+        assert (result[count_key] or 0) > 0
+    manifest_file = result["manifest_path"]
+    assert manifest_file and os.path.exists(manifest_file)
+
+
+def test_pipeline_chapters_populated_not_degraded(tmp_path):
+    """The 4 ctx-dependent chapters must be listed in the run's manifest.
+
+    NOTE(review): only presence is asserted; the absence of a degradation note
+    is not checked, since the manifest entry schema is not visible here.
+    """
+    import json
+
+    db = str(tmp_path / "sales.duckdb")
+    _make_db(db)
+    r = render_automatic_eda(db, "sales", run_models=True, run_series=True, run_llm=False,
+                             out_dir=str(tmp_path / "out"), basename="t2")
+    assert r["status"] == "ok", r.get("error")  # show the pipeline's own error on failure
+    with open(r["manifest_path"], encoding="utf-8") as fh:
+        chapters = json.load(fh).get("chapters") or {}
+    for cid in ("modelos", "timeseries", "geospatial", "agregacion"):
+        assert cid in chapters, f"capítulo {cid} ausente del manifest: {list(chapters)}"
+
+
+def test_pipeline_bad_db_degrades_without_raising(tmp_path):
+    """A never-created DB path must come back as an error dict, not an exception."""
+    bad_db, out_dir = tmp_path / "nope.duckdb", tmp_path / "o"
+    outcome = render_automatic_eda(str(bad_db), "ghost", out_dir=str(out_dir))
+    assert outcome["status"] == "error" and "error" in outcome