feat(eda): wiring AutomaticEDA — build_eda_render_ctx + pipeline render_automatic_eda + profile_table(emit_automatic)

Conecta el motor AutomaticEDA con los datos crudos para que los 4 capítulos
dependientes de ctx (modelos, timeseries, geospatial, agregacion) salgan
POBLADOS en vez de degradar a una nota.

- build_eda_render_ctx (datascience, impure, dict-no-throw): dado db_path+table
  y el TableProfile agregado, construye el ctx con los datos crudos que el
  perfil no incluye: raw_numeric {col:[float|None]} alineado por fila (modelos /
  geospatial), timeseries_raw {time_col,t,series} vía extract_timeseries_raw,
  geo_points {lats,lons} desde el par lat/lon detectado, y db_path/table para el
  groupby/pivot push-down de agregacion. Muestrea con LIMIT (no trae la tabla
  entera a RAM). Compone detect_time_column / extract_timeseries_raw /
  detect_latlon_columns / duckdb_query_readonly (imports lazy para evitar ciclo).
- render_automatic_eda (pipeline): one-shot perfil -> ctx -> PDF + PPTX con los
  11 capítulos poblados; devuelve rutas + manifest de versiones por capítulo.
- profile_table: flag aditivo emit_automatic=True emite el AutomaticEDA PDF+PPTX
  además del flujo legacy (emit_pdf/render_eda_pdf intacto). Nuevas claves de
  retorno aeda_pdf_path / aeda_pptx_path / aeda_manifest_path.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-30 16:08:41 +02:00
parent f5b30b23dc
commit f3d427d9e4
9 changed files with 867 additions and 2 deletions
@@ -0,0 +1,91 @@
"""Test del pipeline render_automatic_eda — EDA completo a PDF + PPTX.
Self-contained: crea un DuckDB temporal pequeño con categóricas + fecha + lat/lon
+ varias numéricas, corre el pipeline (sin LLM) y verifica que emite PDF y PPTX
con páginas/slides y manifest, y que los capítulos dependientes de ctx figuran en
el manifest (proxy de que quedan POBLADOS, sin la nota de degradación).
"""
import os
import sys
_HERE = os.path.dirname(os.path.abspath(__file__))
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..")) # python/functions
if _FUNCTIONS not in sys.path:
sys.path.insert(0, _FUNCTIONS)
import duckdb # noqa: E402
from pipelines.render_automatic_eda import render_automatic_eda # noqa: E402
def _make_db(path):
    """Create a DuckDB file at *path* containing a synthetic ``sales`` table.

    The table has one date column (``d``), two categorical columns
    (``region``, ``channel``), two numeric measures (``amount``, ``units``)
    and a ``lat``/``lon`` pair. It is filled with 180 deterministic rows, one
    per consecutive day starting on 2024-01-01.

    The connection is always closed, even when creating or filling the table
    raises, so a failing run does not leave an open handle on the file.
    """
    from datetime import date, timedelta

    regions = ["norte", "sur", "este"]
    channels = ["web", "tienda"]
    # Per-region (lat, lon) centre; each row is offset slightly from it.
    centers = {"norte": (43.0, -3.0), "sur": (37.0, -5.0), "este": (39.5, -0.4)}
    d0 = date(2024, 1, 1)

    rows = []
    for i in range(180):
        r = regions[i % 3]
        ch = channels[i % 2]
        clat, clon = centers[r]
        rows.append((
            d0 + timedelta(days=i), r, ch,
            # Weekly-cycling amount, with a small uplift for the "web" channel.
            round(100 + (i % 7) * 13.5 + (5 if ch == "web" else 0), 2),
            10 + (i % 5),
            round(clat + (i % 3) * 0.1, 4),
            round(clon + (i % 4) * 0.1, 4),
        ))

    con = duckdb.connect(path)
    try:
        con.execute(
            "CREATE TABLE sales (d DATE, region VARCHAR, channel VARCHAR, "
            "amount DOUBLE, units INTEGER, lat DOUBLE, lon DOUBLE)"
        )
        con.executemany("INSERT INTO sales VALUES (?,?,?,?,?,?,?)", rows)
    finally:
        # Release the file handle whether or not the statements succeeded.
        con.close()
def test_pipeline_emits_pdf_and_pptx_with_chapters(tmp_path):
    """Run the pipeline on the synthetic table and check every artefact exists.

    Expects an ``ok`` status, a PDF and a PPTX on disk, positive page and
    slide counts, and a manifest file on disk.
    """
    db_file = str(tmp_path / "sales.duckdb")
    _make_db(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file,
        "sales",
        run_models=True,
        run_series=True,
        run_llm=False,
        out_dir=out_dir,
        basename="test_sales",
    )

    # Surface the pipeline's own error text when the status is not "ok".
    assert result["status"] == "ok", result.get("error")

    # Each output format must be reported and present on disk.
    for path_key in ("pdf_path", "pptx_path"):
        artefact = result[path_key]
        assert artefact and os.path.exists(artefact)

    # A missing (None) count is treated as zero and therefore fails.
    for count_key in ("n_pages", "n_slides"):
        assert (result[count_key] or 0) > 0

    # The per-chapter manifest must have been written as well.
    manifest_file = result["manifest_path"]
    assert manifest_file and os.path.exists(manifest_file)
def test_pipeline_chapters_populated_not_degraded(tmp_path):
    """Check that the four ctx-dependent chapters appear in the manifest.

    NOTE(review): this only asserts that each chapter id is present under the
    manifest's ``chapters`` entry; it does not inspect chapter content for a
    degradation note.
    """
    import json

    db_file = str(tmp_path / "sales.duckdb")
    _make_db(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file,
        "sales",
        run_models=True,
        run_series=True,
        run_llm=False,
        out_dir=out_dir,
        basename="t2",
    )
    assert result["status"] == "ok"

    # Load the manifest the pipeline reported and read its chapter listing.
    with open(result["manifest_path"], encoding="utf-8") as fh:
        manifest = json.loads(fh.read())

    # A missing or empty "chapters" entry is treated as no chapters.
    chapters = manifest.get("chapters") or {}
    expected_ids = ("modelos", "timeseries", "geospatial", "agregacion")
    for cid in expected_ids:
        assert cid in chapters, f"capítulo {cid} ausente del manifest: {list(chapters)}"
def test_pipeline_bad_db_degrades_without_raising(tmp_path):
    """A never-created database file and table yield an error dict, not an exception."""
    missing_db = str(tmp_path / "nope.duckdb")
    out_dir = str(tmp_path / "o")

    result = render_automatic_eda(missing_db, "ghost", out_dir=out_dir)

    # The failure is reported through the return value.
    assert result["status"] == "error"
    assert "error" in result