"""Test del pipeline render_automatic_eda — EDA completo a PDF + PPTX. Self-contained: crea un DuckDB temporal pequeño con categóricas + fecha + lat/lon + varias numéricas, corre el pipeline (sin LLM) y verifica que emite PDF y PPTX con páginas/slides, manifest, y que los capítulos dependientes de ctx quedan POBLADOS (sin la nota de degradación). """ import os import sys _HERE = os.path.dirname(os.path.abspath(__file__)) _FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..")) # python/functions if _FUNCTIONS not in sys.path: sys.path.insert(0, _FUNCTIONS) import duckdb # noqa: E402 from pipelines.render_automatic_eda import render_automatic_eda # noqa: E402 def _make_db(path): con = duckdb.connect(path) con.execute( "CREATE TABLE sales (d DATE, region VARCHAR, channel VARCHAR, " "amount DOUBLE, units INTEGER, lat DOUBLE, lon DOUBLE)" ) from datetime import date, timedelta regions = ["norte", "sur", "este"] channels = ["web", "tienda"] centers = {"norte": (43.0, -3.0), "sur": (37.0, -5.0), "este": (39.5, -0.4)} rows = [] d0 = date(2024, 1, 1) for i in range(180): r = regions[i % 3] ch = channels[i % 2] clat, clon = centers[r] rows.append(( d0 + timedelta(days=i), r, ch, round(100 + (i % 7) * 13.5 + (5 if ch == "web" else 0), 2), 10 + (i % 5), round(clat + (i % 3) * 0.1, 4), round(clon + (i % 4) * 0.1, 4), )) con.executemany("INSERT INTO sales VALUES (?,?,?,?,?,?,?)", rows) con.close() def test_pipeline_emits_pdf_and_pptx_with_chapters(tmp_path): db = str(tmp_path / "sales.duckdb") _make_db(db) out = str(tmp_path / "out") r = render_automatic_eda(db, "sales", run_models=True, run_series=True, run_llm=False, out_dir=out, basename="test_sales") assert r["status"] == "ok", r.get("error") # Both formats produced. assert r["pdf_path"] and os.path.exists(r["pdf_path"]) assert r["pptx_path"] and os.path.exists(r["pptx_path"]) assert (r["n_pages"] or 0) > 0 assert (r["n_slides"] or 0) > 0 # Per-chapter manifest written next to the output. assert r["manifest_path"] and os.path.exists(r["manifest_path"]) def test_pipeline_chapters_populated_not_degraded(tmp_path): """The 4 ctx-dependent chapters build with real data (no degradation note).""" import json db = str(tmp_path / "sales.duckdb") _make_db(db) out = str(tmp_path / "out") r = render_automatic_eda(db, "sales", run_models=True, run_series=True, run_llm=False, out_dir=out, basename="t2") assert r["status"] == "ok" # The manifest lists the ctx-dependent chapters as actually rendered. with open(r["manifest_path"], encoding="utf-8") as fh: man = json.load(fh) chapters = man.get("chapters") or {} for cid in ("modelos", "timeseries", "geospatial", "agregacion"): assert cid in chapters, f"capítulo {cid} ausente del manifest: {list(chapters)}" def test_pipeline_bad_db_degrades_without_raising(tmp_path): r = render_automatic_eda(str(tmp_path / "nope.duckdb"), "ghost", out_dir=str(tmp_path / "o")) assert r["status"] == "error" assert "error" in r # --------------------------------------------------------------------------- # # profile_level: preset de bajo consumo CPU/LLM. # --------------------------------------------------------------------------- # def _make_db_models(path): """DB con >=2 numéricas continuas (alta cardinalidad, 3 clusters gaussianos). El DB `sales` de _make_db solo deja UNA columna de modelo tras la selección de features (units es baja cardinalidad, lat/lon discretizadas), insuficiente para PCA/KMeans/IsolationForest (necesitan >=2). Este DB sí tiene 3 numéricas continuas con estructura de clusters para que el modo completo ejecute los multivariantes. """ import random from datetime import date, timedelta con = duckdb.connect(path) con.execute( "CREATE TABLE pts (d DATE, grp VARCHAR, x1 DOUBLE, x2 DOUBLE, x3 DOUBLE)" ) random.seed(42) centers = [(0.0, 0.0, 0.0), (10.0, 10.0, 10.0), (20.0, 5.0, 15.0)] d0 = date(2024, 1, 1) rows = [] for i in range(150): cx, cy, cz = centers[i % 3] rows.append(( d0 + timedelta(days=i), f"g{i % 3}", round(cx + random.gauss(0, 1.0), 4), round(cy + random.gauss(0, 1.0), 4), round(cz + random.gauss(0, 1.0), 4), )) con.executemany("INSERT INTO pts VALUES (?,?,?,?,?)", rows) con.close() def test_profile_level_lite_skips_expensive_models(tmp_path): """lite: el bloque models trae PCA + normalidad pero NO KMeans/IsolationForest. Demuestra (DoD bajo consumo) que el camino lite no ejecuta los modelos caros en CPU ni la capa LLM ni la serie temporal: prof['models'] queda con pca y normality poblados y kmeans/outliers a None, prof['llm'] y prof['series'] a None, y el capítulo `modelos` se renderiza igualmente (con PCA, sin clusters). """ import json db = str(tmp_path / "pts.duckdb") _make_db_models(db) out = str(tmp_path / "out") r = render_automatic_eda(db, "pts", profile_level="lite", out_dir=out, basename="lite") assert r["status"] == "ok", r.get("error") models = (r["profile"] or {}).get("models") or {} assert models.get("pca") is not None, "lite debe traer PCA" assert models.get("normality") is not None, "lite debe traer normalidad" assert models.get("kmeans") is None, "lite NO debe ejecutar KMeans" assert models.get("outliers") is None, "lite NO debe ejecutar IsolationForest" assert (r["profile"] or {}).get("llm") is None, "lite NO debe llamar al LLM" assert (r["profile"] or {}).get("series") is None, "lite NO debe calcular serie" # El capítulo modelos sigue presente (lo puebla el PCA), sin clusters KMeans. with open(r["manifest_path"], encoding="utf-8") as fh: man = json.load(fh) assert "modelos" in (man.get("chapters") or {}) def test_profile_level_standard_runs_full_models(tmp_path): """standard (default): modelos completos (KMeans + IsolationForest) y serie.""" db = str(tmp_path / "pts.duckdb") _make_db_models(db) out = str(tmp_path / "out") r = render_automatic_eda(db, "pts", profile_level="standard", out_dir=out, basename="std") assert r["status"] == "ok", r.get("error") models = (r["profile"] or {}).get("models") or {} assert models.get("pca") is not None assert models.get("kmeans") is not None, "standard debe ejecutar KMeans" assert models.get("outliers") is not None, "standard debe ejecutar IsolationForest" assert (r["profile"] or {}).get("series") is not None, "standard calcula serie" def _patch_pipeline_internals(monkeypatch, captured): """Stub de las dependencias del pipeline para tests de resolución de flags. Sustituye profile_table / build_eda_render_ctx / renderers por stubs rápidos sin red ni matplotlib, capturando los kwargs con los que se invocan. Permite verificar la PRECEDENCIA flag-explícito-sobre-preset sin ejecutar el EDA real. """ import pipelines.render_automatic_eda as mod def fake_profile_table(db_path, table, **kw): captured["run_llm"] = kw.get("run_llm") captured["run_models"] = kw.get("run_models") captured["run_series"] = kw.get("run_series") captured["sample"] = kw.get("sample") return {"status": "ok", "profile": {"columns": []}} def fake_ctx(db_path, table, prof, **kw): captured["base_ctx"] = kw.get("base_ctx") return {} monkeypatch.setattr(mod, "profile_table", fake_profile_table) monkeypatch.setattr(mod, "build_eda_render_ctx", fake_ctx) monkeypatch.setattr(mod, "render_automatic_eda_pdf", lambda *a, **k: {"path": "x.pdf", "n_pages": 1, "manifest_path": "m.json"}) monkeypatch.setattr(mod, "render_automatic_eda_pptx", lambda *a, **k: {"path": "x.pptx", "n_slides": 1}) def test_explicit_flag_overrides_preset(monkeypatch): """Precedencia: profile_level='lite' con run_llm=True explícito → LLM activo. El flag explícito del caller gana al default del preset. Se verifica tanto en el flag que llega a profile_table (run_llm=True ⇒ profile_table llamará al LLM) como en el base_ctx (run_cluster_llm=True ⇒ narrativa LLM por capítulo). """ captured = {} _patch_pipeline_internals(monkeypatch, captured) captured.clear() render_automatic_eda("db", "t", profile_level="lite", run_llm=True) assert captured["run_llm"] is True, "flag explícito debe primar sobre preset lite" assert (captured["base_ctx"] or {}).get("run_cluster_llm") is True def test_full_preset_enables_llm(monkeypatch): """full: el preset resuelve run_llm=True y activa la narrativa LLM en el ctx.""" captured = {} _patch_pipeline_internals(monkeypatch, captured) captured.clear() render_automatic_eda("db", "t", profile_level="full") assert captured["run_llm"] is True assert (captured["base_ctx"] or {}).get("run_cluster_llm") is True def test_no_profile_level_defaults_to_standard(monkeypatch): """Retro-compat: sin profile_level ni flags, el comportamiento es el histórico. standard = run_models True, run_series True, run_llm False, sample 5000. Es el mismo default que tenía el pipeline antes de introducir profile_level (cambio aditivo: las llamadas existentes no cambian de comportamiento). """ captured = {} _patch_pipeline_internals(monkeypatch, captured) captured.clear() render_automatic_eda("db", "t") # sin profile_level ni flags de coste assert captured["run_models"] is True assert captured["run_series"] is True assert captured["run_llm"] is False assert captured["sample"] == 5000 def test_lite_preset_defaults(monkeypatch): """lite por defecto: run_llm/run_series False y sample reducido a 2000.""" captured = {} _patch_pipeline_internals(monkeypatch, captured) captured.clear() render_automatic_eda("db", "t", profile_level="lite") assert captured["run_llm"] is False assert captured["run_series"] is False assert captured["sample"] == 2000