# fn_registry/python/functions/pipelines/render_automatic_eda_test.py

"""Test del pipeline render_automatic_eda — EDA completo a PDF + PPTX.

Self-contained: crea un DuckDB temporal pequeño con categóricas + fecha + lat/lon
+ varias numéricas, corre el pipeline (sin LLM) y verifica que emite PDF y PPTX
con páginas/slides, manifest, y que los capítulos dependientes de ctx quedan
POBLADOS (sin la nota de degradación).
"""

import os
import sys

_HERE = os.path.dirname(os.path.abspath(__file__))
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", ".."))  # python/functions
if _FUNCTIONS not in sys.path:
    sys.path.insert(0, _FUNCTIONS)

import duckdb  # noqa: E402

from pipelines.render_automatic_eda import render_automatic_eda  # noqa: E402


def _make_db(path):
    """Create a DuckDB database at *path* with a small synthetic ``sales`` table.

    The table holds 180 rows, one per consecutive day starting 2024-01-01, with
    a date column, two categorical columns (``region``, ``channel``), two
    numeric measures (``amount``, ``units``) and a ``lat``/``lon`` pair offset
    from a per-region centre. Every value is derived from the row index, so the
    content is fully deterministic.

    Args:
        path: Filesystem path of the DuckDB database file to open/create.
    """
    from datetime import date, timedelta

    regions = ["norte", "sur", "este"]
    channels = ["web", "tienda"]
    centers = {"norte": (43.0, -3.0), "sur": (37.0, -5.0), "este": (39.5, -0.4)}
    d0 = date(2024, 1, 1)

    # Build the rows before touching the database so the connection is held
    # only for the DDL and the bulk insert.
    rows = []
    for i in range(180):
        region = regions[i % 3]
        channel = channels[i % 2]
        clat, clon = centers[region]
        rows.append((
            d0 + timedelta(days=i), region, channel,
            round(100 + (i % 7) * 13.5 + (5 if channel == "web" else 0), 2),
            10 + (i % 5),
            round(clat + (i % 3) * 0.1, 4),
            round(clon + (i % 4) * 0.1, 4),
        ))

    con = duckdb.connect(path)
    try:
        con.execute(
            "CREATE TABLE sales (d DATE, region VARCHAR, channel VARCHAR, "
            "amount DOUBLE, units INTEGER, lat DOUBLE, lon DOUBLE)"
        )
        con.executemany("INSERT INTO sales VALUES (?,?,?,?,?,?,?)", rows)
    finally:
        # Always release the handle, even if the DDL or the insert fails, so a
        # broken fixture never leaves the database file held open.
        con.close()


def test_pipeline_emits_pdf_and_pptx_with_chapters(tmp_path):
    """A full run (models + series, no LLM) writes a PDF, a PPTX and a manifest."""
    db_file = str(tmp_path / "sales.duckdb")
    _make_db(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file, "sales",
        run_models=True, run_series=True, run_llm=False,
        out_dir=out_dir, basename="test_sales",
    )
    assert result["status"] == "ok", result.get("error")

    # Both output formats must exist on disk.
    for path_key in ("pdf_path", "pptx_path"):
        artifact = result[path_key]
        assert artifact and os.path.exists(artifact)

    # Each format must contain at least one page / slide.
    for count_key in ("n_pages", "n_slides"):
        assert (result[count_key] or 0) > 0

    # The manifest path reported by the pipeline must exist as well.
    manifest = result["manifest_path"]
    assert manifest and os.path.exists(manifest)


def test_pipeline_chapters_populated_not_degraded(tmp_path):
    """The four ctx-dependent chapters are listed in the manifest.

    Runs the pipeline with models and series enabled (LLM off) and checks that
    ``modelos``, ``timeseries``, ``geospatial`` and ``agregacion`` all appear
    under the manifest's ``chapters`` entry.

    NOTE(review): only presence in the manifest is asserted here; that a listed
    chapter was rendered without the degradation note is presumably guaranteed
    by the renderer — confirm against its implementation.
    """
    import json

    db = str(tmp_path / "sales.duckdb")
    _make_db(db)
    out = str(tmp_path / "out")
    r = render_automatic_eda(db, "sales", run_models=True, run_series=True,
                             run_llm=False, out_dir=out, basename="t2")
    # Surface the pipeline's own error on failure, like the sibling tests do.
    assert r["status"] == "ok", r.get("error")

    # The manifest must list every ctx-dependent chapter.
    with open(r["manifest_path"], encoding="utf-8") as fh:
        man = json.load(fh)
    chapters = man.get("chapters") or {}
    for cid in ("modelos", "timeseries", "geospatial", "agregacion"):
        assert cid in chapters, f"capítulo {cid} ausente del manifest: {list(chapters)}"


def test_pipeline_bad_db_degrades_without_raising(tmp_path):
    """A database path that was never created yields an error result, not an exception."""
    missing_db = str(tmp_path / "nope.duckdb")
    out_dir = str(tmp_path / "o")

    result = render_automatic_eda(missing_db, "ghost", out_dir=out_dir)

    assert result["status"] == "error"
    assert "error" in result


# --------------------------------------------------------------------------- #
# profile_level: preset de bajo consumo CPU/LLM.
# --------------------------------------------------------------------------- #
def _make_db_models(path):
    """Create a DuckDB database at *path* with a ``pts`` table of 3 gaussian clusters.

    The table holds 150 rows: a daily date starting 2024-01-01, a group label
    (``g0``/``g1``/``g2``) and three continuous numeric columns ``x1``..``x3``
    drawn as unit-variance gaussian noise around one of three fixed centres.

    NOTE (from the original author): the ``sales`` table built by ``_make_db``
    leaves only ONE model column after feature selection (``units`` is low
    cardinality, ``lat``/``lon`` are discretised), which is not enough for
    PCA/KMeans/IsolationForest (they need >= 2). This table has three
    continuous numerics with cluster structure so that the full mode runs the
    multivariate models.

    Args:
        path: Filesystem path of the DuckDB database file to open/create.
    """
    import random
    from datetime import date, timedelta

    # A private generator seeded with 42 yields the same gauss() sequence as
    # random.seed(42) on the module-level generator, without reseeding the
    # process-wide RNG that other tests may rely on.
    rng = random.Random(42)
    centers = [(0.0, 0.0, 0.0), (10.0, 10.0, 10.0), (20.0, 5.0, 15.0)]
    d0 = date(2024, 1, 1)

    rows = []
    for i in range(150):
        cx, cy, cz = centers[i % 3]
        # Draw order (x1, x2, x3) is part of the fixture's determinism.
        rows.append((
            d0 + timedelta(days=i), f"g{i % 3}",
            round(cx + rng.gauss(0, 1.0), 4),
            round(cy + rng.gauss(0, 1.0), 4),
            round(cz + rng.gauss(0, 1.0), 4),
        ))

    con = duckdb.connect(path)
    try:
        con.execute(
            "CREATE TABLE pts (d DATE, grp VARCHAR, x1 DOUBLE, x2 DOUBLE, x3 DOUBLE)"
        )
        con.executemany("INSERT INTO pts VALUES (?,?,?,?,?)", rows)
    finally:
        # Always release the handle, even if the DDL or the insert fails.
        con.close()


def test_profile_level_lite_skips_expensive_models(tmp_path):
    """lite: the models block carries PCA + normality but NOT KMeans/IsolationForest.

    Checks on the returned profile that ``models`` has ``pca`` and
    ``normality`` populated while ``kmeans`` and ``outliers`` are None, that
    ``llm`` and ``series`` are None, and that the ``modelos`` chapter is still
    listed in the manifest.
    """
    import json

    db_file = str(tmp_path / "pts.duckdb")
    _make_db_models(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file, "pts", profile_level="lite", out_dir=out_dir, basename="lite",
    )
    assert result["status"] == "ok", result.get("error")

    profile = result["profile"] or {}
    models = profile.get("models") or {}

    # Cheap analyses must be present ...
    for key, message in (
        ("pca", "lite debe traer PCA"),
        ("normality", "lite debe traer normalidad"),
    ):
        assert models.get(key) is not None, message

    # ... while the expensive models must have been skipped.
    for key, message in (
        ("kmeans", "lite NO debe ejecutar KMeans"),
        ("outliers", "lite NO debe ejecutar IsolationForest"),
    ):
        assert models.get(key) is None, message

    # Neither the LLM layer nor the time series is computed in lite mode.
    for key, message in (
        ("llm", "lite NO debe llamar al LLM"),
        ("series", "lite NO debe calcular serie"),
    ):
        assert profile.get(key) is None, message

    # The modelos chapter is still listed in the manifest.
    with open(result["manifest_path"], encoding="utf-8") as manifest_file:
        manifest = json.load(manifest_file)
    chapters = manifest.get("chapters") or {}
    assert "modelos" in chapters


def test_profile_level_standard_runs_full_models(tmp_path):
    """standard: PCA, KMeans, IsolationForest and the series are all populated."""
    db_file = str(tmp_path / "pts.duckdb")
    _make_db_models(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file, "pts", profile_level="standard", out_dir=out_dir, basename="std",
    )
    assert result["status"] == "ok", result.get("error")

    profile = result["profile"] or {}
    models = profile.get("models") or {}

    assert models.get("pca") is not None
    for key, message in (
        ("kmeans", "standard debe ejecutar KMeans"),
        ("outliers", "standard debe ejecutar IsolationForest"),
    ):
        assert models.get(key) is not None, message
    assert profile.get("series") is not None, "standard calcula serie"


def _patch_pipeline_internals(monkeypatch, captured):
    """Replace the pipeline's collaborators with recording stubs.

    Patches ``profile_table``, ``build_eda_render_ctx`` and the PDF/PPTX
    renderers on the ``pipelines.render_automatic_eda`` module with stubs that
    return canned results. The stubs store in *captured* the cost-related
    keyword arguments they receive, so tests can check how flags and presets
    were resolved without running the real EDA.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture, used to install the stubs.
        captured: Dict filled in by the stubs with ``run_llm``, ``run_models``,
            ``run_series``, ``sample`` (from ``profile_table``) and ``base_ctx``
            (from ``build_eda_render_ctx``); a kwarg that is not passed is
            recorded as None.
    """
    import pipelines.render_automatic_eda as mod

    recorded_flags = ("run_llm", "run_models", "run_series", "sample")

    def fake_profile_table(db_path, table, **kw):
        # Record each cost flag exactly as the pipeline forwarded it.
        for flag in recorded_flags:
            captured[flag] = kw.get(flag)
        return {"status": "ok", "profile": {"columns": []}}

    def fake_ctx(db_path, table, prof, **kw):
        captured["base_ctx"] = kw.get("base_ctx")
        return {}

    def fake_pdf(*args, **kwargs):
        return {"path": "x.pdf", "n_pages": 1, "manifest_path": "m.json"}

    def fake_pptx(*args, **kwargs):
        return {"path": "x.pptx", "n_slides": 1}

    replacements = (
        ("profile_table", fake_profile_table),
        ("build_eda_render_ctx", fake_ctx),
        ("render_automatic_eda_pdf", fake_pdf),
        ("render_automatic_eda_pptx", fake_pptx),
    )
    for attr_name, stub in replacements:
        monkeypatch.setattr(mod, attr_name, stub)


def test_explicit_flag_overrides_preset(monkeypatch):
    """Precedence: ``profile_level='lite'`` with explicit ``run_llm=True`` keeps the LLM on.

    The caller's explicit flag must win over the preset default. This is
    checked on the ``run_llm`` kwarg that reaches ``profile_table`` and on the
    ``run_cluster_llm`` entry of the ``base_ctx`` handed to the ctx builder.
    """
    seen = {}
    _patch_pipeline_internals(monkeypatch, seen)
    seen.clear()  # start from an empty record right before the call under test

    render_automatic_eda("db", "t", profile_level="lite", run_llm=True)

    assert seen["run_llm"] is True, "flag explícito debe primar sobre preset lite"
    base_ctx = seen["base_ctx"] or {}
    assert base_ctx.get("run_cluster_llm") is True


def test_full_preset_enables_llm(monkeypatch):
    """full: the preset resolves ``run_llm=True`` and sets ``run_cluster_llm`` in the ctx."""
    seen = {}
    _patch_pipeline_internals(monkeypatch, seen)
    seen.clear()  # start from an empty record right before the call under test

    render_automatic_eda("db", "t", profile_level="full")

    assert seen["run_llm"] is True
    base_ctx = seen["base_ctx"] or {}
    assert base_ctx.get("run_cluster_llm") is True


def test_no_profile_level_defaults_to_standard(monkeypatch):
    """Backward compatibility: with no ``profile_level`` and no flags, defaults are unchanged.

    Expected resolved values: ``run_models`` True, ``run_series`` True,
    ``run_llm`` False, ``sample`` 5000. Per the original author's note these
    are the defaults the pipeline had before ``profile_level`` was introduced,
    so existing calls keep their behaviour.
    """
    seen = {}
    _patch_pipeline_internals(monkeypatch, seen)
    seen.clear()  # start from an empty record right before the call under test

    # No profile_level and no cost flags at all.
    render_automatic_eda("db", "t")

    expected_flags = (
        ("run_models", True),
        ("run_series", True),
        ("run_llm", False),
    )
    for flag, expected in expected_flags:
        assert seen[flag] is expected
    assert seen["sample"] == 5000


def test_lite_preset_defaults(monkeypatch):
    """lite defaults: ``run_llm`` and ``run_series`` are False and ``sample`` drops to 2000."""
    seen = {}
    _patch_pipeline_internals(monkeypatch, seen)
    seen.clear()  # start from an empty record right before the call under test

    render_automatic_eda("db", "t", profile_level="lite")

    for flag in ("run_llm", "run_series"):
        assert seen[flag] is False
    assert seen["sample"] == 2000