fn_registry/python/functions/pipelines/render_automatic_eda_test.py

"""Test del pipeline render_automatic_eda — EDA completo a PDF + PPTX.

Self-contained: crea un DuckDB temporal pequeño con categóricas + fecha + lat/lon
+ varias numéricas, corre el pipeline (sin LLM) y verifica que emite PDF y PPTX
con páginas/slides, manifest, y que los capítulos dependientes de ctx quedan
POBLADOS (sin la nota de degradación).
"""

import os
import sys

# Two directories above this file is python/functions; put it at the front of
# sys.path (once) so the `pipelines` package imported below can be resolved.
_HERE = os.path.split(os.path.abspath(__file__))[0]
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, os.pardir, os.pardir))
if _FUNCTIONS not in sys.path:
    sys.path.insert(0, _FUNCTIONS)

import duckdb  # noqa: E402

from pipelines.render_automatic_eda import render_automatic_eda  # noqa: E402


def _make_db(path):
    """Create a DuckDB database at *path* with a small synthetic ``sales`` table.

    The table holds 180 rows, one per consecutive day starting 2024-01-01,
    with columns ``d`` (date), ``region`` and ``channel`` (categorical),
    ``amount`` and ``units`` (numeric) and ``lat``/``lon`` (coordinates offset
    from a fixed center per region). All values are derived from the row index,
    so the content is deterministic.

    The connection is always closed, even when creating the table or inserting
    the rows raises, so a failed setup does not leave the file held open.
    """
    from datetime import date, timedelta

    regions = ["norte", "sur", "este"]
    channels = ["web", "tienda"]
    centers = {"norte": (43.0, -3.0), "sur": (37.0, -5.0), "este": (39.5, -0.4)}
    d0 = date(2024, 1, 1)

    # Build the rows before touching the database: this part is pure Python.
    rows = []
    for i in range(180):
        r = regions[i % len(regions)]
        ch = channels[i % len(channels)]
        clat, clon = centers[r]
        rows.append((
            d0 + timedelta(days=i), r, ch,
            round(100 + (i % 7) * 13.5 + (5 if ch == "web" else 0), 2),
            10 + (i % 5),
            round(clat + (i % 3) * 0.1, 4),
            round(clon + (i % 4) * 0.1, 4),
        ))

    con = duckdb.connect(path)
    try:
        con.execute(
            "CREATE TABLE sales (d DATE, region VARCHAR, channel VARCHAR, "
            "amount DOUBLE, units INTEGER, lat DOUBLE, lon DOUBLE)"
        )
        con.executemany("INSERT INTO sales VALUES (?,?,?,?,?,?,?)", rows)
    finally:
        # Release the file handle whether or not the statements succeeded.
        con.close()


def test_pipeline_emits_pdf_and_pptx_with_chapters(tmp_path):
    """Run the pipeline on the synthetic table and check its output artifacts."""
    db_file = str(tmp_path / "sales.duckdb")
    _make_db(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file,
        "sales",
        run_models=True,
        run_series=True,
        run_llm=False,
        out_dir=out_dir,
        basename="test_sales",
    )
    assert result["status"] == "ok", result.get("error")

    # Each report format must be reported and present on disk ...
    for path_key in ("pdf_path", "pptx_path"):
        assert result[path_key] and os.path.exists(result[path_key])
    # ... and must report at least one page / slide (None counts as zero).
    for count_key in ("n_pages", "n_slides"):
        assert (result[count_key] or 0) > 0

    # The per-chapter manifest file must exist as well.
    manifest = result["manifest_path"]
    assert manifest and os.path.exists(manifest)


def test_pipeline_chapters_populated_not_degraded(tmp_path):
    """Check that the manifest lists the four ctx-dependent chapter ids."""
    import json

    db_file = str(tmp_path / "sales.duckdb")
    _make_db(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file,
        "sales",
        run_models=True,
        run_series=True,
        run_llm=False,
        out_dir=out_dir,
        basename="t2",
    )
    assert result["status"] == "ok"

    # Read the manifest back and pull out its chapter listing; a missing or
    # empty "chapters" entry is treated as no chapters at all.
    with open(result["manifest_path"], encoding="utf-8") as manifest_file:
        manifest = json.loads(manifest_file.read())
    chapters = manifest.get("chapters") or {}

    expected_ids = ("modelos", "timeseries", "geospatial", "agregacion")
    for cid in expected_ids:
        assert cid in chapters, f"capítulo {cid} ausente del manifest: {list(chapters)}"


def test_pipeline_bad_db_degrades_without_raising(tmp_path):
    """A database path that was never created yields an error result, not an exception."""
    missing_db = str(tmp_path / "nope.duckdb")
    out_dir = str(tmp_path / "o")

    result = render_automatic_eda(missing_db, "ghost", out_dir=out_dir)

    assert result["status"] == "error"
    assert "error" in result