test(eda): tests del capítulo ANÁLISIS LLM (golden + edges + anti-cortes)

Suite self-contained (perfil sintético + un golden, sin DuckDB): - golden: build_analisis_llm devuelve el Chapter y el documento entero renderiza a PDF y PPTX con resumen, análisis sugeridos, limpieza y una columna del diccionario presentes. - orden: el capítulo queda inmediatamente después de `overview`. - edges: profile sin bloque `llm` (o None/{}/malformado/llm vacío) -> None sin lanzar; fallback a ctx['llm']. - anti-cortes: diccionario de 40 filas + sugerencia de limpieza de ~150 chars se reparten en varias páginas/slides sin perder ninguna fila ni palabra. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 15:01:26 +02:00
parent fc5bc334c8
commit af1dd9bcc2
1 changed files with 190 additions and 0 deletions
@@ -0,0 +1,190 @@
 """Tests for the ANÁLISIS LLM chapter — DoD: golden + edges + anti-cut.
 Self-contained: builds a synthetic TableProfile carrying an ``llm`` block (the
 shape ``eda_llm_insights`` produces) so the suite is fast and deterministic — no
 DuckDB and no LLM call. Verifies:
 * golden — ``build_analisis_llm`` yields the chapter and the full document
  renders to PDF *and* PPTX with the summary, a suggested analysis, a cleaning
  suggestion and a dictionary column all present;
 * order — the chapter sits immediately after ``overview`` (user requirement);
 * edges — a profile with no ``llm`` block (or None/empty/malformed) returns
  ``None`` and never raises, and an ``llm`` block passed via ``ctx`` is used
  when the profile has none;
 * anti-cut — a long dictionary (40 rows) and a 150-char cleaning suggestion are
  rendered to PDF and PPTX without losing a single row or word.
 """
 import os
 import re
 import tempfile
 from pypdf import PdfReader
 from pptx import Presentation
 from datascience.automatic_eda.chapters.analisis_llm import (
    build_analisis_llm, CHAPTER_VERSION)
 from datascience.automatic_eda.chapters_registry import build_document
 from datascience.automatic_eda.model import Chapter, DataTable
 from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
 from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
 def _profile() -> dict:
    return {
        "table": "ventas",
        "source": "/data/ventas.csv",
        "profiled_at": "2026-06-30T10:00:00+00:00",
        "n_rows": 1000,
        "n_cols": 2,
        "quality_score": 92.5,
        "columns": [
            {"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
             "null_count": 0,
             "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0,
                         "max": 100.0, "std": 12.3}},
            {"name": "categoria", "inferred_type": "categorical",
             "null_pct": 0.0, "null_count": 0,
             "categorical": {"top": [{"value": "neumaticos", "count": 500}]}},
        ],
        "llm": {
            "summary": "Tabla de ventas por producto. Token SUMMARYTOKEN.",
            "row_meaning": "Cada fila es una venta. Token ROWTOKEN.",
            "dictionary": [
                {"column": "precio", "description": "Precio unitario DESCTOKEN",
                 "business_meaning": "Ingreso por unidad", "unit": "EUR"},
                {"column": "categoria", "description": "Familia de producto",
                 "business_meaning": "Segmento comercial", "unit": ""},
            ],
            "pii": [{"column": "categoria", "kind": "ninguno", "severity": "low"}],
            "cleaning": ["Quitar nulos de precio CLEANTOKEN",
                         "Normalizar mayusculas en categoria"],
            "analyses": ["Estudiar relacion precio-categoria ANALYSISTOKEN",
                         "Detectar outliers de precio"],
        },
    }
 def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at *path* as one string.

    Page texts are concatenated in page order (a page with no extractable
    text contributes an empty string) and every run of whitespace is then
    collapsed to a single space, so substring checks do not depend on where
    the extractor placed line breaks.
    """
    page_texts = [page.extract_text() or "" for page in PdfReader(path).pages]
    joined = "".join(page_texts)
    return re.sub(r"\s+", " ", joined)
 def _pptx_text(path: str) -> str:
    """Return the text of every text frame and table cell in the PPTX at *path*.

    Shapes are visited slide by slide; for each shape the text-frame text (if
    any) is collected first, then the table cells in row-major order (if the
    shape holds a table). The fragments are joined with spaces and every run
    of whitespace is collapsed to a single space.
    """
    fragments = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                fragments.append(shape.text_frame.text)
            if not shape.has_table:
                continue
            table = shape.table
            n_rows, n_cols = len(table.rows), len(table.columns)
            fragments.extend(
                table.cell(row_idx, col_idx).text
                for row_idx in range(n_rows)
                for col_idx in range(n_cols)
            )
    return re.sub(r"\s+", " ", " ".join(fragments))
 def test_golden_build_y_render_pdf_pptx():
    """Golden path: the chapter builds and the document renders to PDF and PPTX.

    First checks the ``Chapter`` returned by ``build_analisis_llm`` (id,
    version, non-empty blocks). Then renders the full synthetic profile to PDF
    and to PPTX and, for each output, checks the reported path, that the file
    exists, that ``analisis_llm`` is among the reported chapters, and that the
    marker tokens planted by ``_profile`` appear in the extracted text.
    """
    profile = _profile()
    chapter = build_analisis_llm(profile, {})
    assert chapter is not None
    assert chapter.id == "analisis_llm"
    assert chapter.version == CHAPTER_VERSION
    assert chapter.blocks  # non-empty.

    # The user's required content: summary, suggested analyses, cleaning —
    # plus one data-dictionary cell.
    markers = ("SUMMARYTOKEN", "ANALYSISTOKEN", "CLEANTOKEN", "DESCTOKEN")
    # (output file name, renderer, text extractor) — PDF first, then PPTX.
    targets = (
        ("eda.pdf", render_automatic_eda_pdf, _pdf_text),
        ("eda.pptx", render_automatic_eda_pptx, _pptx_text),
    )
    with tempfile.TemporaryDirectory() as workdir:
        for filename, render, extract in targets:
            out_path = os.path.join(workdir, filename)
            result = render(profile, out_path, {"title": "EDA — ventas"})
            assert result["path"] == out_path
            assert os.path.exists(out_path)
            rendered_ids = [entry["id"] for entry in result["chapters"]]
            assert "analisis_llm" in rendered_ids
            text = extract(out_path)
            for marker in markers:
                assert marker in text
 def test_orden_capitulo_junto_a_overview():
    """The ``analisis_llm`` chapter is placed directly after ``overview``.

    Builds the whole document from the synthetic profile and compares the
    positions of the two chapter ids.
    """
    chapter_ids = [chapter.id for chapter in build_document(_profile(), {})]
    assert "overview" in chapter_ids
    assert "analisis_llm" in chapter_ids
    # User requirement: the LLM chapter sits right after overview.
    overview_pos = chapter_ids.index("overview")
    assert chapter_ids.index("analisis_llm") == overview_pos + 1
 def test_edge_sin_llm_devuelve_none():
    """``build_analisis_llm`` returns ``None`` when there is no usable ``llm``.

    Covers, in order: a full profile with the ``llm`` key removed, ``None``
    inputs, an empty profile, an empty ``llm`` dict, a non-dict ``llm`` value,
    and an ``llm`` block whose fields are all empty. Any exception raised by
    the builder fails the test.
    """
    # No llm block at all.
    profile_without_llm = _profile()
    del profile_without_llm["llm"]
    # All-empty fields → omitted (no blocks).
    all_empty = {"llm": {"summary": "", "dictionary": [], "cleaning": [],
                         "analyses": [], "pii": [], "row_meaning": ""}}
    # (profile, ctx) pairs; None / empty / malformed never raise and yield None.
    cases = (
        (profile_without_llm, {}),
        (None, None),
        ({}, {}),
        ({"llm": {}}, {}),
        ({"llm": "not-a-dict"}, {}),
        (all_empty, {}),
    )
    for profile, ctx in cases:
        assert build_analisis_llm(profile, ctx) is None
 def test_edge_llm_via_ctx_fallback():
    """The ``llm`` block is taken from ``ctx`` when the profile carries none.

    Besides checking that a chapter is produced, the chapter is rendered on
    its own to PDF and the ``CTXTOKEN`` marker planted in the ctx summary must
    appear in the extracted text. Without that check the marker was dead data
    and the test could not tell whether the ctx content was actually used.
    """
    # The block may arrive in ctx instead of the profile.
    prof = {k: v for k, v in _profile().items() if k != "llm"}
    ctx = {"llm": {"summary": "Resumen via ctx CTXTOKEN."}}
    ch = build_analisis_llm(prof, ctx)
    assert ch is not None
    assert ch.id == "analisis_llm"
    with tempfile.TemporaryDirectory() as d:
        out_pdf = os.path.join(d, "ctx.pdf")
        # Render only this chapter, without a manifest, so the extracted text
        # comes from the ctx-supplied block alone.
        render_automatic_eda_pdf([ch], out_pdf, {"write_manifest": False})
        assert "CTXTOKEN" in _pdf_text(out_pdf)
 def test_anti_cortes_diccionario_largo_y_limpieza_larga():
    """Anti-cut: a 40-row dictionary and a long cleaning line survive rendering.

    Builds a chapter whose dictionary has 40 rows, each description ending in
    a unique ``fila<i>`` marker, plus one long cleaning suggestion. The chapter
    is rendered on its own to PDF and PPTX and the test checks that:

    * the dictionary ``DataTable`` keeps all 40 rows;
    * the output spans more than one page / slide;
    * sampled words of the cleaning suggestion are present;
    * every ``fila<i>`` marker is present in both outputs, i.e. no row was
      dropped and no description lost its tail.
    """
    n_rows = 40
    long_clean = ("Lorem ipsum dolor sit amet consectetur adipiscing elit sed do "
                  "eiusmod tempor incididunt ut labore et dolore magna aliqua "
                  "reprehenderit voluptate velit esse cillum dolore")
    dictionary = [
        {"column": f"col_{i}",
         "description": f"Descripcion larga numero {i} con bastante texto para "
                        f"forzar el wrap dentro de la celda fila{i}",
         "business_meaning": f"Significado de negocio {i}", "unit": "u"}
        for i in range(n_rows)
    ]
    prof = {
        "table": "t", "n_rows": 1, "n_cols": 1, "columns": [],
        "llm": {"summary": "S", "dictionary": dictionary,
                "cleaning": [long_clean], "analyses": ["A"]},
    }
    # Row indices recovered from the ``fila<i>`` markers must cover 0..39.
    # Capturing the whole digit run keeps ``fila1`` distinct from ``fila10``.
    expected_rows = {str(i) for i in range(n_rows)}
    ch = build_analisis_llm(prof, {})
    assert ch is not None
    # Structure: the dictionary DataTable keeps ALL 40 rows — none dropped on
    # construction (the renderers then split it by rows, repeating the header).
    dts = [b for b in ch.blocks if isinstance(b, DataTable)]
    assert any(len(dt.rows) == n_rows for dt in dts)
    with tempfile.TemporaryDirectory() as d:
        out_pdf = os.path.join(d, "x.pdf")
        render_automatic_eda_pdf([ch], out_pdf, {"write_manifest": False})
        # 40 wide rows + a long cleaning line cannot fit one page → it spills,
        # which is exactly the no-cut behaviour (paginate, never truncate).
        assert len(PdfReader(out_pdf).pages) > 1
        txt = _pdf_text(out_pdf)
        # The long cleaning suggestion is wrapped word-by-word, not truncated.
        for word in ("Lorem", "incididunt", "reprehenderit", "voluptate", "cillum"):
            assert word in txt
        # Every dictionary row must reach the rendered PDF.
        missing_pdf = expected_rows - set(re.findall(r"fila(\d+)", txt))
        assert not missing_pdf, (
            f"PDF lost dictionary rows: {sorted(missing_pdf, key=int)}")
        out_pptx = os.path.join(d, "x.pptx")
        res2 = render_automatic_eda_pptx([ch], out_pptx, {"write_manifest": False})
        assert res2["n_slides"] > 1  # table + long text spill across slides.
        ptx = _pptx_text(out_pptx)
        for word in ("Lorem", "reprehenderit", "voluptate"):
            assert word in ptx
        # Every dictionary row must reach the rendered PPTX as well.
        missing_pptx = expected_rows - set(re.findall(r"fila(\d+)", ptx))
        assert not missing_pptx, (
            f"PPTX lost dictionary rows: {sorted(missing_pptx, key=int)}")