test(eda): tests del capítulo ANÁLISIS LLM (golden + edges + anti-cortes)

Suite self-contained (perfil sintético + un golden, sin DuckDB): - golden: build_analisis_llm devuelve el Chapter y el documento entero renderiza a PDF y PPTX con resumen, análisis sugeridos, limpieza y una columna del diccionario presentes. - orden: el capítulo queda inmediatamente después de `overview`. - edges: profile sin bloque `llm` (o None/{}/malformado/llm vacío) -> None sin lanzar; fallback a ctx['llm']. - anti-cortes: diccionario de 40 filas + sugerencia de limpieza de ~150 chars se reparten en varias páginas/slides sin perder ninguna fila ni palabra. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
feat(eda): capítulo ANÁLISIS LLM para AutomaticEDA, junto al overview
2026-06-30 15:01:26 +02:00 · 2026-06-30 15:01:26 +02:00
5 changed files with 412 additions and 528 deletions
@@ -0,0 +1,221 @@
+"""LLM analysis chapter (ANÁLISIS LLM) — the interpretive layer, next to overview.
+
+Third reference chapter for AutomaticEDA. Renders the ``llm`` block that the
+``eda`` group function ``eda_llm_insights`` already produced and stored in the
+``TableProfile`` — it does NOT call the LLM nor recompute anything. The block is
+turned into clean, markdown-style document blocks so it reads as a real chapter
+(table summary, row meaning, data dictionary, suggested analyses, cleaning
+suggestions, PII findings) and, crucially, **nothing is ever cut** in PDF or
+PPTX:
+
+* Prose (summary, row meaning) → ``Markdown`` blocks the renderers wrap to whole
+  lines, so no word is lost no matter how long the text is.
+* The data dictionary and PII findings → ``DataTable`` blocks the paginator
+  splits by rows (repeating the header) and whose long cells wrap inside their
+  column — wide, multi-row tables never overflow a page/slide.
+* Cleaning suggestions and suggested analyses → ``Markdown`` bullet lists; each
+  item is a whole line the renderer wraps, never truncated mid-entry.
+
+Position: this chapter is declared in ``chapters_registry.CHAPTER_ORDER`` right
+after ``overview`` so the interpretation sits next to the table preview, as the
+user asked ("va junto al overview").
+
+Data source: the ``llm`` dict produced by ``eda_llm_insights`` (group ``eda``),
+read from ``profile['llm']`` (or ``ctx['llm']`` as a fallback). Shape::
+
+    {
+      "summary": str,            # what the table is, 2-3 sentences
+      "row_meaning": str,        # what one row represents / granularity
+      "dictionary": [ {"column","description","business_meaning","unit"} ],
+      "pii": [ {"column","kind","severity"} ],
+      "cleaning": [str],         # cleaning / transformation suggestions
+      "analyses": [str],         # suggested questions / analyses / hypotheses
+    }
+
+Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
+Reads everything defensively (``.get``) and NEVER raises; returns ``None`` when
+the profile carries no LLM block (e.g. ``profile_table`` ran without
+``run_llm``), so the chapter is simply omitted from the document.
+"""
+
+from __future__ import annotations
+
+from .. import model
+
+# Version string stamped on the Chapter built by build_analisis_llm.
+CHAPTER_VERSION = "1.0.0"
+# Identifier and human-readable title passed to model.Chapter.
+CHAPTER_ID = "analisis_llm"
+CHAPTER_TITLE = "Análisis LLM"
+
+# Key under which eda_llm_insights stores its interpretive block in the profile.
+LLM_KEY = "llm"
+
+
+def _clean_text(value) -> str:
+    """Flatten a value into one whitespace-normalized line.
+
+    Bullet items and table cells go through here so each one stays a single
+    unit for the renderer to wrap: inner newlines and runs of blanks collapse
+    to single spaces and no content is dropped. Empty input yields "".
+    """
+    # str.split() with no arguments trims both ends and splits on any run of
+    # whitespace, so re-joining the pieces normalizes everything in one pass.
+    words = model._safe_str(value).split()
+    return " ".join(words)
+
+
+def _para(value) -> str:
+    """Coerce a value to trimmed prose while keeping its line structure.
+
+    Each line has its inner whitespace collapsed to single spaces. Blank lines
+    are kept as paragraph separators, but a run of several blank lines is
+    reduced to one. Returns "" when the value holds no text.
+    """
+    raw = model._safe_str(value).strip()
+    if not raw:
+        return ""
+    kept: list = []
+    for line in raw.splitlines():
+        squeezed = " ".join(line.split())
+        # Drop a blank line that would open the text or repeat the previous
+        # blank line; every other line (blank or not) is preserved.
+        if not squeezed and (not kept or kept[-1] == ""):
+            continue
+        kept.append(squeezed)
+    return "\n".join(kept).strip()
+
+
+def _bullets(items) -> str:
+    """Render a sequence of strings as a markdown bullet list.
+
+    Every non-empty item becomes one ``- ...`` line after being flattened by
+    ``_clean_text``. A lone string is treated as a one-item list; any other
+    non-list/tuple input, or a list with no usable items, yields "".
+    """
+    if isinstance(items, str):
+        items = [items]
+    elif not isinstance(items, (list, tuple)):
+        return ""
+    cleaned = (_clean_text(entry) for entry in items)
+    return "\n".join(f"- {text}" for text in cleaned if text)
+
+
+def _summary_blocks(llm: dict) -> list:
+    """Return [Heading, Markdown] for the table summary, or [] when empty."""
+    summary = _para(llm.get("summary"))
+    if summary:
+        heading = model.Heading(text="Resumen de la tabla", level=2)
+        return [heading, model.Markdown(text=summary)]
+    return []
+
+
+def _row_meaning_blocks(llm: dict) -> list:
+    """Return [Heading, Markdown] describing one row, or [] when empty."""
+    meaning = _para(llm.get("row_meaning"))
+    if meaning:
+        heading = model.Heading(text="Significado de una fila", level=2)
+        return [heading, model.Markdown(text=meaning)]
+    return []
+
+
+def _dictionary_block(llm: dict):
+    """DataTable for the data dictionary, or None if absent/empty.
+
+    Columns: Columna / Descripción / Significado de negocio / Unidad. Each
+    entry of ``llm['dictionary']`` becomes one row whose cells are flattened
+    with ``_clean_text``; a missing column name is shown as "—". A non-dict
+    entry (e.g. a bare string) is tolerated and shown as a description-only
+    row. Entries that carry no text at all are skipped, so a dictionary made
+    only of empty entries returns None instead of a table of "—" placeholders.
+    """
+    entries = llm.get("dictionary")
+    if not isinstance(entries, (list, tuple)) or not entries:
+        return None
+    header = ["Columna", "Descripción", "Significado de negocio", "Unidad"]
+    fields = ("column", "description", "business_meaning", "unit")
+    rows = []
+    for e in entries:
+        if isinstance(e, dict):
+            cells = [_clean_text(e.get(k)) for k in fields]
+        else:
+            # Be tolerant: a bare string still shows up as a description row.
+            cells = ["", _clean_text(e), "", ""]
+        if not any(cells):
+            # Nothing to show for this entry; a lone "—" row would only make
+            # an otherwise empty LLM block look like it had content.
+            continue
+        cells[0] = cells[0] or "—"
+        rows.append(cells)
+    if not rows:
+        return None
+    return model.DataTable(header=header, rows=rows, title="Diccionario de datos")
+
+
+def _analyses_blocks(llm: dict) -> list:
+    """Return [Heading, Markdown bullets] of suggested analyses, or []."""
+    listing = _bullets(llm.get("analyses"))
+    if listing:
+        heading = model.Heading(text="Análisis sugeridos", level=2)
+        return [heading, model.Markdown(text=listing)]
+    return []
+
+
+def _cleaning_blocks(llm: dict) -> list:
+    """Return [Heading, Markdown bullets] of cleaning suggestions, or []."""
+    listing = _bullets(llm.get("cleaning"))
+    if listing:
+        heading = model.Heading(text="Limpieza sugerida", level=2)
+        return [heading, model.Markdown(text=listing)]
+    return []
+
+
+def _pii_block(llm: dict):
+    """DataTable for PII/GDPR findings, or None if absent/empty.
+
+    Columns: Columna / Tipo / Severidad. Each dict in ``llm['pii']`` becomes
+    one row with cells flattened by ``_clean_text``; a missing column name is
+    shown as "—". Non-dict entries and dicts that carry no text at all are
+    skipped, so a list of empty findings returns None rather than a table of
+    "—" placeholders.
+    """
+    entries = llm.get("pii")
+    if not isinstance(entries, (list, tuple)) or not entries:
+        return None
+    header = ["Columna", "Tipo", "Severidad"]
+    fields = ("column", "kind", "severity")
+    rows = []
+    for e in entries:
+        if not isinstance(e, dict):
+            continue
+        cells = [_clean_text(e.get(k)) for k in fields]
+        if not any(cells):
+            continue  # empty finding: nothing worth a row
+        cells[0] = cells[0] or "—"
+        rows.append(cells)
+    if not rows:
+        return None
+    return model.DataTable(
+        header=header, rows=rows, title="Datos personales (PII / RGPD)",
+        note="detección automática orientativa — revisar antes de tratar los datos")
+
+
+def build_analisis_llm(profile: dict, ctx: dict):
+    """Build the LLM analysis Chapter, or None if there is no LLM block.
+
+    Consumes ``profile['llm']`` (the block produced by ``eda_llm_insights``,
+    group ``eda``). When that block is missing, not a dict, or empty, it falls
+    back to ``ctx['llm']``. Returns ``None`` when neither source carries a
+    usable block, or when the block yields no content, so the chapter is
+    omitted rather than rendered as an empty section.
+
+    ``profile`` and ``ctx`` may be ``None`` or any non-dict value; both are
+    then treated as empty so the builder never raises on malformed input.
+
+    Block order: summary, row meaning, data dictionary, suggested analyses,
+    cleaning suggestions, PII findings.
+    """
+    # Anything that is not a dict has no ``.get``; treat it as "no data".
+    if not isinstance(profile, dict):
+        profile = {}
+    if not isinstance(ctx, dict):
+        ctx = {}
+
+    llm = profile.get(LLM_KEY)
+    if not isinstance(llm, dict) or not llm:
+        # An empty profile block is as useless as a missing one, so it must
+        # not shadow a populated block supplied through ctx.
+        llm = ctx.get(LLM_KEY)
+    if not isinstance(llm, dict) or not llm:
+        return None
+
+    blocks: list = []
+    blocks += _summary_blocks(llm)
+    blocks += _row_meaning_blocks(llm)
+
+    dict_block = _dictionary_block(llm)
+    if dict_block is not None:
+        # NOTE(review): the DataTable already carries this same text as its
+        # title — confirm the renderers do not print it twice.
+        blocks.append(model.Heading(text="Diccionario de datos", level=2))
+        blocks.append(dict_block)
+
+    blocks += _analyses_blocks(llm)
+    blocks += _cleaning_blocks(llm)
+
+    pii_block = _pii_block(llm)
+    if pii_block is not None:
+        blocks.append(model.Heading(text="Datos personales (PII / RGPD)", level=2))
+        blocks.append(pii_block)
+
+    if not blocks:
+        return None  # LLM block present but every field empty → omit chapter.
+
+    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
+                         version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,190 @@
+"""Tests for the ANÁLISIS LLM chapter — DoD: golden + edges + anti-cut.
+
+Self-contained: builds a synthetic TableProfile carrying an ``llm`` block (the
+shape ``eda_llm_insights`` produces) so the suite is fast and deterministic — no
+DuckDB and no LLM call. Verifies:
+
+* golden — ``build_analisis_llm`` yields the chapter and the full document
+  renders to PDF *and* PPTX with the summary, a suggested analysis, a cleaning
+  suggestion and a dictionary column all present;
+* order — the chapter sits immediately after ``overview`` (user requirement);
+* edges — a profile with no ``llm`` block (or None/empty/malformed) returns
+  ``None`` and never raises;
+* anti-cut — a long dictionary (40 rows) and a ~170-char cleaning suggestion
+  are rendered to PDF and PPTX without losing a single row or word.
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+from pptx import Presentation
+
+from datascience.automatic_eda.chapters.analisis_llm import (
+    build_analisis_llm, CHAPTER_VERSION)
+from datascience.automatic_eda.chapters_registry import build_document
+from datascience.automatic_eda.model import Chapter, DataTable
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _profile() -> dict:
+    """Return a fresh synthetic profile dict with a populated ``llm`` block.
+
+    Two columns (one numeric, one categorical) plus an ``llm`` block whose
+    texts embed unique marker tokens (SUMMARYTOKEN, ROWTOKEN, DESCTOKEN,
+    CLEANTOKEN, ANALYSISTOKEN) so tests can search for them in rendered output.
+    """
+    return {
+        "table": "ventas",
+        "source": "/data/ventas.csv",
+        "profiled_at": "2026-06-30T10:00:00+00:00",
+        "n_rows": 1000,
+        "n_cols": 2,
+        "quality_score": 92.5,
+        "columns": [
+            {"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
+             "null_count": 0,
+             "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0,
+                         "max": 100.0, "std": 12.3}},
+            {"name": "categoria", "inferred_type": "categorical",
+             "null_pct": 0.0, "null_count": 0,
+             "categorical": {"top": [{"value": "neumaticos", "count": 500}]}},
+        ],
+        "llm": {
+            "summary": "Tabla de ventas por producto. Token SUMMARYTOKEN.",
+            "row_meaning": "Cada fila es una venta. Token ROWTOKEN.",
+            "dictionary": [
+                {"column": "precio", "description": "Precio unitario DESCTOKEN",
+                 "business_meaning": "Ingreso por unidad", "unit": "EUR"},
+                {"column": "categoria", "description": "Familia de producto",
+                 "business_meaning": "Segmento comercial", "unit": ""},
+            ],
+            "pii": [{"column": "categoria", "kind": "ninguno", "severity": "low"}],
+            "cleaning": ["Quitar nulos de precio CLEANTOKEN",
+                         "Normalizar mayusculas en categoria"],
+            "analyses": ["Estudiar relacion precio-categoria ANALYSISTOKEN",
+                         "Detectar outliers de precio"],
+        },
+    }
+
+
+def _pdf_text(path: str) -> str:
+    """Return the text of every page of the PDF with whitespace collapsed."""
+    pages = PdfReader(path).pages
+    # extract_text() may return None/"" for a page; treat that as no text.
+    raw = "".join(page.extract_text() or "" for page in pages)
+    return re.sub(r"\s+", " ", raw)
+
+
+def _pptx_text(path: str) -> str:
+    """Return all text-frame and table-cell text of the deck, space-joined.
+
+    Slides and shapes are visited in order; table cells are read row by row,
+    left to right. Runs of whitespace are collapsed to a single space.
+    """
+    chunks = []
+    for slide in Presentation(path).slides:
+        for shape in slide.shapes:
+            if shape.has_text_frame:
+                chunks.append(shape.text_frame.text)
+            if shape.has_table:
+                table = shape.table
+                n_rows, n_cols = len(table.rows), len(table.columns)
+                chunks.extend(table.cell(r, c).text
+                              for r in range(n_rows)
+                              for c in range(n_cols))
+    return re.sub(r"\s+", " ", " ".join(chunks))
+
+
+def test_golden_build_y_render_pdf_pptx():
+    """Golden path: the chapter builds and its content reaches PDF and PPTX."""
+    prof = _profile()
+    # The user's required content: summary, suggested analyses, cleaning, plus
+    # one data-dictionary cell.
+    tokens = ("SUMMARYTOKEN", "ANALYSISTOKEN", "CLEANTOKEN", "DESCTOKEN")
+
+    chapter = build_analisis_llm(prof, {})
+    assert chapter is not None
+    assert chapter.id == "analisis_llm"
+    assert chapter.version == CHAPTER_VERSION
+    assert chapter.blocks  # non-empty.
+
+    with tempfile.TemporaryDirectory() as tmp:
+        pdf_path = os.path.join(tmp, "eda.pdf")
+        pdf_res = render_automatic_eda_pdf(prof, pdf_path,
+                                           {"title": "EDA — ventas"})
+        assert pdf_res["path"] == pdf_path and os.path.exists(pdf_path)
+        assert "analisis_llm" in [c["id"] for c in pdf_res["chapters"]]
+        pdf_text = _pdf_text(pdf_path)
+        for token in tokens:
+            assert token in pdf_text
+
+        pptx_path = os.path.join(tmp, "eda.pptx")
+        pptx_res = render_automatic_eda_pptx(prof, pptx_path,
+                                             {"title": "EDA — ventas"})
+        assert pptx_res["path"] == pptx_path and os.path.exists(pptx_path)
+        assert "analisis_llm" in [c["id"] for c in pptx_res["chapters"]]
+        pptx_text = _pptx_text(pptx_path)
+        for token in tokens:
+            assert token in pptx_text
+
+
+def test_orden_capitulo_junto_a_overview():
+    """The LLM chapter must come immediately after ``overview``."""
+    ids = [chapter.id for chapter in build_document(_profile(), {})]
+    assert "overview" in ids and "analisis_llm" in ids
+    # User requirement: the LLM chapter sits right after overview.
+    assert ids.index("analisis_llm") - ids.index("overview") == 1
+
+
+def test_edge_sin_llm_devuelve_none():
+    """Missing, empty or malformed LLM blocks yield None and never raise."""
+    without_llm = _profile()
+    without_llm.pop("llm")
+    all_empty = {"llm": {"summary": "", "dictionary": [], "cleaning": [],
+                         "analyses": [], "pii": [], "row_meaning": ""}}
+    cases = [
+        (without_llm, {}),                 # no llm block at all
+        (None, None),                      # nothing supplied
+        ({}, {}),                          # empty profile
+        ({"llm": {}}, {}),                 # empty llm block
+        ({"llm": "not-a-dict"}, {}),       # malformed llm block
+        (all_empty, {}),                   # every field empty → no blocks
+    ]
+    for prof, ctx in cases:
+        assert build_analisis_llm(prof, ctx) is None
+
+
+def test_edge_llm_via_ctx_fallback():
+    """The LLM block is taken from ctx when the profile does not carry one."""
+    # The block may arrive in ctx instead of the profile.
+    prof = {k: v for k, v in _profile().items() if k != "llm"}
+    ctx = {"llm": {"summary": "Resumen via ctx CTXTOKEN."}}
+    ch = build_analisis_llm(prof, ctx)
+    assert ch is not None and ch.id == "analisis_llm"
+    # The chapter must actually carry the ctx content, not merely exist.
+    assert any("CTXTOKEN" in str(getattr(b, "text", "")) for b in ch.blocks)
+
+
+def test_anti_cortes_diccionario_largo_y_limpieza_larga():
+    """A 40-row dictionary and a long bullet survive rendering uncut.
+
+    Checks the built chapter keeps all 40 rows, that both renderers spill onto
+    more than one page/slide, that sample words of the long cleaning line are
+    present, and that every one of the 40 column names appears in the rendered
+    PDF and PPTX text.
+    """
+    long_clean = ("Lorem ipsum dolor sit amet consectetur adipiscing elit sed do "
+                  "eiusmod tempor incididunt ut labore et dolore magna aliqua "
+                  "reprehenderit voluptate velit esse cillum dolore")
+    dictionary = [
+        {"column": f"col_{i}",
+         "description": f"Descripcion larga numero {i} con bastante texto para "
+                        f"forzar el wrap dentro de la celda fila{i}",
+         "business_meaning": f"Significado de negocio {i}", "unit": "u"}
+        for i in range(40)
+    ]
+    expected_cols = {f"col_{i}" for i in range(40)}
+    prof = {
+        "table": "t", "n_rows": 1, "n_cols": 1, "columns": [],
+        "llm": {"summary": "S", "dictionary": dictionary,
+                "cleaning": [long_clean], "analyses": ["A"]},
+    }
+    ch = build_analisis_llm(prof, {})
+    assert ch is not None
+    # Structure: the dictionary DataTable keeps ALL 40 rows — none dropped on
+    # construction (the renderers then split it by rows, repeating the header).
+    dts = [b for b in ch.blocks if isinstance(b, DataTable)]
+    assert any(len(dt.rows) == 40 for dt in dts)
+
+    with tempfile.TemporaryDirectory() as d:
+        out_pdf = os.path.join(d, "x.pdf")
+        render_automatic_eda_pdf([ch], out_pdf, {"write_manifest": False})
+        # 40 wide rows + a long cleaning line cannot fit one page → it spills,
+        # which is exactly the no-cut behaviour (paginate, never truncate).
+        assert len(PdfReader(out_pdf).pages) > 1
+        txt = _pdf_text(out_pdf)
+        # The long cleaning suggestion is wrapped word-by-word, not truncated.
+        for word in ("Lorem", "incididunt", "reprehenderit", "voluptate", "cillum"):
+            assert word in txt
+        # Every dictionary row must reach the rendered PDF, not just the model.
+        assert expected_cols <= set(re.findall(r"col_\d+", txt))
+
+        out_pptx = os.path.join(d, "x.pptx")
+        res2 = render_automatic_eda_pptx([ch], out_pptx, {"write_manifest": False})
+        assert res2["n_slides"] > 1  # table + long text spill across slides.
+        ptx = _pptx_text(out_pptx)
+        for word in ("Lorem", "reprehenderit", "voluptate"):
+            assert word in ptx
+        # Same row-completeness check on the slide deck.
+        assert expected_cols <= set(re.findall(r"col_\d+", ptx))
@@ -1,352 +0,0 @@
-"""Correlation chapter — association matrix plus top positive/negative pairs.
-
-Builds the CORRELACION chapter of an AutomaticEDA document from a TableProfile.
-It renders exactly what the user asked for:
-
-1. A correlation/association **matrix** (heatmap) reconstructed from the evaluated
-   pairs, signed for numeric-numeric pairs (Pearson/Spearman, ``[-1, 1]``) and as
-   magnitude for the mixed-type metrics (Cramér's V, correlation ratio, mutual
-   information, ``[0, 1]``). Labels are ordered by total connectivity so strong
-   associations cluster together instead of being scattered alphabetically.
-2. The **TOP positive** pairs and the **TOP negative** pairs as two separate
-   tables. Only numeric-numeric metrics carry a sign, so negative pairs are by
-   construction Pearson/Spearman; positive pairs may use any method.
-3. The methods legend and the multiple-testing (FDR) summary, so the reader sees
-   how many pairs survive the correction.
-4. A spuriousness caveat when the profile flags level-based correlations on
-   non-stationary series (Granger–Newbold).
-
-All data comes from ``profile['correlations']`` — the output of the ``eda`` group
-function ``association_matrix`` (optionally enriched by ``profile_table``). The
-chapter never recomputes any statistic; it only lays the existing values out as
-format-independent blocks. The renderers paginate tables (repeating the header)
-and scale the heatmap to fit entirely, so nothing is ever cut.
-
-Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
-"""
-
-from __future__ import annotations
-
-import math
-
-from .. import model
-
-CHAPTER_VERSION = "1.0.0"
-CHAPTER_ID = "correlacion"
-CHAPTER_TITLE = "Correlación"
-
-# Methods whose value carries a sign (direction). Everything else is a magnitude
-# in [0, 1] and therefore only ever contributes to the positive side.
-_SIGNED_METHODS = ("pearson", "spearman")
-
-# Cap the heatmap to the most-connected variables so it stays legible on a phone
-# screen / a slide. The renderer would scale a bigger matrix to fit, but the
-# cells become unreadable; we instead show the top-N and say so.
-_MAX_MATRIX_LABELS = 16
-
-# How many pairs to show in each of the top-positive / top-negative tables.
-_TOP_N = 10
-
-
-def _is_num(v) -> bool:
-    """True for a real, finite int/float (not bool, not NaN/inf)."""
-    return (
-        isinstance(v, (int, float))
-        and not isinstance(v, bool)
-        and not (isinstance(v, float) and (math.isnan(v) or math.isinf(v)))
-    )
-
-
-def _fmt_val(value, decimals: int = 2) -> str:
-    """Format an association value compactly, signed, with a fixed width feel."""
-    if not _is_num(value):
-        return "—"
-    text = f"{float(value):+.{decimals}f}"
-    # Strip a trailing -0.00 / +0.00 into a clean 0.00 for readability.
-    if text in ("+0.00", "-0.00"):
-        return "0.00"
-    return text
-
-
-def _fmt_p(value) -> str:
-    """Format an adjusted p-value; tiny values collapse to a '<' threshold."""
-    if not _is_num(value):
-        return "—"
-    p = float(value)
-    if p < 0.001:
-        return "<0.001"
-    return f"{p:.3f}"
-
-
-def _is_signed(pair: dict) -> bool:
-    """True if the pair's method reports a directional (signed) value."""
-    method = str(pair.get("method") or "").lower()
-    return any(m in method for m in _SIGNED_METHODS)
-
-
-def _significant(pair: dict) -> bool:
-    """True if the pair is significant after FDR (or has no test to correct)."""
-    if pair.get("significant") is True:
-        return True
-    # Pairs without an applicable test (p_value None) are not penalised: they are
-    # admitted on magnitude alone upstream, so treat missing as "not rejected".
-    return pair.get("p_value") is None and pair.get("significant") is None
-
-
-def _label(pair: dict) -> str:
-    """Human label for a pair, e.g. 'alcohol ↔ density'."""
-    return f"{model._safe_str(pair.get('a'))} ↔ {model._safe_str(pair.get('b'))}"
-
-
-def _split_top(pairs: list, top_n: int = _TOP_N):
-    """Split evaluated pairs into ranked top-positive and top-negative lists.
-
-    Positive: any pair with a positive value, ranked by value descending.
-    Negative: only signed (numeric-numeric) pairs with a negative value, ranked
-    by value ascending (most negative first). Non-finite values are dropped.
-    """
-    positive = []
-    negative = []
-    for pair in pairs:
-        if not isinstance(pair, dict):
-            continue
-        value = pair.get("value")
-        if not _is_num(value):
-            continue
-        if value > 0:
-            positive.append(pair)
-        elif value < 0 and _is_signed(pair):
-            negative.append(pair)
-    positive.sort(key=lambda p: float(p.get("value", 0.0)), reverse=True)
-    negative.sort(key=lambda p: float(p.get("value", 0.0)))
-    return positive[:top_n], negative[:top_n]
-
-
-def _top_table(pairs: list, title: str):
-    """Build a DataTable for a list of pairs, or None if there are none."""
-    if not pairs:
-        return None
-    header = ["Par", "Método", "Valor", "p (FDR)", "Sig."]
-    rows = []
-    for pair in pairs:
-        method = model._safe_str(pair.get("method")) or "—"
-        rows.append([
-            _label(pair),
-            method,
-            _fmt_val(pair.get("value")),
-            _fmt_p(pair.get("p_value_adjusted")),
-            "sí" if _significant(pair) else "no",
-        ])
-    return model.DataTable(header=header, rows=rows, title=title)
-
-
-def _ordered_labels(pairs: list):
-    """Pick and order the matrix labels by total connectivity (descending).
-
-    Returns the list of variable names to place on the axes, capped at
-    ``_MAX_MATRIX_LABELS`` (the most-connected ones), plus a boolean saying
-    whether the cap trimmed anything.
-    """
-    strength = {}
-    for pair in pairs:
-        if not isinstance(pair, dict):
-            continue
-        value = pair.get("value")
-        if not _is_num(value):
-            continue
-        mag = abs(float(value))
-        for key in ("a", "b"):
-            name = pair.get(key)
-            if name is None:
-                continue
-            strength[name] = strength.get(name, 0.0) + mag
-    if not strength:
-        return [], False
-    ordered = sorted(strength, key=lambda n: strength[n], reverse=True)
-    trimmed = len(ordered) > _MAX_MATRIX_LABELS
-    return ordered[:_MAX_MATRIX_LABELS], trimmed
-
-
-def _matrix_figure(pairs: list, labels: list):
-    """Return a Figure (lazy) with the signed association heatmap, or None.
-
-    The matplotlib figure is built lazily inside ``make`` so importing this
-    module never requires matplotlib and a malformed plot degrades to nothing
-    instead of aborting the chapter.
-    """
-    if len(labels) < 2:
-        return None
-
-    index = {name: i for i, name in enumerate(labels)}
-
-    def make():
-        import numpy as np
-        from matplotlib.figure import Figure
-
-        n = len(labels)
-        grid = np.full((n, n), np.nan, dtype=float)
-        for i in range(n):
-            grid[i, i] = 1.0
-        for pair in pairs:
-            if not isinstance(pair, dict):
-                continue
-            a = pair.get("a")
-            b = pair.get("b")
-            value = pair.get("value")
-            if a not in index or b not in index or not _is_num(value):
-                continue
-            v = float(value)
-            # Mixed-type magnitudes are non-negative; keep them as-is on [0, 1].
-            ia, ib = index[a], index[b]
-            grid[ia, ib] = v
-            grid[ib, ia] = v
-
-        import matplotlib
-
-        masked = np.ma.masked_invalid(grid)
-        fig = Figure(figsize=(6.2, 5.6))
-        ax = fig.add_subplot(111)
-        cmap = matplotlib.colormaps["RdBu_r"].copy()
-        cmap.set_bad(color="#eeeeee")
-        im = ax.imshow(masked, cmap=cmap, vmin=-1.0, vmax=1.0, aspect="auto")
-        ax.set_xticks(range(n))
-        ax.set_yticks(range(n))
-        short = [str(s)[:14] for s in labels]
-        ax.set_xticks(range(n))
-        ax.set_xticklabels(short, rotation=90, fontsize=7)
-        ax.set_yticklabels(short, fontsize=7)
-        # Annotate cells only when the matrix is small enough to stay legible.
-        if n <= 8:
-            for i in range(n):
-                for j in range(n):
-                    cell = grid[i, j]
-                    if _is_num(cell):
-                        ax.text(j, i, f"{cell:+.2f}".replace("+", "") if cell < 0
-                                else f"{cell:.2f}",
-                                ha="center", va="center", fontsize=6,
-                                color="#222222")
-        fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04,
-                     label="asociación (signo en num-num)")
-        fig.tight_layout()
-        return fig
-
-    return model.Figure(make=make,
-                        caption="Matriz de asociación. Azul = positiva, rojo = "
-                                "negativa (sólo num-num lleva signo); gris = par "
-                                "no evaluado.")
-
-
-def _methods_block(corr: dict):
-    """Build a KVTable with the legend of the methods actually present."""
-    legend = corr.get("methods_legend")
-    if not isinstance(legend, dict) or not legend:
-        return None
-    rows = [(model._safe_str(k), model._safe_str(v)) for k, v in legend.items()]
-    return model.KVTable(rows=rows, title="Métodos de asociación")
-
-
-def _fdr_text(corr: dict) -> str | None:
-    """One-line summary of the multiple-testing (FDR) correction, or None."""
-    mt = corr.get("multiple_testing")
-    if not isinstance(mt, dict) or not mt:
-        return None
-    method = model._safe_str(mt.get("method")).upper() or "FDR"
-    alpha = mt.get("alpha")
-    n_tests = mt.get("n_tests")
-    n_rej = mt.get("n_rejected")
-    parts = [f"Corrección por comparaciones múltiples ({method}"]
-    if _is_num(alpha):
-        parts[0] += f", α={float(alpha):g}"
-    parts[0] += ")."
-    if _is_num(n_tests):
-        rej = n_rej if _is_num(n_rej) else "—"
-        parts.append(
-            f"De {int(n_tests)} pares con test, {rej} siguen siendo "
-            f"significativos tras la corrección.")
-    return " ".join(parts)
-
-
-def build_correlacion(profile: dict, ctx: dict):
-    """Build the Correlation Chapter, or None if there are no pairs to show.
-
-    Reads ``profile['correlations']`` (the ``association_matrix`` output). Returns
-    ``None`` when the dataset has fewer than two associable columns (no evaluated
-    pairs), so the chapter is omitted instead of showing an empty section. Never
-    raises: every access is defensive.
-
-    ctx keys consumed: none specific (presentation metadata is inherited from the
-    document). The chapter reads everything it needs from the profile.
-    """
-    profile = profile or {}
-    ctx = ctx or {}
-
-    corr = profile.get("correlations")
-    if not isinstance(corr, dict):
-        return None
-    pairs = corr.get("pairs")
-    if not isinstance(pairs, list) or not pairs:
-        return None
-
-    blocks: list = []
-
-    # Intro: what this chapter shows and how to read the sign.
-    blocks.append(model.Markdown(text=(
-        "Asociación entre columnas. Cada par se evalúa con la métrica adecuada a "
-        "sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V "
-        "entre categóricas; razón de correlación num-categórica; información mutua "
-        "como medida común no lineal). Sólo las correlaciones **num-num** tienen "
-        "dirección: por eso los pares **negativos** son siempre num-num.")))
-
-    # 1) Association matrix (heatmap).
-    labels, trimmed = _ordered_labels(pairs)
-    fig = _matrix_figure(pairs, labels)
-    if fig is not None:
-        blocks.append(model.Heading(text="Matriz de asociación", level=2))
-        blocks.append(fig)
-        if trimmed:
-            blocks.append(model.Note(text=(
-                f"Se muestran las {len(labels)} variables más conectadas de la "
-                "matriz para mantenerla legible; el resto de pares siguen en las "
-                "tablas de abajo.")))
-
-    # 2) Top positive / top negative pairs.
-    positive, negative = _split_top(pairs, _TOP_N)
-    pos_table = _top_table(positive, f"Top {len(positive)} positivas")
-    neg_table = _top_table(negative, f"Top {len(negative)} negativas")
-    if pos_table is not None:
-        blocks.append(model.Heading(text="Pares más correlacionados (positivos)",
-                                    level=2))
-        blocks.append(pos_table)
-    if neg_table is not None:
-        blocks.append(model.Heading(text="Pares más correlacionados (negativos)",
-                                    level=2))
-        blocks.append(neg_table)
-    elif pos_table is not None:
-        # No signed-negative pairs at all: say so honestly rather than omit.
-        blocks.append(model.Note(text=(
-            "No se han hallado correlaciones negativas significativas entre "
-            "columnas numéricas.")))
-
-    # 3) Spuriousness caveat for level-based correlations (Granger–Newbold).
-    caveat = corr.get("levels_caveat")
-    if isinstance(caveat, str) and caveat.strip():
-        blocks.append(model.Note(text=caveat.strip()))
-    elif corr.get("levels_possible_spurious"):
-        blocks.append(model.Note(text=(
-            "Aviso: algunas correlaciones se calcularon sobre niveles de series "
-            "no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas "
-            "sobre los retornos/diferencias antes de interpretarlas.")))
-
-    # 4) FDR summary + methods legend.
-    fdr_text = _fdr_text(corr)
-    if fdr_text:
-        blocks.append(model.Markdown(text=fdr_text))
-    methods = _methods_block(corr)
-    if methods is not None:
-        blocks.append(model.Heading(text="Métodos y leyenda", level=2))
-        blocks.append(methods)
-
-    if not blocks:
-        return None
-    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
-                         version=CHAPTER_VERSION, blocks=blocks)
@@ -1,175 +0,0 @@
-"""Tests for the CORRELACION chapter — DoD: golden + edges + error/anti-cut.
-
-Self-contained: builds a synthetic TableProfile carrying a ``correlations`` block
-shaped exactly like ``association_matrix`` output (no DuckDB), so the suite is
-fast and deterministic. Verifies that the chapter emits the association-matrix
-figure plus separate top-positive / top-negative tables with the right pairs,
-that it returns None when the profile has no pairs, that a None/empty profile
-does not raise, and that a wide matrix with long labels renders to PDF *and* PPTX
-without cutting anything.
-"""
-
-import os
-import re
-import tempfile
-
-from pypdf import PdfReader
-
-from datascience.automatic_eda.chapters.correlacion import (
-    CHAPTER_VERSION,
-    build_correlacion,
-)
-from datascience.automatic_eda.model import DataTable, Figure
-from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
-from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
-
-
-def _pair(a, b, value, method, padj, sig, p=0.0001):
-    return {
-        "a": a, "b": b, "a_type": "numeric", "b_type": "numeric",
-        "method": method, "value": value, "extra": {"mi": abs(value) * 0.5},
-        "p_value": p, "p_value_adjusted": padj, "significant": sig,
-    }
-
-
-def _profile() -> dict:
-    """Synthetic wine-like profile with signed and unsigned associations."""
-    pairs = [
-        _pair("alcohol", "quality", 0.48, "pearson/spearman", 0.0005, True),
-        _pair("density", "alcohol", -0.78, "pearson/spearman", 0.0001, True),
-        _pair("ph", "fixed_acidity", -0.68, "pearson/spearman", 0.0002, True),
-        _pair("sulphates", "quality", 0.25, "pearson/spearman", 0.03, True),
-        # Unsigned mixed-type metrics: only ever positive, never in the neg table.
-        {"a": "region", "b": "type", "a_type": "categorical",
-         "b_type": "categorical", "method": "cramers_v", "value": 0.55,
-         "extra": {"mi": 0.3}, "p_value": 0.001, "p_value_adjusted": 0.004,
-         "significant": True},
-    ]
-    return {
-        "table": "wine",
-        "source": "/data/wine.csv",
-        "n_rows": 1599,
-        "n_cols": 12,
-        "correlations": {
-            "pairs": pairs,
-            "strong": [p for p in pairs if abs(p["value"]) >= 0.5],
-            "methods_legend": {
-                "pearson": "num-num lineal (Pearson r), [-1, 1]",
-                "cramers_v": "cat-cat simétrica (Cramér's V), [0, 1]",
-            },
-            "multiple_testing": {"method": "bh", "alpha": 0.05,
-                                 "n_tests": 5, "n_rejected": 5},
-        },
-    }
-
-
-def _pdf_text(path: str) -> str:
-    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
-    return re.sub(r"\s+", " ", txt)
-
-
-def test_golden_chapter_tiene_matriz_y_top_positivos_y_negativos():
-    ch = build_correlacion(_profile(), {})
-    assert ch is not None
-    assert ch.id == "correlacion"
-    assert ch.version == CHAPTER_VERSION
-    kinds = [b.kind for b in ch.blocks]
-    assert "figure" in kinds  # association matrix heatmap.
-    figs = [b for b in ch.blocks if isinstance(b, Figure)]
-    assert figs and figs[0].make is not None  # lazy figure.
-
-    tables = [b for b in ch.blocks if isinstance(b, DataTable)]
-    assert len(tables) >= 2  # top positive + top negative.
-    flat = " ".join(str(c) for t in tables for r in t.rows for c in r)
-    # Strongest positive present and signed +, strongest negative present and -.
-    assert "alcohol" in flat and "quality" in flat
-    assert "+0.48" in flat
-    assert "density" in flat and "-0.78" in flat
-
-
-def test_golden_render_pdf_y_pptx_muestran_lo_exigido():
-    prof = _profile()
-    with tempfile.TemporaryDirectory() as d:
-        pdf = os.path.join(d, "corr.pdf")
-        pptx = os.path.join(d, "corr.pptx")
-        rp = render_automatic_eda_pdf(prof, pdf, {"title": "EDA — wine"})
-        rx = render_automatic_eda_pptx(prof, pptx, {"title": "EDA — wine"})
-        assert rp["path"] == pdf and rp["n_pages"] >= 1
-        assert rx["path"] == pptx and rx["n_slides"] >= 1
-        assert "correlacion" in [c["id"] for c in rp["chapters"]]
-        assert "correlacion" in [c["id"] for c in rx["chapters"]]
-        txt = _pdf_text(pdf)
-        # The requirement: matrix + top positive/negative pairs, all visible.
-        assert "Correlaci" in txt  # chapter title (accents may vary in extract).
-        assert "density" in txt and "alcohol" in txt and "quality" in txt
-        assert "0.78" in txt and "0.48" in txt
-        # Both signs surfaced as separate sections.
-        assert "positiv" in txt.lower() and "negativ" in txt.lower()
-
-
-def test_edge_sin_pares_devuelve_none():
-    # No correlations key, empty pairs, and wrong types all yield None, not error.
-    assert build_correlacion({"table": "x"}, {}) is None
-    assert build_correlacion({"correlations": {}}, {}) is None
-    assert build_correlacion({"correlations": {"pairs": []}}, {}) is None
-    assert build_correlacion({"correlations": {"pairs": "nope"}}, {}) is None
-    assert build_correlacion(None, None) is None
-    assert build_correlacion({}, {}) is None
-
-
-def test_edge_solo_positivos_emite_nota_sin_tabla_negativa():
-    prof = {
-        "correlations": {
-            "pairs": [
-                _pair("a", "b", 0.6, "pearson/spearman", 0.001, True),
-                {"a": "c", "b": "d", "a_type": "categorical",
-                 "b_type": "categorical", "method": "cramers_v", "value": 0.7,
-                 "extra": {"mi": 0.4}, "p_value": 0.001,
-                 "p_value_adjusted": 0.003, "significant": True},
-            ],
-        },
-    }
-    ch = build_correlacion(prof, {})
-    assert ch is not None
-    tables = [b for b in ch.blocks if isinstance(b, DataTable)]
-    assert len(tables) == 1  # only the positive table.
-    notes = " ".join(b.text for b in ch.blocks if b.kind == "note")
-    assert "negativas" in notes  # honest "no negative correlations" note.
-
-
-def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
-    # 20 numeric vars with long names -> matrix trimmed to top-N + both renderers
-    # must lay the chapter out without raising and keep a long label intact.
-    long_a = "concentracion_de_dioxido_de_azufre_libre"
-    long_b = "concentracion_de_dioxido_de_azufre_total"
-    pairs = [_pair(long_a, long_b, -0.72, "pearson/spearman", 0.0001, True)]
-    for i in range(20):
-        pairs.append(_pair(f"variable_numerica_larga_{i:02d}",
-                           f"variable_numerica_larga_{(i + 1) % 20:02d}",
-                           0.55 - i * 0.02, "pearson/spearman", 0.01, True))
-    prof = {"correlations": {"pairs": pairs,
-                             "multiple_testing": {"method": "bh", "alpha": 0.05,
-                                                  "n_tests": len(pairs),
-                                                  "n_rejected": len(pairs)}}}
-    ch = build_correlacion(prof, {})
-    assert ch is not None
-    # A "showing top-N most connected" note appears when the matrix is trimmed.
-    notes = " ".join(b.text for b in ch.blocks if b.kind == "note")
-    assert "más conectadas" in notes
-    # Anti-cut guarantee at the block level: the long pair reaches the renderer
-    # whole (the block never truncates); the renderer then wraps the cell inside
-    # its column. Both long labels are present, intact, in a table cell.
-    tables = [b for b in ch.blocks if isinstance(b, DataTable)]
-    cells = [str(c) for t in tables for r in t.rows for c in r]
-    assert any(long_a in c and long_b in c for c in cells)
-    with tempfile.TemporaryDirectory() as d:
-        pdf = os.path.join(d, "wide.pdf")
-        pptx = os.path.join(d, "wide.pptx")
-        rp = render_automatic_eda_pdf(prof, pdf, {"write_manifest": False})
-        rx = render_automatic_eda_pptx(prof, pptx, {"write_manifest": False})
-        # Both renderers lay the wide chapter out without raising and produce a
-        # non-empty document (nothing dropped, just wrapped/scaled to fit).
-        assert rp["path"] == pdf and os.path.exists(pdf) and rp["n_pages"] >= 1
-        assert rx["path"] == pptx and os.path.exists(pptx) and rx["n_slides"] >= 1
-        # A short, unbreakable fragment of the long label survives the wrap.
-        assert "azufre" in _pdf_text(pdf)
@@ -28,12 +28,12 @@ from . import model
 CHAPTER_ORDER = [
    "portada",       # cover
    "overview",      # df.head + columns/types/nulls/examples + describe
+    "analisis_llm",  # LLM interpretation — sits next to overview (user request)
    "num_distr",     # numeric distributions
    "cat_distr",     # categorical distributions
    "calidad",       # data quality
    "correlacion",   # correlations / associations
    "modelos",       # cheap models (PCA/KMeans/outliers)
-    "analisis_llm",  # LLM interpretation
    "timeseries",    # time-series analysis
    "geospatial",    # geospatial
    "agregacion",    # aggregations / pivots