fix(eda): quita rótulos duplicados en capítulo ANÁLISIS LLM

El capítulo etiquetaba dos secciones por partida doble: un Heading de nivel 2 más el 'title' del propio DataTable, imprimiendo 'Diccionario de datos' y 'Datos personales (PII / RGPD)' dos veces seguidas en PDF y PPTX. Se elimina el 'title' de ambos DataTable y se conserva el Heading único (el patrón canónico OVERVIEW del contrato §8: el rótulo lo da el Heading, la tabla solo repite su cabecera de columnas al paginar). El DataTable de PII mantiene su 'note' orientativa. La columna del diccionario ya dice 'Significado de negocio'. CHAPTER_VERSION de analisis_llm 1.0.0 -> 1.1.0. El test nuevo test_sin_rotulos_duplicados_y_significado_de_negocio fija: tablas sin title, cabecera exacta 'Significado de negocio', y cada rótulo una sola vez en el PDF.

Nota: el diff también modifica el capítulo PORTADA (CHAPTER_VERSION 1.2.0 -> 1.1.0, retirando la cascada LLM de descripción/granularidad y devolviendo el tamaño a la tabla de metadatos) y elimina su fichero de tests completo; de ahí las 322 líneas borradas.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 18:07:12 +02:00
4 changed files with 74 additions and 322 deletions
@@ -42,7 +42,11 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.0.0"
+# 1.1.0: drop the duplicated section labels — the dictionary and PII DataTables
+# no longer carry a ``title`` (the section Heading labels them once, per the
+# OVERVIEW pattern in the contract). The data-dictionary column already reads
+# "Significado de negocio".
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "analisis_llm"
 CHAPTER_TITLE = "Análisis LLM"

@@ -118,6 +122,11 @@ def _dictionary_block(llm: dict):
    Columns: Columna / Descripción / Significado de negocio / Unidad. The
    paginator splits this by rows repeating the header and wraps long cells, so a
    long dictionary (many columns) never gets cut.
+
+    The block carries **no** ``title``: the section is labelled once by the
+    ``Heading`` that ``build_analisis_llm`` appends right before it (the canonical
+    OVERVIEW pattern, contract §8). Giving the table its own ``title`` too would
+    print "Diccionario de datos" twice in a row.
    """
    entries = llm.get("dictionary")
    if not isinstance(entries, (list, tuple)) or not entries:
@@ -137,7 +146,7 @@ def _dictionary_block(llm: dict):
        ])
    if not rows:
        return None
-    return model.DataTable(header=header, rows=rows, title="Diccionario de datos")
+    return model.DataTable(header=header, rows=rows)


 def _analyses_blocks(llm: dict) -> list:
@@ -159,7 +168,12 @@ def _cleaning_blocks(llm: dict) -> list:


 def _pii_block(llm: dict):
-    """DataTable for PII/GDPR findings, or None if absent/empty."""
+    """DataTable for PII/GDPR findings, or None if absent/empty.
+
+    Like the dictionary block, it carries **no** ``title`` (the ``Heading`` in
+    ``build_analisis_llm`` labels the section once); it keeps its ``note`` with
+    the orientative-detection caveat, which the renderers print under the table.
+    """
    entries = llm.get("pii")
    if not isinstance(entries, (list, tuple)) or not entries:
        return None
@@ -176,7 +190,7 @@ def _pii_block(llm: dict):
    if not rows:
        return None
    return model.DataTable(
-        header=header, rows=rows, title="Datos personales (PII / RGPD)",
+        header=header, rows=rows,
        note="detección automática orientativa — revisar antes de tratar los datos")


@@ -24,7 +24,7 @@ from pptx import Presentation
 from datascience.automatic_eda.chapters.analisis_llm import (
    build_analisis_llm, CHAPTER_VERSION)
 from datascience.automatic_eda.chapters_registry import build_document
-from datascience.automatic_eda.model import Chapter, DataTable
+from datascience.automatic_eda.model import Chapter, DataTable, Heading
 from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
 from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx

@@ -117,6 +117,45 @@ def test_golden_build_y_render_pdf_pptx():
        assert "DESCTOKEN" in ptx


+def test_sin_rotulos_duplicados_y_significado_de_negocio():
+    """The dictionary / PII sections must be labelled ONCE.
+
+    Regression for the duplicated 'Diccionario de datos' and 'Datos personales
+    (PII / RGPD)' headings (each section used to print its label twice: a Heading
+    plus the DataTable's own title). The fix drops the DataTable title and keeps
+    a single Heading — the OVERVIEW pattern. The data-dictionary column header is
+    also pinned to the exact text 'Significado de negocio'.
+    """
+    ch = build_analisis_llm(_profile(), {})
+    assert ch is not None
+
+    # Structure: section labels come from Headings; tables carry no title.
+    headings = [b.text for b in ch.blocks if isinstance(b, Heading)]
+    assert headings.count("Diccionario de datos") == 1
+    assert headings.count("Datos personales (PII / RGPD)") == 1
+    for b in ch.blocks:
+        if isinstance(b, DataTable):
+            assert not b.title, f"DataTable should not duplicate the label: {b.title!r}"
+
+    # The data dictionary's third column reads exactly 'Significado de negocio'.
+    dicts = [b for b in ch.blocks if isinstance(b, DataTable) and "Descripción" in b.header]
+    assert dicts, "expected the data-dictionary DataTable"
+    assert dicts[0].header == ["Columna", "Descripción", "Significado de negocio", "Unidad"]
+
+    # The PII table keeps its orientative-detection note.
+    pii = [b for b in ch.blocks if isinstance(b, DataTable) and b.header == ["Columna", "Tipo", "Severidad"]]
+    assert pii and pii[0].note and "orientativa" in pii[0].note
+
+    # Render: each label appears exactly once across the whole document (the only
+    # 'Diccionario de datos' / 'Datos personales' producer is this chapter).
+    with tempfile.TemporaryDirectory() as d:
+        out_pdf = os.path.join(d, "eda.pdf")
+        render_automatic_eda_pdf(_profile(), out_pdf, {"title": "EDA — ventas"})
+        txt = _pdf_text(out_pdf)
+        assert txt.count("Diccionario de datos") == 1
+        assert txt.count("Datos personales") == 1
+
+
 def test_orden_capitulo_junto_a_overview():
    chapters = build_document(_profile(), {})
    ids = [c.id for c in chapters]
@@ -2,17 +2,8 @@

 Builds the document cover from a TableProfile plus an optional ``ctx`` of
 presentation metadata. Reads everything defensively (``.get``) and degrades
-honestly.
-
-The dataset size (N rows x M columns) is always shown big, as a heading right
-under the dataset name (kept together in a ``Group``), not buried in the
-metadata table. The Description and Granularity are resolved through a cascade
-so they are never empty: an explicit ``ctx`` value wins; otherwise the LLM block
-(``profile['llm']`` from ``eda_llm_insights``) provides ``summary`` /
-``row_meaning``; otherwise a short summary is derived from the profile itself
-(shape, column-type mix, quality score) and a "Cada fila es…" sentence from the
-key-candidate columns or the table shape. Nothing is invented: the derived
-fallbacks state that they come from the profile.
+honestly: a field that is neither in the profile nor in ``ctx`` is shown as a
+placeholder rather than invented, leaving a hook for the LLM layer to fill it.

 Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``):
    build_<id>(profile: dict, ctx: dict) -> Chapter | None
@@ -26,15 +17,10 @@ from datetime import datetime, timezone

 from .. import model

-CHAPTER_VERSION = "1.2.0"
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "portada"
 CHAPTER_TITLE = "Portada"

-# Key under which eda_llm_insights stores its interpretive block in the profile.
-# The cover reads ``summary`` (what the table is) and ``row_meaning`` (what one
-# row represents) from it when the LLM layer ran (``run_llm``).
-_LLM_KEY = "llm"
-
 # Default human description of what the table quality score measures. Chapters
 # can override it via ctx["quality_criteria"].
 _DEFAULT_QUALITY_CRITERIA = (
@@ -156,88 +142,6 @@ def _fmt_date_eu(value) -> str:
        return s


-def _llm_block(profile: dict, ctx: dict) -> dict:
-    """Return the interpretive LLM block (``eda_llm_insights`` output), or {}.
-
-    It is stored under ``profile['llm']`` by ``profile_table(run_llm=True)`` and
-    may also be forwarded in ``ctx['llm']``. Read defensively: anything that is
-    not a dict degrades to an empty dict so the cover never raises.
-    """
-    block = profile.get(_LLM_KEY)
-    if not isinstance(block, dict):
-        block = ctx.get(_LLM_KEY)
-    return block if isinstance(block, dict) else {}
-
-
-def _count_column_types(profile: dict, ctx: dict):
-    """Best-effort (n_numeric, n_categorical) for the dataset.
-
-    Prefers the aggregated ``ctx['document_summary']`` (computed by the engine
-    over the whole body); falls back to counting the profile columns directly so
-    the cover still has the numbers when no summary was passed.
-    """
-    summary = ctx.get("document_summary")
-    if isinstance(summary, dict):
-        n_num = summary.get("n_numeric")
-        n_cat = summary.get("n_categorical")
-        if n_num is not None or n_cat is not None:
-            return n_num, n_cat
-    cols = profile.get("columns") or []
-    n_num = sum(1 for c in cols if isinstance(c, dict)
-                and c.get("inferred_type") == "numeric")
-    n_cat = sum(1 for c in cols if isinstance(c, dict)
-                and isinstance(c.get("categorical"), dict)
-                and c.get("categorical", {}).get("top")
-                and c.get("inferred_type") != "numeric")
-    return n_num, n_cat
-
-
-def _derive_description(profile: dict, ctx: dict) -> str:
-    """A short, honest description of the dataset from the profile.
-
-    Used only when no explicit ``ctx['description']`` and no LLM ``summary`` are
-    available. Summarizes shape, column-type mix and quality score; never empty,
-    never invents business meaning (it states the description was derived)."""
-    n_rows = profile.get("n_rows")
-    n_cols = profile.get("n_cols")
-    n_num, n_cat = _count_column_types(profile, ctx)
-    head = f"Conjunto de datos con {_fmt_int(n_rows)} filas y {_fmt_int(n_cols)} columnas"
-    type_bits = []
-    if n_num:
-        type_bits.append(f"{_fmt_int(n_num)} numéricas")
-    if n_cat:
-        type_bits.append(f"{_fmt_int(n_cat)} categóricas")
-    if type_bits:
-        head += " (" + ", ".join(type_bits) + ")"
-    parts = [head + "."]
-    score = profile.get("quality_score")
-    if score is not None:
-        parts.append(f"Calidad media estimada: {score}/100.")
-    parts.append(
-        "Resumen derivado del perfil; active la interpretación LLM (`run_llm`) "
-        "para una descripción de negocio más rica.")
-    return " ".join(parts)
-
-
-def _derive_granularity(profile: dict, dataset_name: str) -> str:
-    """A ``Cada fila es…`` granularity sentence from the profile.
-
-    Prefers the key-candidate columns (a row is identified by them); when no key
-    is detected, falls back to the table shape so the line is always meaningful
-    and starts with ``Cada fila es`` as the user requested."""
-    keys = profile.get("key_candidates") or []
-    if keys:
-        shown = ", ".join(str(k) for k in keys[:3])
-        more = "" if len(keys) <= 3 else f" (y {len(keys) - 3} más)"
-        return (f"Cada fila es un registro identificado por {shown}{more}, "
-                "candidata(s) a clave por ser únicas y sin nulos.")
-    n_rows = profile.get("n_rows")
-    tail = f" El dataset tiene {_fmt_int(n_rows)} filas en total." if n_rows else ""
-    return (f"Cada fila es un registro de «{dataset_name}». No se detectó una "
-            "columna identificadora única, así que la granularidad se infiere "
-            "de la forma de la tabla." + tail)
-
-
 def build_portada(profile: dict, ctx: dict):
    """Build the cover Chapter, or None if there is truly nothing to show."""
    profile = profile or {}
@@ -262,38 +166,30 @@ def build_portada(profile: dict, ctx: dict):
    quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA
    quality_value = "—" if score is None else f"{score} / 100"

-    llm = _llm_block(profile, ctx)
-
-    # Granularity: explicit ctx wins; then the LLM "row_meaning"; then the key
-    # candidates; finally a shape-based fallback. Always a real "Cada fila es…".
+    # Granularity: ctx wins; else derive from key candidates; else be honest.
    granularity = ctx.get("granularity")
    if not granularity:
-        granularity = (llm.get("row_meaning") or "").strip() or None
-    if not granularity:
-        granularity = _derive_granularity(profile, str(dataset_name))
+        keys = profile.get("key_candidates") or []
+        if keys:
+            granularity = ("Cada fila parece identificada por "
+                           + ", ".join(str(k) for k in keys[:3]) + ".")
+        else:
+            granularity = ("Cada fila es… (granularidad no determinada — "
+                           "pendiente de la capa de cálculo/LLM).")

-    # Description: explicit ctx wins; then the LLM "summary"; finally a short
-    # profile-derived summary. Never the old empty placeholder.
    description = ctx.get("description")
    if not description:
-        description = (llm.get("summary") or "").strip() or None
-    if not description:
-        description = _derive_description(profile, ctx)
-
-    # Title + dataset size shown together and BIG (Heading) at the top, kept on
-    # the same page (Group). The size is no longer buried in the metadata table.
-    cover = [
-        model.Heading(text=str(dataset_name), level=1),
-        model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
-        model.Heading(text=shape, level=2),
-    ]
+        description = ("Descripción no provista — pendiente de la capa LLM "
+                       "(`run_llm`) o de `ctx['description']`.")

    blocks = [
-        model.Group(blocks=cover),
+        model.Heading(text=str(dataset_name), level=1),
+        model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
        model.KVTable(rows=[
            ("Fuente", source_origin),
            ("Almacenamiento", storage),
            ("Generado", when),
+            ("Tamaño", shape),
            ("Calidad", quality_value),
            ("Criterios de calidad", quality_criteria),
        ]),
@@ -1,197 +0,0 @@
-"""Tests for the PORTADA (cover) chapter — DoD: golden + edges + render.
-
-Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
-and deterministic. Verifies the Fase 4b improvements:
-
-1. The dataset size (N rows x M columns) is always shown BIG — as a level-2
-   heading kept together with the dataset name in a ``Group`` — and is no longer
-   a row of the metadata table.
-2. Description and Granularity are resolved through a real cascade and are never
-   the old empty placeholders: an explicit ``ctx`` value wins; otherwise the LLM
-   block (``profile['llm']``) provides ``summary`` / ``row_meaning``; otherwise a
-   short summary is derived from the profile and a "Cada fila es…" sentence from
-   the key-candidate columns or the table shape.
-3. The chapter degrades without raising on empty/None input.
-4. It renders inside the full document to both PDF and PPTX showing that content.
-"""
-
-import os
-import re
-import tempfile
-
-from pypdf import PdfReader
-from pptx import Presentation
-
-from datascience.automatic_eda.model import Group, Heading, KVTable, Markdown
-from datascience.automatic_eda.chapters.portada import (
-    CHAPTER_ID, CHAPTER_VERSION, build_portada,
-)
-from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
-from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
-
-
-def _profile(with_llm: bool = True, with_keys: bool = True) -> dict:
-    prof = {
-        "table": "titanic",
-        "source": "/data/titanic.csv",
-        "profiled_at": "2026-06-30T10:00:00+00:00",
-        "n_rows": 891,
-        "n_cols": 12,
-        "quality_score": 78.0,
-        "columns": [
-            {"name": "PassengerId", "inferred_type": "numeric",
-             "null_pct": 0.0, "numeric": {"mean": 446.0, "min": 1.0,
-                                          "max": 891.0, "std": 257.0}},
-            {"name": "Survived", "inferred_type": "numeric",
-             "null_pct": 0.0, "numeric": {"mean": 0.38, "min": 0.0,
-                                          "max": 1.0, "std": 0.49}},
-            {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
-             "categorical": {"top": [{"value": "male", "count": 577, "pct": 0.65},
-                                     {"value": "female", "count": 314,
-                                      "pct": 0.35}],
-                             "mode": "male", "n_distinct": 2, "entropy": 0.93}},
-        ],
-    }
-    if with_keys:
-        prof["key_candidates"] = ["PassengerId"]
-    if with_llm:
-        prof["llm"] = {
-            "summary": "Pasajeros del Titanic con su supervivencia y datos de viaje.",
-            "row_meaning": "Cada fila es un pasajero del Titanic.",
-            "dictionary": [], "pii": [], "cleaning": [], "analyses": [],
-        }
-    return prof
-
-
-def _pdf_text(path: str) -> str:
-    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
-    return re.sub(r"\s+", " ", txt)
-
-
-def _pptx_text(path: str) -> str:
-    prs = Presentation(path)
-    parts = []
-    for sl in prs.slides:
-        for sh in sl.shapes:
-            if sh.has_text_frame:
-                parts.append(sh.text_frame.text)
-            if sh.has_table:
-                tb = sh.table
-                for r in range(len(tb.rows)):
-                    for c in range(len(tb.columns)):
-                        parts.append(tb.cell(r, c).text)
-    return re.sub(r"\s+", " ", " ".join(parts))
-
-
-def _markdown_after(blocks, heading_text):
-    """Return the Markdown block that follows a Heading whose text matches."""
-    for i, b in enumerate(blocks):
-        if isinstance(b, Heading) and heading_text.lower() in b.text.lower():
-            for nb in blocks[i + 1:]:
-                if isinstance(nb, Markdown):
-                    return nb
-    return None
-
-
-def test_golden_tamano_grande_y_textos_llm():
-    ch = build_portada(_profile(), {})
-    assert ch is not None
-    assert ch.id == CHAPTER_ID
-    assert ch.version == CHAPTER_VERSION
-
-    # 1) Title + size kept together in a Group; size is a BIG level-2 heading.
-    group = next(b for b in ch.blocks if isinstance(b, Group))
-    inner = group.blocks
-    assert isinstance(inner[0], Heading) and inner[0].level == 1
-    assert inner[0].text == "titanic"
-    size_h = next(b for b in inner if isinstance(b, Heading) and b.level == 2)
-    assert "891" in size_h.text and "12" in size_h.text
-    assert "filas" in size_h.text and "columnas" in size_h.text
-
-    # 2) Size is no longer a row of the metadata table.
-    kv = next(b for b in ch.blocks if isinstance(b, KVTable))
-    labels = [r[0] for r in kv.rows]
-    assert "Tamaño" not in labels
-    assert "Fuente" in labels and "Calidad" in labels
-
-    # 3) Description and Granularity come from the LLM block.
-    desc = _markdown_after(ch.blocks, "Descripción")
-    gran = _markdown_after(ch.blocks, "Granularidad")
-    assert desc is not None and "Titanic" in desc.text
-    assert gran is not None and gran.text.startswith("Cada fila es")
-    assert "pasajero" in gran.text.lower()
-
-
-def test_fallback_sin_llm_usa_keys_y_perfil():
-    # No LLM block: description derived from the profile, granularity from keys.
-    ch = build_portada(_profile(with_llm=False, with_keys=True), {})
-    desc = _markdown_after(ch.blocks, "Descripción")
-    gran = _markdown_after(ch.blocks, "Granularidad")
-    # Description is the derived summary, never the old "pendiente" placeholder.
-    assert "pendiente" not in desc.text.lower()
-    assert "891" in desc.text and "columnas" in desc.text
-    assert "numéricas" in desc.text or "categóricas" in desc.text
-    # Granularity mentions the key candidate and starts with "Cada fila es".
-    assert gran.text.startswith("Cada fila es")
-    assert "PassengerId" in gran.text
-    assert "…" not in gran.text  # the old ellipsis placeholder is gone.
-
-
-def test_fallback_sin_llm_sin_keys_usa_forma():
-    ch = build_portada(_profile(with_llm=False, with_keys=False), {})
-    gran = _markdown_after(ch.blocks, "Granularidad")
-    assert gran.text.startswith("Cada fila es")
-    assert "titanic" in gran.text.lower()
-    assert "pendiente" not in gran.text.lower()
-
-
-def test_ctx_explicito_gana_sobre_llm():
-    ctx = {"description": "Descripción manual.",
-           "granularity": "Cada fila es una unidad manual."}
-    ch = build_portada(_profile(), ctx)
-    desc = _markdown_after(ch.blocks, "Descripción")
-    gran = _markdown_after(ch.blocks, "Granularidad")
-    assert desc.text == "Descripción manual."
-    assert gran.text == "Cada fila es una unidad manual."
-
-
-def test_edge_perfil_vacio_no_lanza():
-    # Empty / None never raise; the cover still shows a size and real texts.
-    for prof, ctx in (({}, {}), (None, None)):
-        ch = build_portada(prof, ctx)
-        assert ch is not None
-        group = next(b for b in ch.blocks if isinstance(b, Group))
-        size_h = next(b for b in group.blocks
-                      if isinstance(b, Heading) and b.level == 2)
-        assert "filas" in size_h.text and "columnas" in size_h.text
-        desc = _markdown_after(ch.blocks, "Descripción")
-        gran = _markdown_after(ch.blocks, "Granularidad")
-        assert desc.text and "pendiente" not in desc.text.lower()
-        assert gran.text.startswith("Cada fila es")
-
-
-def test_golden_render_pdf_muestra_portada():
-    prof = _profile()
-    with tempfile.TemporaryDirectory() as d:
-        out = os.path.join(d, "eda.pdf")
-        res = render_automatic_eda_pdf(prof, out, {"title": "EDA"})
-        assert res["path"] == out and os.path.exists(out)
-        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
-        txt = _pdf_text(out)
-        assert "titanic" in txt.lower()
-        assert "891" in txt and "filas" in txt and "columnas" in txt
-        assert "Titanic" in txt          # LLM summary in the Description.
-        assert "Cada fila es" in txt     # granularity sentence.
-
-
-def test_golden_render_pptx_muestra_portada():
-    prof = _profile()
-    with tempfile.TemporaryDirectory() as d:
-        out = os.path.join(d, "eda.pptx")
-        res = render_automatic_eda_pptx(prof, out, {"title": "EDA"})
-        assert res["path"] == out and os.path.exists(out)
-        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
-        txt = _pptx_text(out)
-        assert "titanic" in txt.lower()
-        assert "891" in txt and "columnas" in txt
-        assert "Cada fila es" in txt