fix(eda): quita rótulos duplicados en capítulo ANÁLISIS LLM

El capítulo etiquetaba dos secciones por partida doble: un Heading de nivel 2 más el 'title' del propio DataTable, imprimiendo 'Diccionario de datos' y 'Datos personales (PII / RGPD)' dos veces seguidas en PDF y PPTX. Se elimina el 'title' de ambos DataTable y se conserva el Heading único (el patrón canónico OVERVIEW del contrato §8: el rótulo lo da el Heading, la tabla solo repite su cabecera de columnas al paginar). El DataTable de PII mantiene su 'note' orientativa. La tercera columna del diccionario ya se titula 'Significado de negocio'. CHAPTER_VERSION 1.0.0 -> 1.1.0. Test nuevo test_sin_rotulos_duplicados_y_significado_de_negocio fija: tablas sin title, cabecera exacta 'Significado de negocio', y cada rótulo una sola vez en el PDF. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 18:07:12 +02:00
parent c6d9bc26da
commit 7045f37554
2 changed files with 58 additions and 5 deletions
@@ -42,7 +42,11 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.0.0"
+# 1.1.0: drop the duplicated section labels — the dictionary and PII DataTables
+# no longer carry a ``title`` (the section Heading labels them once, per the
+# OVERVIEW pattern in the contract). The data dictionary's third column header
+# already reads "Significado de negocio".
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "analisis_llm"
 CHAPTER_TITLE = "Análisis LLM"

@@ -118,6 +122,11 @@ def _dictionary_block(llm: dict):
    Columns: Columna / Descripción / Significado de negocio / Unidad. The
    paginator splits this by rows repeating the header and wraps long cells, so a
    long dictionary (many columns) never gets cut.
+
+    The block carries **no** ``title``: the section is labelled once by the
+    ``Heading`` that ``build_analisis_llm`` appends right before it (the canonical
+    OVERVIEW pattern, contract §8). Giving the table its own ``title`` too would
+    print "Diccionario de datos" twice in a row.
    """
    entries = llm.get("dictionary")
    if not isinstance(entries, (list, tuple)) or not entries:
@@ -137,7 +146,7 @@ def _dictionary_block(llm: dict):
        ])
    if not rows:
        return None
-    return model.DataTable(header=header, rows=rows, title="Diccionario de datos")
+    return model.DataTable(header=header, rows=rows)


 def _analyses_blocks(llm: dict) -> list:
@@ -159,7 +168,12 @@ def _cleaning_blocks(llm: dict) -> list:


 def _pii_block(llm: dict):
-    """DataTable for PII/GDPR findings, or None if absent/empty."""
+    """DataTable for PII/GDPR findings, or None if absent/empty.
+
+    Like the dictionary block, it carries **no** ``title`` (the ``Heading`` in
+    ``build_analisis_llm`` labels the section once); it keeps its ``note`` — the
+    "detection is only indicative" caveat the renderers print under the table.
+    """
    entries = llm.get("pii")
    if not isinstance(entries, (list, tuple)) or not entries:
        return None
@@ -176,7 +190,7 @@ def _pii_block(llm: dict):
    if not rows:
        return None
    return model.DataTable(
-        header=header, rows=rows, title="Datos personales (PII / RGPD)",
+        header=header, rows=rows,
        note="detección automática orientativa — revisar antes de tratar los datos")


@@ -24,7 +24,7 @@ from pptx import Presentation
 from datascience.automatic_eda.chapters.analisis_llm import (
    build_analisis_llm, CHAPTER_VERSION)
 from datascience.automatic_eda.chapters_registry import build_document
-from datascience.automatic_eda.model import Chapter, DataTable
+from datascience.automatic_eda.model import Chapter, DataTable, Heading
 from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
 from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx

@@ -117,6 +117,45 @@ def test_golden_build_y_render_pdf_pptx():
        assert "DESCTOKEN" in ptx


+def test_sin_rotulos_duplicados_y_significado_de_negocio():
+    """The dictionary / PII sections must be labelled ONCE.
+
+    Regression for the duplicated 'Diccionario de datos' and 'Datos personales
+    (PII / RGPD)' headings (each section used to print its label twice: a Heading
+    plus the DataTable's own title). The fix drops the DataTable title and keeps
+    a single Heading — the OVERVIEW pattern. The data-dictionary column header is
+    also pinned to the exact text 'Significado de negocio'.
+    """
+    ch = build_analisis_llm(_profile(), {})
+    assert ch is not None
+
+    # Structure: section labels come from Headings; tables carry no title.
+    headings = [b.text for b in ch.blocks if isinstance(b, Heading)]
+    assert headings.count("Diccionario de datos") == 1
+    assert headings.count("Datos personales (PII / RGPD)") == 1
+    for b in ch.blocks:
+        if isinstance(b, DataTable):
+            assert not b.title, f"DataTable should not duplicate the label: {b.title!r}"
+
+    # The data dictionary's third column reads exactly 'Significado de negocio'.
+    dicts = [b for b in ch.blocks if isinstance(b, DataTable) and "Descripción" in b.header]
+    assert dicts, "expected the data-dictionary DataTable"
+    assert dicts[0].header == ["Columna", "Descripción", "Significado de negocio", "Unidad"]
+
+    # The PII table keeps its note flagging the detection as indicative only.
+    pii = [b for b in ch.blocks if isinstance(b, DataTable) and b.header == ["Columna", "Tipo", "Severidad"]]
+    assert pii and pii[0].note and "orientativa" in pii[0].note
+
+    # Render (PDF only): each label appears exactly once across the whole
+    # document (this chapter is the only producer of either label).
+    with tempfile.TemporaryDirectory() as d:
+        out_pdf = os.path.join(d, "eda.pdf")
+        render_automatic_eda_pdf(_profile(), out_pdf, {"title": "EDA — ventas"})
+        txt = _pdf_text(out_pdf)
+        assert txt.count("Diccionario de datos") == 1
+        assert txt.count("Datos personales") == 1
+
+
 def test_orden_capitulo_junto_a_overview():
    chapters = build_document(_profile(), {})
    ids = [c.id for c in chapters]